Home | History | Annotate | Download | only in os
      1 /*
      2  * CDDL HEADER START
      3  *
      4  * The contents of this file are subject to the terms of the
      5  * Common Development and Distribution License (the "License").
      6  * You may not use this file except in compliance with the License.
      7  *
      8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
      9  * or http://www.opensolaris.org/os/licensing.
     10  * See the License for the specific language governing permissions
     11  * and limitations under the License.
     12  *
     13  * When distributing Covered Code, include this CDDL HEADER in each
     14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
     15  * If applicable, add the following below this CDDL HEADER, with the
     16  * fields enclosed by brackets "[]" replaced with your own identifying
     17  * information: Portions Copyright [yyyy] [name of copyright owner]
     18  *
     19  * CDDL HEADER END
     20  */
     21 
     22 /*
     23  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
     24  * Use is subject to license terms.
     25  */
     26 
     27 /*
     28  * Zones
     29  *
     30  *   A zone is a named collection of processes, namespace constraints,
     31  *   and other system resources which comprise a secure and manageable
     32  *   application containment facility.
     33  *
     34  *   Zones (represented by the reference counted zone_t) are tracked in
     35  *   the kernel in the zonehash.  Elsewhere in the kernel, Zone IDs
     36  *   (zoneid_t) are used to track zone association.  Zone IDs are
     37  *   dynamically generated when the zone is created; if a persistent
     38  *   identifier is needed (core files, accounting logs, audit trail,
     39  *   etc.), the zone name should be used.
     40  *
     41  *
     42  *   Global Zone:
     43  *
     44  *   The global zone (zoneid 0) is automatically associated with all
     45  *   system resources that have not been bound to a user-created zone.
     46  *   This means that even systems where zones are not in active use
     47  *   have a global zone, and all processes, mounts, etc. are
     48  *   associated with that zone.  The global zone is generally
     49  *   unconstrained in terms of privileges and access, though the usual
     50  *   credential and privilege based restrictions apply.
     51  *
     52  *
     53  *   Zone States:
     54  *
     55  *   The states in which a zone may be, and the transitions between them,
     56  *   are as follows:
     57  *
     58  *   ZONE_IS_UNINITIALIZED: primordial state for a zone. The partially
     59  *   initialized zone is added to the list of active zones on the system but
     60  *   isn't accessible.
     61  *
     62  *   ZONE_IS_INITIALIZED: Initialization complete except the ZSD callbacks are
     63  *   not yet completed. Not possible to enter the zone, but attributes can
     64  *   be retrieved.
     65  *
     66  *   ZONE_IS_READY: zsched (the kernel dummy process for a zone) is
     67  *   ready.  The zone is made visible after the ZSD constructor callbacks are
     68  *   executed.  A zone remains in this state until it transitions into
     69  *   the ZONE_IS_BOOTING state as a result of a call to zone_boot().
     70  *
     71  *   ZONE_IS_BOOTING: in this short-lived state, zsched attempts to start
     72  *   init.  Should that fail, the zone proceeds to the ZONE_IS_SHUTTING_DOWN
     73  *   state.
     74  *
     75  *   ZONE_IS_RUNNING: The zone is open for business: zsched has
     76  *   successfully started init.   A zone remains in this state until
     77  *   zone_shutdown() is called.
     78  *
     79  *   ZONE_IS_SHUTTING_DOWN: zone_shutdown() has been called, the system is
     80  *   killing all processes running in the zone. The zone remains
     81  *   in this state until there are no more user processes running in the zone.
     82  *   zone_create(), zone_enter(), and zone_destroy() on this zone will fail.
     83  *   Since zone_shutdown() is restartable, it may be called successfully
     84  *   multiple times for the same zone_t.  Setting of the zone's state to
     85  *   ZONE_IS_SHUTTING_DOWN is synchronized with mounts, so VOP_MOUNT() may check
     86  *   the zone's status without worrying about it being a moving target.
     87  *
     88  *   ZONE_IS_EMPTY: zone_shutdown() has been called, and there
     89  *   are no more user processes in the zone.  The zone remains in this
     90  *   state until there are no more kernel threads associated with the
     91  *   zone.  zone_create(), zone_enter(), and zone_destroy() on this zone will
     92  *   fail.
     93  *
     94  *   ZONE_IS_DOWN: All kernel threads doing work on behalf of the zone
     95  *   have exited.  zone_shutdown() returns.  Henceforth it is not possible to
     96  *   join the zone or create kernel threads therein.
     97  *
     98  *   ZONE_IS_DYING: zone_destroy() has been called on the zone; zone
     99  *   remains in this state until zsched exits.  Calls to zone_find_by_*()
    100  *   return NULL from now on.
    101  *
    102  *   ZONE_IS_DEAD: zsched has exited (zone_ntasks == 0).  There are no
    103  *   processes or threads doing work on behalf of the zone.  The zone is
    104  *   removed from the list of active zones.  zone_destroy() returns, and
    105  *   the zone can be recreated.
    106  *
    107  *   ZONE_IS_FREE (internal state): zone_ref goes to 0, ZSD destructor
    108  *   callbacks are executed, and all memory associated with the zone is
    109  *   freed.
    110  *
    111  *   Threads can wait for the zone to enter a requested state by using
    112  *   zone_status_wait() or zone_status_timedwait() with the desired
    113  *   state passed in as an argument.  Zone state transitions are
    114  *   uni-directional; it is not possible to move back to an earlier state.
    115  *
    116  *
    117  *   Zone-Specific Data:
    118  *
    119  *   Subsystems needing to maintain zone-specific data can store that
    120  *   data using the ZSD mechanism.  This provides a zone-specific data
    121  *   store, similar to thread-specific data (see pthread_getspecific(3C)
    122  *   or the TSD code in uts/common/disp/thread.c).  Also, ZSD can be used
    123  *   to register callbacks to be invoked when a zone is created, shut
    124  *   down, or destroyed.  This can be used to initialize zone-specific
    125  *   data for new zones and to clean up when zones go away.
    126  *
    127  *
    128  *   Data Structures:
    129  *
    130  *   The per-zone structure (zone_t) is reference counted, and freed
    131  *   when all references are released.  zone_hold and zone_rele can be
    132  *   used to adjust the reference count.  In addition, reference counts
    133  *   associated with the cred_t structure are tracked separately using
    134  *   zone_cred_hold and zone_cred_rele.
    135  *
    136  *   Pointers to active zone_t's are stored in two hash tables; one
    137  *   for searching by id, the other for searching by name.  Lookups
    138  *   can be performed on either basis, using zone_find_by_id and
    139  *   zone_find_by_name.  Both return zone_t pointers with the zone
    140  *   held, so zone_rele should be called when the pointer is no longer
    141  *   needed.  Zones can also be searched by path; zone_find_by_path
    142  *   returns the zone with which a path name is associated (global
    143  *   zone if the path is not within some other zone's file system
    144  *   hierarchy).  This currently requires iterating through each zone,
    145  *   so it is slower than an id or name search via a hash table.
    146  *
    147  *
    148  *   Locking:
    149  *
    150  *   zonehash_lock: This is a top-level global lock used to protect the
    151  *       zone hash tables and lists.  Zones cannot be created or destroyed
    152  *       while this lock is held.
    153  *   zone_status_lock: This is a global lock protecting zone state.
    154  *       Zones cannot change state while this lock is held.  It also
    155  *       protects the list of kernel threads associated with a zone.
    156  *   zone_lock: This is a per-zone lock used to protect several fields of
    157  *       the zone_t (see <sys/zone.h> for details).  In addition, holding
    158  *       this lock means that the zone cannot go away.
    159  *   zone_nlwps_lock: This is a per-zone lock used to protect the fields
    160  *	 related to the zone.max-lwps rctl.
    161  *   zone_mem_lock: This is a per-zone lock used to protect the fields
    162  *	 related to the zone.max-locked-memory and zone.max-swap rctls.
    163  *   zsd_key_lock: This is a global lock protecting the key state for ZSD.
    164  *   zone_deathrow_lock: This is a global lock protecting the "deathrow"
    165  *       list (a list of zones in the ZONE_IS_DEAD state).
    166  *
    167  *   Ordering requirements:
    168  *       pool_lock --> cpu_lock --> zonehash_lock --> zone_status_lock -->
    169  *       	zone_lock --> zsd_key_lock --> pidlock --> p_lock
    170  *
    171  *   When taking zone_mem_lock or zone_nlwps_lock, the lock ordering is:
    172  *	zonehash_lock --> a_lock --> pidlock --> p_lock --> zone_mem_lock
    173  *	zonehash_lock --> a_lock --> pidlock --> p_lock --> zone_nlwps_lock
    174  *
    175  *   Blocking memory allocations are permitted while holding any of the
    176  *   zone locks.
    177  *
    178  *
    179  *   System Call Interface:
    180  *
    181  *   The zone subsystem can be managed and queried from user level with
    182  *   the following system calls (all subcodes of the primary "zone"
    183  *   system call):
    184  *   - zone_create: creates a zone with selected attributes (name,
    185  *     root path, privileges, resource controls, ZFS datasets)
    186  *   - zone_enter: allows the current process to enter a zone
    187  *   - zone_getattr: reports attributes of a zone
    188  *   - zone_setattr: set attributes of a zone
    189  *   - zone_boot: set 'init' running for the zone
    190  *   - zone_list: lists all zones active in the system
    191  *   - zone_lookup: looks up zone id based on name
    192  *   - zone_shutdown: initiates shutdown process (see states above)
    193  *   - zone_destroy: completes shutdown process (see states above)
    194  *
    195  */
    196 
    197 #include <sys/priv_impl.h>
    198 #include <sys/cred.h>
    199 #include <c2/audit.h>
    200 #include <sys/debug.h>
    201 #include <sys/file.h>
    202 #include <sys/kmem.h>
    203 #include <sys/kstat.h>
    204 #include <sys/mutex.h>
    205 #include <sys/note.h>
    206 #include <sys/pathname.h>
    207 #include <sys/proc.h>
    208 #include <sys/project.h>
    209 #include <sys/sysevent.h>
    210 #include <sys/task.h>
    211 #include <sys/systm.h>
    212 #include <sys/types.h>
    213 #include <sys/utsname.h>
    214 #include <sys/vnode.h>
    215 #include <sys/vfs.h>
    216 #include <sys/systeminfo.h>
    217 #include <sys/policy.h>
    218 #include <sys/cred_impl.h>
    219 #include <sys/contract_impl.h>
    220 #include <sys/contract/process_impl.h>
    221 #include <sys/class.h>
    222 #include <sys/pool.h>
    223 #include <sys/pool_pset.h>
    224 #include <sys/pset.h>
    225 #include <sys/sysmacros.h>
    226 #include <sys/callb.h>
    227 #include <sys/vmparam.h>
    228 #include <sys/corectl.h>
    229 #include <sys/ipc_impl.h>
    230 
    231 #include <sys/door.h>
    232 #include <sys/cpuvar.h>
    233 #include <sys/sdt.h>
    234 
    235 #include <sys/uadmin.h>
    236 #include <sys/session.h>
    237 #include <sys/cmn_err.h>
    238 #include <sys/modhash.h>
    239 #include <sys/sunddi.h>
    240 #include <sys/nvpair.h>
    241 #include <sys/rctl.h>
    242 #include <sys/fss.h>
    243 #include <sys/brand.h>
    244 #include <sys/zone.h>
    245 #include <net/if.h>
    246 #include <sys/cpucaps.h>
    247 #include <vm/seg.h>
    248 #include <sys/mac.h>
    249 
/*
 * List of data link IDs which are accessible from the zone.  Each zone_dl_t
 * is one element of such a list and records a single data link ID.
 */
typedef struct zone_dl {
	datalink_id_t	zdl_id;		/* ID of the accessible data link */
	list_node_t	zdl_linkage;	/* linkage node for the containing list */
} zone_dl_t;
    255 
/*
 * cv used to signal that all references to the zone have been released.  This
 * needs to be global since there may be multiple waiters, and the first to
 * wake up will free the zone_t, hence we cannot use zone->zone_cv.
 */
static kcondvar_t zone_destroy_cv;
/*
 * Lock used to serialize access to zone_cv.  This could have been per-zone,
 * but then we'd need another lock for zone_destroy_cv, and why bother?
 *
 * As described in the "Locking" notes at the top of this file, this global
 * lock protects zone state (zones cannot change state while it is held) as
 * well as the list of kernel threads associated with a zone.
 */
static kmutex_t zone_status_lock;
    267 
/*
 * ZSD-related global variables.
 */
static kmutex_t zsd_key_lock;	/* protects the following two */
/*
 * The next caller of zone_key_create() will be assigned a key of ++zsd_keyval.
 * The first key handed out is therefore 1; zone_key_create() asserts that
 * the counter is non-zero after the increment.
 */
static zone_key_t zsd_keyval = 0;
/*
 * Global list of registered keys.  We use this when a new zone is created.
 * Each element is a template struct zsd_entry carrying the key and its
 * create/shutdown/destroy callbacks; zone_key_create() appends to the list
 * and zone_key_delete() removes from it.
 */
static list_t zsd_registered_keys;
    280 
/*
 * Zone lookup tables.  As described at the top of this file, pointers to
 * active zones are kept in hash tables searchable by id and by name, and
 * zonehash_lock is the top-level global lock protecting the zone hash tables
 * and lists.
 *
 * NOTE(review): the code that creates and fills these objects is not in this
 * part of the file.  Presumably zone_hash_size is the size the tables are
 * created with, zonehashbylabel indexes zones by label, zonecount counts
 * zones, and zoneid_space is the id space zone IDs are allocated from --
 * confirm against the initialization code.
 */
int zone_hash_size = 256;
static mod_hash_t *zonehashbyname, *zonehashbyid, *zonehashbylabel;
static kmutex_t zonehash_lock;
static uint_t zonecount;
static id_space_t *zoneid_space;
    286 
/*
 * The global zone (aka zone0) is the all-seeing, all-knowing zone in which the
 * kernel proper runs, and which manages all other zones.
 *
 * Although not declared as static, the variable "zone0" should not be used
 * except for by code that needs to reference the global zone early on in boot,
 * before it is fully initialized.  All other consumers should use
 * 'global_zone'.
 *
 * 'global_zone' starts out NULL, so a NULL value means the global zone has
 * not been initialized yet.
 */
zone_t zone0;
zone_t *global_zone = NULL;	/* Set when the global zone is initialized */
    298 
/*
 * List of active zones, protected by zonehash_lock.
 */
static list_t zone_active;

/*
 * List of destroyed zones that still have outstanding cred references.
 * Used for debugging.  Uses a separate lock to avoid lock ordering
 * problems in zone_free.
 *
 * zone_deathrow_lock is the global lock protecting this list (see the
 * "Locking" notes at the top of this file).
 */
static list_t zone_deathrow;
static kmutex_t zone_deathrow_lock;
    311 
/* number of zones is limited by virtual interface limit in IP */
uint_t maxzones = 8192;

/* Event channel used to send zone state change notifications */
evchan_t *zone_event_chan;
    317 
/*
 * This table holds the mapping from kernel zone states to
 * states visible in the state notification API.
 * The idea is that we only expose "obvious" states and
 * do not expose states which are just implementation details.
 *
 * There is one entry per kernel state; the comment on each entry names the
 * kernel state it stands for.  Several kernel states share one visible
 * state: "booting" is reported as ready, "empty", "down" and "dying" are
 * reported as shutting down, and "dead" is reported as uninitialized.
 *
 * NOTE(review): the table is presumably indexed by zone_status_t, so the
 * entry order must match that enumeration -- confirm against <sys/zone.h>.
 */
const char  *zone_status_table[] = {
	ZONE_EVENT_UNINITIALIZED,	/* uninitialized */
	ZONE_EVENT_INITIALIZED,		/* initialized */
	ZONE_EVENT_READY,		/* ready */
	ZONE_EVENT_READY,		/* booting */
	ZONE_EVENT_RUNNING,		/* running */
	ZONE_EVENT_SHUTTING_DOWN,	/* shutting_down */
	ZONE_EVENT_SHUTTING_DOWN,	/* empty */
	ZONE_EVENT_SHUTTING_DOWN,	/* down */
	ZONE_EVENT_SHUTTING_DOWN,	/* dying */
	ZONE_EVENT_UNINITIALIZED,	/* dead */
};
    336 
/*
 * Resource control (rctl) handles for the zone-wide resource controls.
 *
 * This isn't static so lint doesn't complain.
 *
 * NOTE(review): the registration code is not in this part of the file.
 * Going by the names, each handle presumably corresponds to the zone-level
 * rctl of the same name (CPU shares, locked memory, swap, CPU cap, LWP
 * count and the System V IPC limits) -- confirm where they are assigned.
 */
rctl_hndl_t rc_zone_cpu_shares;
rctl_hndl_t rc_zone_locked_mem;
rctl_hndl_t rc_zone_max_swap;
rctl_hndl_t rc_zone_cpu_cap;
rctl_hndl_t rc_zone_nlwps;
rctl_hndl_t rc_zone_shmmax;
rctl_hndl_t rc_zone_shmmni;
rctl_hndl_t rc_zone_semmni;
rctl_hndl_t rc_zone_msgmni;
/*
 * Synchronization primitives used to synchronize between mounts and zone
 * creation/destruction.
 *
 * mounts_in_progress is a signed counter guarded by mount_lock:
 *   > 0  that many mounts are currently in progress (mount_in_progress()
 *        increments it, mount_completed() decrements it);
 *   < 0  mounts are blocked by (-mounts_in_progress) block_mounts() callers
 *        (block_mounts() decrements it, resume_mounts() increments it);
 *   == 0 neither side is active.
 * mount_cv is broadcast whenever the counter returns to zero, waking
 * whichever side is waiting.
 */
static int mounts_in_progress;
static kcondvar_t mount_cv;
static kmutex_t mount_lock;
    356 
/*
 * Default path of the init program.
 * NOTE(review): presumably used when a zone has no init name of its own --
 * confirm at the point of use.
 */
const char * const zone_default_initname = "/sbin/init";
/* NOTE(review): fixed "/zone/" prefix string; its users are not in view. */
static char * const zone_prefix = "/zone/";

/* Forward declarations for zone operations defined later in this file. */
static int zone_shutdown(zoneid_t zoneid);
static int zone_add_datalink(zoneid_t, datalink_id_t);
static int zone_remove_datalink(zoneid_t, datalink_id_t);
static int zone_list_datalink(zoneid_t, int *, datalink_id_t *);

/*
 * Signature shared by the ZSD "apply" functions (zsd_apply_create,
 * zsd_apply_shutdown and zsd_apply_destroy below), which are handed to
 * zsd_apply_all_zones() and zsd_apply_all_keys() together with the key or
 * zone to operate on.
 * NOTE(review): the meaning of the kmutex_t * and boolean_t arguments and of
 * the boolean_t return value is defined by the implementations, which are
 * not in view here -- confirm there.
 */
typedef boolean_t zsd_applyfn_t(kmutex_t *, boolean_t, zone_t *, zone_key_t);

/* Forward declarations for the ZSD helpers defined later in this file. */
static void zsd_apply_all_zones(zsd_applyfn_t *, zone_key_t);
static void zsd_apply_all_keys(zsd_applyfn_t *, zone_t *);
static boolean_t zsd_apply_create(kmutex_t *, boolean_t, zone_t *, zone_key_t);
static boolean_t zsd_apply_shutdown(kmutex_t *, boolean_t, zone_t *,
    zone_key_t);
static boolean_t zsd_apply_destroy(kmutex_t *, boolean_t, zone_t *, zone_key_t);
static boolean_t zsd_wait_for_creator(zone_t *, struct zsd_entry *,
    kmutex_t *);
static boolean_t zsd_wait_for_inprogress(zone_t *, struct zsd_entry *,
    kmutex_t *);
    376 
/*
 * Bump this number when you alter the zone syscall interfaces; this is
 * because we need to have support for previous API versions in libc
 * to support patching; libc calls into the kernel to determine this number.
 *
 * Version 1 of the API is the version originally shipped with Solaris 10
 * Version 2 alters the zone_create system call in order to support more
 *     arguments by moving the args into a structure; and to do better
 *     error reporting when zone_create() fails.
 * Version 3 alters the zone_create system call in order to support the
 *     import of ZFS datasets to zones.
 * Version 4 alters the zone_create system call in order to support
 *     Trusted Extensions.
 * Version 5 alters the zone_boot system call, and converts its old
 *     bootargs parameter to be set by the zone_setattr API instead.
 * Version 6 adds the flag argument to zone_create.
 *
 * When bumping the number, add a "Version N" line above describing the
 * change, so that the history stays in step with the value below.
 */
static const int ZONE_SYSCALL_API_VERSION = 6;
    395 
    396 /*
    397  * Certain filesystems (such as NFS and autofs) need to know which zone
    398  * the mount is being placed in.  Because of this, we need to be able to
    399  * ensure that a zone isn't in the process of being created such that
    400  * nfs_mount() thinks it is in the global zone, while by the time it
    401  * gets added to the list of mounted zones, it ends up on zoneA's mount
    402  * list.
    403  *
    404  * The following functions: block_mounts()/resume_mounts() and
    405  * mount_in_progress()/mount_completed() are used by zones and the VFS
    406  * layer (respectively) to synchronize zone creation and new mounts.
    407  *
    408  * The semantics are like a reader-reader lock such that there may
    409  * either be multiple mounts (or zone creations, if that weren't
    410  * serialized by zonehash_lock) in progress at the same time, but not
    411  * both.
    412  *
    413  * We use cv's so the user can ctrl-C out of the operation if it's
    414  * taking too long.
    415  *
    416  * The semantics are such that there is unfair bias towards the
    417  * "current" operation.  This means that zone creations may starve if
    418  * there is a rapid succession of new mounts coming in to the system, or
    419  * there is a remote possibility that zones will be created at such a
    420  * rate that new mounts will not be able to proceed.
    421  */
    422 /*
    423  * Prevent new mounts from progressing to the point of calling
    424  * VFS_MOUNT().  If there are already mounts in this "region", wait for
    425  * them to complete.
    426  */
    427 static int
    428 block_mounts(void)
    429 {
    430 	int retval = 0;
    431 
    432 	/*
    433 	 * Since it may block for a long time, block_mounts() shouldn't be
    434 	 * called with zonehash_lock held.
    435 	 */
    436 	ASSERT(MUTEX_NOT_HELD(&zonehash_lock));
    437 	mutex_enter(&mount_lock);
    438 	while (mounts_in_progress > 0) {
    439 		if (cv_wait_sig(&mount_cv, &mount_lock) == 0)
    440 			goto signaled;
    441 	}
    442 	/*
    443 	 * A negative value of mounts_in_progress indicates that mounts
    444 	 * have been blocked by (-mounts_in_progress) different callers.
    445 	 */
    446 	mounts_in_progress--;
    447 	retval = 1;
    448 signaled:
    449 	mutex_exit(&mount_lock);
    450 	return (retval);
    451 }
    452 
    453 /*
    454  * The VFS layer may progress with new mounts as far as we're concerned.
    455  * Allow them to progress if we were the last obstacle.
    456  */
    457 static void
    458 resume_mounts(void)
    459 {
    460 	mutex_enter(&mount_lock);
    461 	if (++mounts_in_progress == 0)
    462 		cv_broadcast(&mount_cv);
    463 	mutex_exit(&mount_lock);
    464 }
    465 
    466 /*
    467  * The VFS layer is busy with a mount; zones should wait until all
    468  * mounts are completed to progress.
    469  */
    470 void
    471 mount_in_progress(void)
    472 {
    473 	mutex_enter(&mount_lock);
    474 	while (mounts_in_progress < 0)
    475 		cv_wait(&mount_cv, &mount_lock);
    476 	mounts_in_progress++;
    477 	mutex_exit(&mount_lock);
    478 }
    479 
    480 /*
    481  * VFS is done with one mount; wake up any waiting block_mounts()
    482  * callers if this is the last mount.
    483  */
    484 void
    485 mount_completed(void)
    486 {
    487 	mutex_enter(&mount_lock);
    488 	if (--mounts_in_progress == 0)
    489 		cv_broadcast(&mount_cv);
    490 	mutex_exit(&mount_lock);
    491 }
    492 
    493 /*
    494  * ZSD routines.
    495  *
    496  * Zone Specific Data (ZSD) is modeled after Thread Specific Data as
    497  * defined by the pthread_key_create() and related interfaces.
    498  *
    499  * Kernel subsystems may register one or more data items and/or
    500  * callbacks to be executed when a zone is created, shutdown, or
    501  * destroyed.
    502  *
    503  * Unlike the thread counterpart, destructor callbacks will be executed
    504  * even if the data pointer is NULL and/or there are no constructor
    505  * callbacks, so it is the responsibility of such callbacks to check for
    506  * NULL data values if necessary.
    507  *
    508  * The locking strategy and overall picture is as follows:
    509  *
    510  * When someone calls zone_key_create(), a template ZSD entry is added to the
    511  * global list "zsd_registered_keys", protected by zsd_key_lock.  While
    512  * holding that lock all the existing zones are marked as
    513  * ZSD_CREATE_NEEDED and a copy of the ZSD entry added to the per-zone
    514  * zone_zsd list (protected by zone_lock). The global list is updated first
    515  * (under zsd_key_lock) to make sure that newly created zones use the
    516  * most recent list of keys. Then under zonehash_lock we walk the zones
    517  * and mark them.  Similar locking is used in zone_key_delete().
    518  *
    519  * The actual create, shutdown, and destroy callbacks are done without
    520  * holding any lock. And zsd_flags are used to ensure that the operations
    521  * completed so that when zone_key_create (and zone_create) is done, as well as
    522  * zone_key_delete (and zone_destroy) is done, all the necessary callbacks
    523  * are completed.
    524  *
    525  * When new zones are created constructor callbacks for all registered ZSD
    526  * entries will be called. That also uses the above two phases of marking
    527  * what needs to be done, and then running the callbacks without holding
    528  * any locks.
    529  *
    530  * The framework does not provide any locking around zone_getspecific() and
    531  * zone_setspecific() apart from that needed for internal consistency, so
    532  * callers interested in atomic "test-and-set" semantics will need to provide
    533  * their own locking.
    534  */
    535 
    536 /*
    537  * Helper function to find the zsd_entry associated with the key in the
    538  * given list.
    539  */
    540 static struct zsd_entry *
    541 zsd_find(list_t *l, zone_key_t key)
    542 {
    543 	struct zsd_entry *zsd;
    544 
    545 	for (zsd = list_head(l); zsd != NULL; zsd = list_next(l, zsd)) {
    546 		if (zsd->zsd_key == key) {
    547 			return (zsd);
    548 		}
    549 	}
    550 	return (NULL);
    551 }
    552 
    553 /*
    554  * Helper function to find the zsd_entry associated with the key in the
    555  * given list. Move it to the front of the list.
    556  */
    557 static struct zsd_entry *
    558 zsd_find_mru(list_t *l, zone_key_t key)
    559 {
    560 	struct zsd_entry *zsd;
    561 
    562 	for (zsd = list_head(l); zsd != NULL; zsd = list_next(l, zsd)) {
    563 		if (zsd->zsd_key == key) {
    564 			/*
    565 			 * Move to head of list to keep list in MRU order.
    566 			 */
    567 			if (zsd != list_head(l)) {
    568 				list_remove(l, zsd);
    569 				list_insert_head(l, zsd);
    570 			}
    571 			return (zsd);
    572 		}
    573 	}
    574 	return (NULL);
    575 }
    576 
    577 void
    578 zone_key_create(zone_key_t *keyp, void *(*create)(zoneid_t),
    579     void (*shutdown)(zoneid_t, void *), void (*destroy)(zoneid_t, void *))
    580 {
    581 	struct zsd_entry *zsdp;
    582 	struct zsd_entry *t;
    583 	struct zone *zone;
    584 	zone_key_t  key;
    585 
    586 	zsdp = kmem_zalloc(sizeof (*zsdp), KM_SLEEP);
    587 	zsdp->zsd_data = NULL;
    588 	zsdp->zsd_create = create;
    589 	zsdp->zsd_shutdown = shutdown;
    590 	zsdp->zsd_destroy = destroy;
    591 
    592 	/*
    593 	 * Insert in global list of callbacks. Makes future zone creations
    594 	 * see it.
    595 	 */
    596 	mutex_enter(&zsd_key_lock);
    597 	key = zsdp->zsd_key = ++zsd_keyval;
    598 	ASSERT(zsd_keyval != 0);
    599 	list_insert_tail(&zsd_registered_keys, zsdp);
    600 	mutex_exit(&zsd_key_lock);
    601 
    602 	/*
    603 	 * Insert for all existing zones and mark them as needing
    604 	 * a create callback.
    605 	 */
    606 	mutex_enter(&zonehash_lock);	/* stop the world */
    607 	for (zone = list_head(&zone_active); zone != NULL;
    608 	    zone = list_next(&zone_active, zone)) {
    609 		zone_status_t status;
    610 
    611 		mutex_enter(&zone->zone_lock);
    612 
    613 		/* Skip zones that are on the way down or not yet up */
    614 		status = zone_status_get(zone);
    615 		if (status >= ZONE_IS_DOWN ||
    616 		    status == ZONE_IS_UNINITIALIZED) {
    617 			mutex_exit(&zone->zone_lock);
    618 			continue;
    619 		}
    620 
    621 		t = zsd_find_mru(&zone->zone_zsd, key);
    622 		if (t != NULL) {
    623 			/*
    624 			 * A zsd_configure already inserted it after
    625 			 * we dropped zsd_key_lock above.
    626 			 */
    627 			mutex_exit(&zone->zone_lock);
    628 			continue;
    629 		}
    630 		t = kmem_zalloc(sizeof (*t), KM_SLEEP);
    631 		t->zsd_key = key;
    632 		t->zsd_create = create;
    633 		t->zsd_shutdown = shutdown;
    634 		t->zsd_destroy = destroy;
    635 		if (create != NULL) {
    636 			t->zsd_flags = ZSD_CREATE_NEEDED;
    637 			DTRACE_PROBE2(zsd__create__needed,
    638 			    zone_t *, zone, zone_key_t, key);
    639 		}
    640 		list_insert_tail(&zone->zone_zsd, t);
    641 		mutex_exit(&zone->zone_lock);
    642 	}
    643 	mutex_exit(&zonehash_lock);
    644 
    645 	if (create != NULL) {
    646 		/* Now call the create callback for this key */
    647 		zsd_apply_all_zones(zsd_apply_create, key);
    648 	}
    649 	/*
    650 	 * It is safe for consumers to use the key now, make it
    651 	 * globally visible. Specifically zone_getspecific() will
    652 	 * always successfully return the zone specific data associated
    653 	 * with the key.
    654 	 */
    655 	*keyp = key;
    656 
    657 }
    658 
    659 /*
    660  * Function called when a module is being unloaded, or otherwise wishes
    661  * to unregister its ZSD key and callbacks.
    662  *
    663  * Remove from the global list and determine the functions that need to
    664  * be called under a global lock. Then call the functions without
    665  * holding any locks. Finally free up the zone_zsd entries. (The apply
    666  * functions need to access the zone_zsd entries to find zsd_data etc.)
    667  */
    668 int
    669 zone_key_delete(zone_key_t key)
    670 {
    671 	struct zsd_entry *zsdp = NULL;
    672 	zone_t *zone;
    673 
    674 	mutex_enter(&zsd_key_lock);
    675 	zsdp = zsd_find_mru(&zsd_registered_keys, key);
    676 	if (zsdp == NULL) {
    677 		mutex_exit(&zsd_key_lock);
    678 		return (-1);
    679 	}
    680 	list_remove(&zsd_registered_keys, zsdp);
    681 	mutex_exit(&zsd_key_lock);
    682 
    683 	mutex_enter(&zonehash_lock);
    684 	for (zone = list_head(&zone_active); zone != NULL;
    685 	    zone = list_next(&zone_active, zone)) {
    686 		struct zsd_entry *del;
    687 
    688 		mutex_enter(&zone->zone_lock);
    689 		del = zsd_find_mru(&zone->zone_zsd, key);
    690 		if (del == NULL) {
    691 			/*
    692 			 * Somebody else got here first e.g the zone going
    693 			 * away.
    694 			 */
    695 			mutex_exit(&zone->zone_lock);
    696 			continue;
    697 		}
    698 		ASSERT(del->zsd_shutdown == zsdp->zsd_shutdown);
    699 		ASSERT(del->zsd_destroy == zsdp->zsd_destroy);
    700 		if (del->zsd_shutdown != NULL &&
    701 		    (del->zsd_flags & ZSD_SHUTDOWN_ALL) == 0) {
    702 			del->zsd_flags |= ZSD_SHUTDOWN_NEEDED;
    703 			DTRACE_PROBE2(zsd__shutdown__needed,
    704 			    zone_t *, zone, zone_key_t, key);
    705 		}
    706 		if (del->zsd_destroy != NULL &&
    707 		    (del->zsd_flags & ZSD_DESTROY_ALL) == 0) {
    708 			del->zsd_flags |= ZSD_DESTROY_NEEDED;
    709 			DTRACE_PROBE2(zsd__destroy__needed,
    710 			    zone_t *, zone, zone_key_t, key);
    711 		}
    712 		mutex_exit(&zone->zone_lock);
    713 	}
    714 	mutex_exit(&zonehash_lock);
    715 	kmem_free(zsdp, sizeof (*zsdp));
    716 
    717 	/* Now call the shutdown and destroy callback for this key */
    718 	zsd_apply_all_zones(zsd_apply_shutdown, key);
    719 	zsd_apply_all_zones(zsd_apply_destroy, key);
    720 
    721 	/* Now we can free up the zsdp structures in each zone */
    722 	mutex_enter(&zonehash_lock);
    723 	for (zone = list_head(&zone_active); zone != NULL;
    724 	    zone = list_next(&zone_active, zone)) {
    725 		struct zsd_entry *del;
    726 
    727 		mutex_enter(&zone->zone_lock);
    728 		del = zsd_find(&zone->zone_zsd, key);
    729 		if (del != NULL) {
    730 			list_remove(&zone->zone_zsd, del);
    731 			ASSERT(!(del->zsd_flags & ZSD_ALL_INPROGRESS));
    732 			kmem_free(del, sizeof (*del));
    733 		}
    734 		mutex_exit(&zone->zone_lock);
    735 	}
    736 	mutex_exit(&zonehash_lock);
    737 
    738 	return (0);
    739 }
    740 
    741 /*
    742  * ZSD counterpart of pthread_setspecific().
    743  *
    744  * Since all zsd callbacks, including those with no create function,
    745  * have an entry in zone_zsd, if the key is registered it is part of
    746  * the zone_zsd list.
    747  * Return an error if the key wasn't registerd.
    748  */
    749 int
    750 zone_setspecific(zone_key_t key, zone_t *zone, const void *data)
    751 {
    752 	struct zsd_entry *t;
    753 
    754 	mutex_enter(&zone->zone_lock);
    755 	t = zsd_find_mru(&zone->zone_zsd, key);
    756 	if (t != NULL) {
    757 		/*
    758 		 * Replace old value with new
    759 		 */
    760 		t->zsd_data = (void *)data;
    761 		mutex_exit(&zone->zone_lock);
    762 		return (0);
    763 	}
    764 	mutex_exit(&zone->zone_lock);
    765 	return (-1);
    766 }
    767 
    768 /*
    769  * ZSD counterpart of pthread_getspecific().
    770  */
    771 void *
    772 zone_getspecific(zone_key_t key, zone_t *zone)
    773 {
    774 	struct zsd_entry *t;
    775 	void *data;
    776 
    777 	mutex_enter(&zone->zone_lock);
    778 	t = zsd_find_mru(&zone->zone_zsd, key);
    779 	data = (t == NULL ? NULL : t->zsd_data);
    780 	mutex_exit(&zone->zone_lock);
    781 	return (data);
    782 }
    783 
    784 /*
    785  * Function used to initialize a zone's list of ZSD callbacks and data
    786  * when the zone is being created.  The callbacks are initialized from
    787  * the template list (zsd_registered_keys). The constructor callback is
    788  * executed later (once the zone exists and with locks dropped).
    789  */
    790 static void
    791 zone_zsd_configure(zone_t *zone)
    792 {
    793 	struct zsd_entry *zsdp;
    794 	struct zsd_entry *t;
    795 
    796 	ASSERT(MUTEX_HELD(&zonehash_lock));
    797 	ASSERT(list_head(&zone->zone_zsd) == NULL);
    798 	mutex_enter(&zone->zone_lock);
    799 	mutex_enter(&zsd_key_lock);
    800 	for (zsdp = list_head(&zsd_registered_keys); zsdp != NULL;
    801 	    zsdp = list_next(&zsd_registered_keys, zsdp)) {
    802 		/*
    803 		 * Since this zone is ZONE_IS_UNCONFIGURED, zone_key_create
    804 		 * should not have added anything to it.
    805 		 */
    806 		ASSERT(zsd_find(&zone->zone_zsd, zsdp->zsd_key) == NULL);
    807 
    808 		t = kmem_zalloc(sizeof (*t), KM_SLEEP);
    809 		t->zsd_key = zsdp->zsd_key;
    810 		t->zsd_create = zsdp->zsd_create;
    811 		t->zsd_shutdown = zsdp->zsd_shutdown;
    812 		t->zsd_destroy = zsdp->zsd_destroy;
    813 		if (zsdp->zsd_create != NULL) {
    814 			t->zsd_flags = ZSD_CREATE_NEEDED;
    815 			DTRACE_PROBE2(zsd__create__needed,
    816 			    zone_t *, zone, zone_key_t, zsdp->zsd_key);
    817 		}
    818 		list_insert_tail(&zone->zone_zsd, t);
    819 	}
    820 	mutex_exit(&zsd_key_lock);
    821 	mutex_exit(&zone->zone_lock);
    822 }
    823 
    824 enum zsd_callback_type { ZSD_CREATE, ZSD_SHUTDOWN, ZSD_DESTROY };
    825 
    826 /*
    827  * Helper function to execute shutdown or destructor callbacks.
    828  */
    829 static void
    830 zone_zsd_callbacks(zone_t *zone, enum zsd_callback_type ct)
    831 {
    832 	struct zsd_entry *t;
    833 
    834 	ASSERT(ct == ZSD_SHUTDOWN || ct == ZSD_DESTROY);
    835 	ASSERT(ct != ZSD_SHUTDOWN || zone_status_get(zone) >= ZONE_IS_EMPTY);
    836 	ASSERT(ct != ZSD_DESTROY || zone_status_get(zone) >= ZONE_IS_DOWN);
    837 
    838 	/*
    839 	 * Run the callback solely based on what is registered for the zone
    840 	 * in zone_zsd. The global list can change independently of this
    841 	 * as keys are registered and unregistered and we don't register new
    842 	 * callbacks for a zone that is in the process of going away.
    843 	 */
    844 	mutex_enter(&zone->zone_lock);
    845 	for (t = list_head(&zone->zone_zsd); t != NULL;
    846 	    t = list_next(&zone->zone_zsd, t)) {
    847 		zone_key_t key = t->zsd_key;
    848 
    849 		/* Skip if no callbacks registered */
    850 
    851 		if (ct == ZSD_SHUTDOWN) {
    852 			if (t->zsd_shutdown != NULL &&
    853 			    (t->zsd_flags & ZSD_SHUTDOWN_ALL) == 0) {
    854 				t->zsd_flags |= ZSD_SHUTDOWN_NEEDED;
    855 				DTRACE_PROBE2(zsd__shutdown__needed,
    856 				    zone_t *, zone, zone_key_t, key);
    857 			}
    858 		} else {
    859 			if (t->zsd_destroy != NULL &&
    860 			    (t->zsd_flags & ZSD_DESTROY_ALL) == 0) {
    861 				t->zsd_flags |= ZSD_DESTROY_NEEDED;
    862 				DTRACE_PROBE2(zsd__destroy__needed,
    863 				    zone_t *, zone, zone_key_t, key);
    864 			}
    865 		}
    866 	}
    867 	mutex_exit(&zone->zone_lock);
    868 
    869 	/* Now call the shutdown and destroy callback for this key */
    870 	zsd_apply_all_keys(zsd_apply_shutdown, zone);
    871 	zsd_apply_all_keys(zsd_apply_destroy, zone);
    872 
    873 }
    874 
    875 /*
    876  * Called when the zone is going away; free ZSD-related memory, and
    877  * destroy the zone_zsd list.
    878  */
    879 static void
    880 zone_free_zsd(zone_t *zone)
    881 {
    882 	struct zsd_entry *t, *next;
    883 
    884 	/*
    885 	 * Free all the zsd_entry's we had on this zone.
    886 	 */
    887 	mutex_enter(&zone->zone_lock);
    888 	for (t = list_head(&zone->zone_zsd); t != NULL; t = next) {
    889 		next = list_next(&zone->zone_zsd, t);
    890 		list_remove(&zone->zone_zsd, t);
    891 		ASSERT(!(t->zsd_flags & ZSD_ALL_INPROGRESS));
    892 		kmem_free(t, sizeof (*t));
    893 	}
    894 	list_destroy(&zone->zone_zsd);
    895 	mutex_exit(&zone->zone_lock);
    896 
    897 }
    898 
    899 /*
    900  * Apply a function to all zones for particular key value.
    901  *
    902  * The applyfn has to drop zonehash_lock if it does some work, and
    903  * then reacquire it before it returns.
    904  * When the lock is dropped we don't follow list_next even
    905  * if it is possible to do so without any hazards. This is
    906  * because we want the design to allow for the list of zones
    907  * to change in any arbitrary way during the time the
    908  * lock was dropped.
    909  *
    910  * It is safe to restart the loop at list_head since the applyfn
    911  * changes the zsd_flags as it does work, so a subsequent
    912  * pass through will have no effect in applyfn, hence the loop will terminate
    913  * in at worst O(N^2).
    914  */
    915 static void
    916 zsd_apply_all_zones(zsd_applyfn_t *applyfn, zone_key_t key)
    917 {
    918 	zone_t *zone;
    919 
    920 	mutex_enter(&zonehash_lock);
    921 	zone = list_head(&zone_active);
    922 	while (zone != NULL) {
    923 		if ((applyfn)(&zonehash_lock, B_FALSE, zone, key)) {
    924 			/* Lock dropped - restart at head */
    925 			zone = list_head(&zone_active);
    926 		} else {
    927 			zone = list_next(&zone_active, zone);
    928 		}
    929 	}
    930 	mutex_exit(&zonehash_lock);
    931 }
    932 
    933 /*
    934  * Apply a function to all keys for a particular zone.
    935  *
    936  * The applyfn has to drop zonehash_lock if it does some work, and
    937  * then reacquire it before it returns.
    938  * When the lock is dropped we don't follow list_next even
    939  * if it is possible to do so without any hazards. This is
    940  * because we want the design to allow for the list of zsd callbacks
    941  * to change in any arbitrary way during the time the
    942  * lock was dropped.
    943  *
    944  * It is safe to restart the loop at list_head since the applyfn
    945  * changes the zsd_flags as it does work, so a subsequent
    946  * pass through will have no effect in applyfn, hence the loop will terminate
    947  * in at worst O(N^2).
    948  */
    949 static void
    950 zsd_apply_all_keys(zsd_applyfn_t *applyfn, zone_t *zone)
    951 {
    952 	struct zsd_entry *t;
    953 
    954 	mutex_enter(&zone->zone_lock);
    955 	t = list_head(&zone->zone_zsd);
    956 	while (t != NULL) {
    957 		if ((applyfn)(NULL, B_TRUE, zone, t->zsd_key)) {
    958 			/* Lock dropped - restart at head */
    959 			t = list_head(&zone->zone_zsd);
    960 		} else {
    961 			t = list_next(&zone->zone_zsd, t);
    962 		}
    963 	}
    964 	mutex_exit(&zone->zone_lock);
    965 }
    966 
    967 /*
    968  * Call the create function for the zone and key if CREATE_NEEDED
    969  * is set.
    970  * If some other thread gets here first and sets CREATE_INPROGRESS, then
    971  * we wait for that thread to complete so that we can ensure that
    972  * all the callbacks are done when we've looped over all zones/keys.
    973  *
    974  * When we call the create function, we drop the global held by the
    975  * caller, and return true to tell the caller it needs to re-evalute the
    976  * state.
    977  * If the caller holds zone_lock then zone_lock_held is set, and zone_lock
    978  * remains held on exit.
    979  */
    980 static boolean_t
    981 zsd_apply_create(kmutex_t *lockp, boolean_t zone_lock_held,
    982     zone_t *zone, zone_key_t key)
    983 {
    984 	void *result;
    985 	struct zsd_entry *t;
    986 	boolean_t dropped;
    987 
    988 	if (lockp != NULL) {
    989 		ASSERT(MUTEX_HELD(lockp));
    990 	}
    991 	if (zone_lock_held) {
    992 		ASSERT(MUTEX_HELD(&zone->zone_lock));
    993 	} else {
    994 		mutex_enter(&zone->zone_lock);
    995 	}
    996 
    997 	t = zsd_find(&zone->zone_zsd, key);
    998 	if (t == NULL) {
    999 		/*
   1000 		 * Somebody else got here first e.g the zone going
   1001 		 * away.
   1002 		 */
   1003 		if (!zone_lock_held)
   1004 			mutex_exit(&zone->zone_lock);
   1005 		return (B_FALSE);
   1006 	}
   1007 	dropped = B_FALSE;
   1008 	if (zsd_wait_for_inprogress(zone, t, lockp))
   1009 		dropped = B_TRUE;
   1010 
   1011 	if (t->zsd_flags & ZSD_CREATE_NEEDED) {
   1012 		t->zsd_flags &= ~ZSD_CREATE_NEEDED;
   1013 		t->zsd_flags |= ZSD_CREATE_INPROGRESS;
   1014 		DTRACE_PROBE2(zsd__create__inprogress,
   1015 		    zone_t *, zone, zone_key_t, key);
   1016 		mutex_exit(&zone->zone_lock);
   1017 		if (lockp != NULL)
   1018 			mutex_exit(lockp);
   1019 
   1020 		dropped = B_TRUE;
   1021 		ASSERT(t->zsd_create != NULL);
   1022 		DTRACE_PROBE2(zsd__create__start,
   1023 		    zone_t *, zone, zone_key_t, key);
   1024 
   1025 		result = (*t->zsd_create)(zone->zone_id);
   1026 
   1027 		DTRACE_PROBE2(zsd__create__end,
   1028 		    zone_t *, zone, voidn *, result);
   1029 
   1030 		ASSERT(result != NULL);
   1031 		if (lockp != NULL)
   1032 			mutex_enter(lockp);
   1033 		mutex_enter(&zone->zone_lock);
   1034 		t->zsd_data = result;
   1035 		t->zsd_flags &= ~ZSD_CREATE_INPROGRESS;
   1036 		t->zsd_flags |= ZSD_CREATE_COMPLETED;
   1037 		cv_broadcast(&t->zsd_cv);
   1038 		DTRACE_PROBE2(zsd__create__completed,
   1039 		    zone_t *, zone, zone_key_t, key);
   1040 	}
   1041 	if (!zone_lock_held)
   1042 		mutex_exit(&zone->zone_lock);
   1043 	return (dropped);
   1044 }
   1045 
   1046 /*
   1047  * Call the shutdown function for the zone and key if SHUTDOWN_NEEDED
   1048  * is set.
   1049  * If some other thread gets here first and sets *_INPROGRESS, then
   1050  * we wait for that thread to complete so that we can ensure that
   1051  * all the callbacks are done when we've looped over all zones/keys.
   1052  *
   1053  * When we call the shutdown function, we drop the global held by the
   1054  * caller, and return true to tell the caller it needs to re-evalute the
   1055  * state.
   1056  * If the caller holds zone_lock then zone_lock_held is set, and zone_lock
   1057  * remains held on exit.
   1058  */
   1059 static boolean_t
   1060 zsd_apply_shutdown(kmutex_t *lockp, boolean_t zone_lock_held,
   1061     zone_t *zone, zone_key_t key)
   1062 {
   1063 	struct zsd_entry *t;
   1064 	void *data;
   1065 	boolean_t dropped;
   1066 
   1067 	if (lockp != NULL) {
   1068 		ASSERT(MUTEX_HELD(lockp));
   1069 	}
   1070 	if (zone_lock_held) {
   1071 		ASSERT(MUTEX_HELD(&zone->zone_lock));
   1072 	} else {
   1073 		mutex_enter(&zone->zone_lock);
   1074 	}
   1075 
   1076 	t = zsd_find(&zone->zone_zsd, key);
   1077 	if (t == NULL) {
   1078 		/*
   1079 		 * Somebody else got here first e.g the zone going
   1080 		 * away.
   1081 		 */
   1082 		if (!zone_lock_held)
   1083 			mutex_exit(&zone->zone_lock);
   1084 		return (B_FALSE);
   1085 	}
   1086 	dropped = B_FALSE;
   1087 	if (zsd_wait_for_creator(zone, t, lockp))
   1088 		dropped = B_TRUE;
   1089 
   1090 	if (zsd_wait_for_inprogress(zone, t, lockp))
   1091 		dropped = B_TRUE;
   1092 
   1093 	if (t->zsd_flags & ZSD_SHUTDOWN_NEEDED) {
   1094 		t->zsd_flags &= ~ZSD_SHUTDOWN_NEEDED;
   1095 		t->zsd_flags |= ZSD_SHUTDOWN_INPROGRESS;
   1096 		DTRACE_PROBE2(zsd__shutdown__inprogress,
   1097 		    zone_t *, zone, zone_key_t, key);
   1098 		mutex_exit(&zone->zone_lock);
   1099 		if (lockp != NULL)
   1100 			mutex_exit(lockp);
   1101 		dropped = B_TRUE;
   1102 
   1103 		ASSERT(t->zsd_shutdown != NULL);
   1104 		data = t->zsd_data;
   1105 
   1106 		DTRACE_PROBE2(zsd__shutdown__start,
   1107 		    zone_t *, zone, zone_key_t, key);
   1108 
   1109 		(t->zsd_shutdown)(zone->zone_id, data);
   1110 		DTRACE_PROBE2(zsd__shutdown__end,
   1111 		    zone_t *, zone, zone_key_t, key);
   1112 
   1113 		if (lockp != NULL)
   1114 			mutex_enter(lockp);
   1115 		mutex_enter(&zone->zone_lock);
   1116 		t->zsd_flags &= ~ZSD_SHUTDOWN_INPROGRESS;
   1117 		t->zsd_flags |= ZSD_SHUTDOWN_COMPLETED;
   1118 		cv_broadcast(&t->zsd_cv);
   1119 		DTRACE_PROBE2(zsd__shutdown__completed,
   1120 		    zone_t *, zone, zone_key_t, key);
   1121 	}
   1122 	if (!zone_lock_held)
   1123 		mutex_exit(&zone->zone_lock);
   1124 	return (dropped);
   1125 }
   1126 
   1127 /*
   1128  * Call the destroy function for the zone and key if DESTROY_NEEDED
   1129  * is set.
   1130  * If some other thread gets here first and sets *_INPROGRESS, then
   1131  * we wait for that thread to complete so that we can ensure that
   1132  * all the callbacks are done when we've looped over all zones/keys.
   1133  *
   1134  * When we call the destroy function, we drop the global held by the
   1135  * caller, and return true to tell the caller it needs to re-evalute the
   1136  * state.
   1137  * If the caller holds zone_lock then zone_lock_held is set, and zone_lock
   1138  * remains held on exit.
   1139  */
   1140 static boolean_t
   1141 zsd_apply_destroy(kmutex_t *lockp, boolean_t zone_lock_held,
   1142     zone_t *zone, zone_key_t key)
   1143 {
   1144 	struct zsd_entry *t;
   1145 	void *data;
   1146 	boolean_t dropped;
   1147 
   1148 	if (lockp != NULL) {
   1149 		ASSERT(MUTEX_HELD(lockp));
   1150 	}
   1151 	if (zone_lock_held) {
   1152 		ASSERT(MUTEX_HELD(&zone->zone_lock));
   1153 	} else {
   1154 		mutex_enter(&zone->zone_lock);
   1155 	}
   1156 
   1157 	t = zsd_find(&zone->zone_zsd, key);
   1158 	if (t == NULL) {
   1159 		/*
   1160 		 * Somebody else got here first e.g the zone going
   1161 		 * away.
   1162 		 */
   1163 		if (!zone_lock_held)
   1164 			mutex_exit(&zone->zone_lock);
   1165 		return (B_FALSE);
   1166 	}
   1167 	dropped = B_FALSE;
   1168 	if (zsd_wait_for_creator(zone, t, lockp))
   1169 		dropped = B_TRUE;
   1170 
   1171 	if (zsd_wait_for_inprogress(zone, t, lockp))
   1172 		dropped = B_TRUE;
   1173 
   1174 	if (t->zsd_flags & ZSD_DESTROY_NEEDED) {
   1175 		t->zsd_flags &= ~ZSD_DESTROY_NEEDED;
   1176 		t->zsd_flags |= ZSD_DESTROY_INPROGRESS;
   1177 		DTRACE_PROBE2(zsd__destroy__inprogress,
   1178 		    zone_t *, zone, zone_key_t, key);
   1179 		mutex_exit(&zone->zone_lock);
   1180 		if (lockp != NULL)
   1181 			mutex_exit(lockp);
   1182 		dropped = B_TRUE;
   1183 
   1184 		ASSERT(t->zsd_destroy != NULL);
   1185 		data = t->zsd_data;
   1186 		DTRACE_PROBE2(zsd__destroy__start,
   1187 		    zone_t *, zone, zone_key_t, key);
   1188 
   1189 		(t->zsd_destroy)(zone->zone_id, data);
   1190 		DTRACE_PROBE2(zsd__destroy__end,
   1191 		    zone_t *, zone, zone_key_t, key);
   1192 
   1193 		if (lockp != NULL)
   1194 			mutex_enter(lockp);
   1195 		mutex_enter(&zone->zone_lock);
   1196 		t->zsd_data = NULL;
   1197 		t->zsd_flags &= ~ZSD_DESTROY_INPROGRESS;
   1198 		t->zsd_flags |= ZSD_DESTROY_COMPLETED;
   1199 		cv_broadcast(&t->zsd_cv);
   1200 		DTRACE_PROBE2(zsd__destroy__completed,
   1201 		    zone_t *, zone, zone_key_t, key);
   1202 	}
   1203 	if (!zone_lock_held)
   1204 		mutex_exit(&zone->zone_lock);
   1205 	return (dropped);
   1206 }
   1207 
   1208 /*
   1209  * Wait for any CREATE_NEEDED flag to be cleared.
   1210  * Returns true if lockp was temporarily dropped while waiting.
   1211  */
   1212 static boolean_t
   1213 zsd_wait_for_creator(zone_t *zone, struct zsd_entry *t, kmutex_t *lockp)
   1214 {
   1215 	boolean_t dropped = B_FALSE;
   1216 
   1217 	while (t->zsd_flags & ZSD_CREATE_NEEDED) {
   1218 		DTRACE_PROBE2(zsd__wait__for__creator,
   1219 		    zone_t *, zone, struct zsd_entry *, t);
   1220 		if (lockp != NULL) {
   1221 			dropped = B_TRUE;
   1222 			mutex_exit(lockp);
   1223 		}
   1224 		cv_wait(&t->zsd_cv, &zone->zone_lock);
   1225 		if (lockp != NULL) {
   1226 			/* First drop zone_lock to preserve order */
   1227 			mutex_exit(&zone->zone_lock);
   1228 			mutex_enter(lockp);
   1229 			mutex_enter(&zone->zone_lock);
   1230 		}
   1231 	}
   1232 	return (dropped);
   1233 }
   1234 
   1235 /*
   1236  * Wait for any INPROGRESS flag to be cleared.
   1237  * Returns true if lockp was temporarily dropped while waiting.
   1238  */
   1239 static boolean_t
   1240 zsd_wait_for_inprogress(zone_t *zone, struct zsd_entry *t, kmutex_t *lockp)
   1241 {
   1242 	boolean_t dropped = B_FALSE;
   1243 
   1244 	while (t->zsd_flags & ZSD_ALL_INPROGRESS) {
   1245 		DTRACE_PROBE2(zsd__wait__for__inprogress,
   1246 		    zone_t *, zone, struct zsd_entry *, t);
   1247 		if (lockp != NULL) {
   1248 			dropped = B_TRUE;
   1249 			mutex_exit(lockp);
   1250 		}
   1251 		cv_wait(&t->zsd_cv, &zone->zone_lock);
   1252 		if (lockp != NULL) {
   1253 			/* First drop zone_lock to preserve order */
   1254 			mutex_exit(&zone->zone_lock);
   1255 			mutex_enter(lockp);
   1256 			mutex_enter(&zone->zone_lock);
   1257 		}
   1258 	}
   1259 	return (dropped);
   1260 }
   1261 
   1262 /*
   1263  * Frees memory associated with the zone dataset list.
   1264  */
   1265 static void
   1266 zone_free_datasets(zone_t *zone)
   1267 {
   1268 	zone_dataset_t *t, *next;
   1269 
   1270 	for (t = list_head(&zone->zone_datasets); t != NULL; t = next) {
   1271 		next = list_next(&zone->zone_datasets, t);
   1272 		list_remove(&zone->zone_datasets, t);
   1273 		kmem_free(t->zd_dataset, strlen(t->zd_dataset) + 1);
   1274 		kmem_free(t, sizeof (*t));
   1275 	}
   1276 	list_destroy(&zone->zone_datasets);
   1277 }
   1278 
   1279 /*
   1280  * zone.cpu-shares resource control support.
   1281  */
   1282 /*ARGSUSED*/
   1283 static rctl_qty_t
   1284 zone_cpu_shares_usage(rctl_t *rctl, struct proc *p)
   1285 {
   1286 	ASSERT(MUTEX_HELD(&p->p_lock));
   1287 	return (p->p_zone->zone_shares);
   1288 }
   1289 
   1290 /*ARGSUSED*/
   1291 static int
   1292 zone_cpu_shares_set(rctl_t *rctl, struct proc *p, rctl_entity_p_t *e,
   1293     rctl_qty_t nv)
   1294 {
   1295 	ASSERT(MUTEX_HELD(&p->p_lock));
   1296 	ASSERT(e->rcep_t == RCENTITY_ZONE);
   1297 	if (e->rcep_p.zone == NULL)
   1298 		return (0);
   1299 
   1300 	e->rcep_p.zone->zone_shares = nv;
   1301 	return (0);
   1302 }
   1303 
   1304 static rctl_ops_t zone_cpu_shares_ops = {
   1305 	rcop_no_action,
   1306 	zone_cpu_shares_usage,
   1307 	zone_cpu_shares_set,
   1308 	rcop_no_test
   1309 };
   1310 
   1311 /*
   1312  * zone.cpu-cap resource control support.
   1313  */
   1314 /*ARGSUSED*/
   1315 static rctl_qty_t
   1316 zone_cpu_cap_get(rctl_t *rctl, struct proc *p)
   1317 {
   1318 	ASSERT(MUTEX_HELD(&p->p_lock));
   1319 	return (cpucaps_zone_get(p->p_zone));
   1320 }
   1321 
   1322 /*ARGSUSED*/
   1323 static int
   1324 zone_cpu_cap_set(rctl_t *rctl, struct proc *p, rctl_entity_p_t *e,
   1325     rctl_qty_t nv)
   1326 {
   1327 	zone_t *zone = e->rcep_p.zone;
   1328 
   1329 	ASSERT(MUTEX_HELD(&p->p_lock));
   1330 	ASSERT(e->rcep_t == RCENTITY_ZONE);
   1331 
   1332 	if (zone == NULL)
   1333 		return (0);
   1334 
   1335 	/*
   1336 	 * set cap to the new value.
   1337 	 */
   1338 	return (cpucaps_zone_set(zone, nv));
   1339 }
   1340 
   1341 static rctl_ops_t zone_cpu_cap_ops = {
   1342 	rcop_no_action,
   1343 	zone_cpu_cap_get,
   1344 	zone_cpu_cap_set,
   1345 	rcop_no_test
   1346 };
   1347 
   1348 /*ARGSUSED*/
   1349 static rctl_qty_t
   1350 zone_lwps_usage(rctl_t *r, proc_t *p)
   1351 {
   1352 	rctl_qty_t nlwps;
   1353 	zone_t *zone = p->p_zone;
   1354 
   1355 	ASSERT(MUTEX_HELD(&p->p_lock));
   1356 
   1357 	mutex_enter(&zone->zone_nlwps_lock);
   1358 	nlwps = zone->zone_nlwps;
   1359 	mutex_exit(&zone->zone_nlwps_lock);
   1360 
   1361 	return (nlwps);
   1362 }
   1363 
   1364 /*ARGSUSED*/
   1365 static int
   1366 zone_lwps_test(rctl_t *r, proc_t *p, rctl_entity_p_t *e, rctl_val_t *rcntl,
   1367     rctl_qty_t incr, uint_t flags)
   1368 {
   1369 	rctl_qty_t nlwps;
   1370 
   1371 	ASSERT(MUTEX_HELD(&p->p_lock));
   1372 	ASSERT(e->rcep_t == RCENTITY_ZONE);
   1373 	if (e->rcep_p.zone == NULL)
   1374 		return (0);
   1375 	ASSERT(MUTEX_HELD(&(e->rcep_p.zone->zone_nlwps_lock)));
   1376 	nlwps = e->rcep_p.zone->zone_nlwps;
   1377 
   1378 	if (nlwps + incr > rcntl->rcv_value)
   1379 		return (1);
   1380 
   1381 	return (0);
   1382 }
   1383 
   1384 /*ARGSUSED*/
   1385 static int
   1386 zone_lwps_set(rctl_t *rctl, struct proc *p, rctl_entity_p_t *e, rctl_qty_t nv)
   1387 {
   1388 	ASSERT(MUTEX_HELD(&p->p_lock));
   1389 	ASSERT(e->rcep_t == RCENTITY_ZONE);
   1390 	if (e->rcep_p.zone == NULL)
   1391 		return (0);
   1392 	e->rcep_p.zone->zone_nlwps_ctl = nv;
   1393 	return (0);
   1394 }
   1395 
   1396 static rctl_ops_t zone_lwps_ops = {
   1397 	rcop_no_action,
   1398 	zone_lwps_usage,
   1399 	zone_lwps_set,
   1400 	zone_lwps_test,
   1401 };
   1402 
   1403 /*ARGSUSED*/
   1404 static int
   1405 zone_shmmax_test(rctl_t *r, proc_t *p, rctl_entity_p_t *e, rctl_val_t *rval,
   1406     rctl_qty_t incr, uint_t flags)
   1407 {
   1408 	rctl_qty_t v;
   1409 	ASSERT(MUTEX_HELD(&p->p_lock));
   1410 	ASSERT(e->rcep_t == RCENTITY_ZONE);
   1411 	v = e->rcep_p.zone->zone_shmmax + incr;
   1412 	if (v > rval->rcv_value)
   1413 		return (1);
   1414 	return (0);
   1415 }
   1416 
   1417 static rctl_ops_t zone_shmmax_ops = {
   1418 	rcop_no_action,
   1419 	rcop_no_usage,
   1420 	rcop_no_set,
   1421 	zone_shmmax_test
   1422 };
   1423 
   1424 /*ARGSUSED*/
   1425 static int
   1426 zone_shmmni_test(rctl_t *r, proc_t *p, rctl_entity_p_t *e, rctl_val_t *rval,
   1427     rctl_qty_t incr, uint_t flags)
   1428 {
   1429 	rctl_qty_t v;
   1430 	ASSERT(MUTEX_HELD(&p->p_lock));
   1431 	ASSERT(e->rcep_t == RCENTITY_ZONE);
   1432 	v = e->rcep_p.zone->zone_ipc.ipcq_shmmni + incr;
   1433 	if (v > rval->rcv_value)
   1434 		return (1);
   1435 	return (0);
   1436 }
   1437 
   1438 static rctl_ops_t zone_shmmni_ops = {
   1439 	rcop_no_action,
   1440 	rcop_no_usage,
   1441 	rcop_no_set,
   1442 	zone_shmmni_test
   1443 };
   1444 
   1445 /*ARGSUSED*/
   1446 static int
   1447 zone_semmni_test(rctl_t *r, proc_t *p, rctl_entity_p_t *e, rctl_val_t *rval,
   1448     rctl_qty_t incr, uint_t flags)
   1449 {
   1450 	rctl_qty_t v;
   1451 	ASSERT(MUTEX_HELD(&p->p_lock));
   1452 	ASSERT(e->rcep_t == RCENTITY_ZONE);
   1453 	v = e->rcep_p.zone->zone_ipc.ipcq_semmni + incr;
   1454 	if (v > rval->rcv_value)
   1455 		return (1);
   1456 	return (0);
   1457 }
   1458 
   1459 static rctl_ops_t zone_semmni_ops = {
   1460 	rcop_no_action,
   1461 	rcop_no_usage,
   1462 	rcop_no_set,
   1463 	zone_semmni_test
   1464 };
   1465 
   1466 /*ARGSUSED*/
   1467 static int
   1468 zone_msgmni_test(rctl_t *r, proc_t *p, rctl_entity_p_t *e, rctl_val_t *rval,
   1469     rctl_qty_t incr, uint_t flags)
   1470 {
   1471 	rctl_qty_t v;
   1472 	ASSERT(MUTEX_HELD(&p->p_lock));
   1473 	ASSERT(e->rcep_t == RCENTITY_ZONE);
   1474 	v = e->rcep_p.zone->zone_ipc.ipcq_msgmni + incr;
   1475 	if (v > rval->rcv_value)
   1476 		return (1);
   1477 	return (0);
   1478 }
   1479 
   1480 static rctl_ops_t zone_msgmni_ops = {
   1481 	rcop_no_action,
   1482 	rcop_no_usage,
   1483 	rcop_no_set,
   1484 	zone_msgmni_test
   1485 };
   1486 
   1487 /*ARGSUSED*/
   1488 static rctl_qty_t
   1489 zone_locked_mem_usage(rctl_t *rctl, struct proc *p)
   1490 {
   1491 	rctl_qty_t q;
   1492 	ASSERT(MUTEX_HELD(&p->p_lock));
   1493 	mutex_enter(&p->p_zone->zone_mem_lock);
   1494 	q = p->p_zone->zone_locked_mem;
   1495 	mutex_exit(&p->p_zone->zone_mem_lock);
   1496 	return (q);
   1497 }
   1498 
   1499 /*ARGSUSED*/
   1500 static int
   1501 zone_locked_mem_test(rctl_t *r, proc_t *p, rctl_entity_p_t *e,
   1502     rctl_val_t *rcntl, rctl_qty_t incr, uint_t flags)
   1503 {
   1504 	rctl_qty_t q;
   1505 	zone_t *z;
   1506 
   1507 	z = e->rcep_p.zone;
   1508 	ASSERT(MUTEX_HELD(&p->p_lock));
   1509 	ASSERT(MUTEX_HELD(&z->zone_mem_lock));
   1510 	q = z->zone_locked_mem;
   1511 	if (q + incr > rcntl->rcv_value)
   1512 		return (1);
   1513 	return (0);
   1514 }
   1515 
   1516 /*ARGSUSED*/
   1517 static int
   1518 zone_locked_mem_set(rctl_t *rctl, struct proc *p, rctl_entity_p_t *e,
   1519     rctl_qty_t nv)
   1520 {
   1521 	ASSERT(MUTEX_HELD(&p->p_lock));
   1522 	ASSERT(e->rcep_t == RCENTITY_ZONE);
   1523 	if (e->rcep_p.zone == NULL)
   1524 		return (0);
   1525 	e->rcep_p.zone->zone_locked_mem_ctl = nv;
   1526 	return (0);
   1527 }
   1528 
   1529 static rctl_ops_t zone_locked_mem_ops = {
   1530 	rcop_no_action,
   1531 	zone_locked_mem_usage,
   1532 	zone_locked_mem_set,
   1533 	zone_locked_mem_test
   1534 };
   1535 
   1536 /*ARGSUSED*/
   1537 static rctl_qty_t
   1538 zone_max_swap_usage(rctl_t *rctl, struct proc *p)
   1539 {
   1540 	rctl_qty_t q;
   1541 	zone_t *z = p->p_zone;
   1542 
   1543 	ASSERT(MUTEX_HELD(&p->p_lock));
   1544 	mutex_enter(&z->zone_mem_lock);
   1545 	q = z->zone_max_swap;
   1546 	mutex_exit(&z->zone_mem_lock);
   1547 	return (q);
   1548 }
   1549 
   1550 /*ARGSUSED*/
   1551 static int
   1552 zone_max_swap_test(rctl_t *r, proc_t *p, rctl_entity_p_t *e,
   1553     rctl_val_t *rcntl, rctl_qty_t incr, uint_t flags)
   1554 {
   1555 	rctl_qty_t q;
   1556 	zone_t *z;
   1557 
   1558 	z = e->rcep_p.zone;
   1559 	ASSERT(MUTEX_HELD(&p->p_lock));
   1560 	ASSERT(MUTEX_HELD(&z->zone_mem_lock));
   1561 	q = z->zone_max_swap;
   1562 	if (q + incr > rcntl->rcv_value)
   1563 		return (1);
   1564 	return (0);
   1565 }
   1566 
   1567 /*ARGSUSED*/
   1568 static int
   1569 zone_max_swap_set(rctl_t *rctl, struct proc *p, rctl_entity_p_t *e,
   1570     rctl_qty_t nv)
   1571 {
   1572 	ASSERT(MUTEX_HELD(&p->p_lock));
   1573 	ASSERT(e->rcep_t == RCENTITY_ZONE);
   1574 	if (e->rcep_p.zone == NULL)
   1575 		return (0);
   1576 	e->rcep_p.zone->zone_max_swap_ctl = nv;
   1577 	return (0);
   1578 }
   1579 
   1580 static rctl_ops_t zone_max_swap_ops = {
   1581 	rcop_no_action,
   1582 	zone_max_swap_usage,
   1583 	zone_max_swap_set,
   1584 	zone_max_swap_test
   1585 };
   1586 
   1587 /*
   1588  * Helper function to brand the zone with a unique ID.
   1589  */
   1590 static void
   1591 zone_uniqid(zone_t *zone)
   1592 {
   1593 	static uint64_t uniqid = 0;
   1594 
   1595 	ASSERT(MUTEX_HELD(&zonehash_lock));
   1596 	zone->zone_uniqid = uniqid++;
   1597 }
   1598 
   1599 /*
   1600  * Returns a held pointer to the "kcred" for the specified zone.
   1601  */
   1602 struct cred *
   1603 zone_get_kcred(zoneid_t zoneid)
   1604 {
   1605 	zone_t *zone;
   1606 	cred_t *cr;
   1607 
   1608 	if ((zone = zone_find_by_id(zoneid)) == NULL)
   1609 		return (NULL);
   1610 	cr = zone->zone_kcred;
   1611 	crhold(cr);
   1612 	zone_rele(zone);
   1613 	return (cr);
   1614 }
   1615 
   1616 static int
   1617 zone_lockedmem_kstat_update(kstat_t *ksp, int rw)
   1618 {
   1619 	zone_t *zone = ksp->ks_private;
   1620 	zone_kstat_t *zk = ksp->ks_data;
   1621 
   1622 	if (rw == KSTAT_WRITE)
   1623 		return (EACCES);
   1624 
   1625 	zk->zk_usage.value.ui64 = zone->zone_locked_mem;
   1626 	zk->zk_value.value.ui64 = zone->zone_locked_mem_ctl;
   1627 	return (0);
   1628 }
   1629 
   1630 static int
   1631 zone_swapresv_kstat_update(kstat_t *ksp, int rw)
   1632 {
   1633 	zone_t *zone = ksp->ks_private;
   1634 	zone_kstat_t *zk = ksp->ks_data;
   1635 
   1636 	if (rw == KSTAT_WRITE)
   1637 		return (EACCES);
   1638 
   1639 	zk->zk_usage.value.ui64 = zone->zone_max_swap;
   1640 	zk->zk_value.value.ui64 = zone->zone_max_swap_ctl;
   1641 	return (0);
   1642 }
   1643 
   1644 static void
   1645 zone_kstat_create(zone_t *zone)
   1646 {
   1647 	kstat_t *ksp;
   1648 	zone_kstat_t *zk;
   1649 
   1650 	ksp = rctl_kstat_create_zone(zone, "lockedmem", KSTAT_TYPE_NAMED,
   1651 	    sizeof (zone_kstat_t) / sizeof (kstat_named_t),
   1652 	    KSTAT_FLAG_VIRTUAL);
   1653 
   1654 	if (ksp == NULL)
   1655 		return;
   1656 
   1657 	zk = ksp->ks_data = kmem_alloc(sizeof (zone_kstat_t), KM_SLEEP);
   1658 	ksp->ks_data_size += strlen(zone->zone_name) + 1;
   1659 	kstat_named_init(&zk->zk_zonename, "zonename", KSTAT_DATA_STRING);
   1660 	kstat_named_setstr(&zk->zk_zonename, zone->zone_name);
   1661 	kstat_named_init(&zk->zk_usage, "usage", KSTAT_DATA_UINT64);
   1662 	kstat_named_init(&zk->zk_value, "value", KSTAT_DATA_UINT64);
   1663 	ksp->ks_update = zone_lockedmem_kstat_update;
   1664 	ksp->ks_private = zone;
   1665 	kstat_install(ksp);
   1666 
   1667 	zone->zone_lockedmem_kstat = ksp;
   1668 
   1669 	ksp = rctl_kstat_create_zone(zone, "swapresv", KSTAT_TYPE_NAMED,
   1670 	    sizeof (zone_kstat_t) / sizeof (kstat_named_t),
   1671 	    KSTAT_FLAG_VIRTUAL);
   1672 
   1673 	if (ksp == NULL)
   1674 		return;
   1675 
   1676 	zk = ksp->ks_data = kmem_alloc(sizeof (zone_kstat_t), KM_SLEEP);
   1677 	ksp->ks_data_size += strlen(zone->zone_name) + 1;
   1678 	kstat_named_init(&zk->zk_zonename, "zonename", KSTAT_DATA_STRING);
   1679 	kstat_named_setstr(&zk->zk_zonename, zone->zone_name);
   1680 	kstat_named_init(&zk->zk_usage, "usage", KSTAT_DATA_UINT64);
   1681 	kstat_named_init(&zk->zk_value, "value", KSTAT_DATA_UINT64);
   1682 	ksp->ks_update = zone_swapresv_kstat_update;
   1683 	ksp->ks_private = zone;
   1684 	kstat_install(ksp);
   1685 
   1686 	zone->zone_swapresv_kstat = ksp;
   1687 }
   1688 
   1689 static void
   1690 zone_kstat_delete(zone_t *zone)
   1691 {
   1692 	void *data;
   1693 
   1694 	if (zone->zone_lockedmem_kstat != NULL) {
   1695 		data = zone->zone_lockedmem_kstat->ks_data;
   1696 		kstat_delete(zone->zone_lockedmem_kstat);
   1697 		kmem_free(data, sizeof (zone_kstat_t));
   1698 	}
   1699 	if (zone->zone_swapresv_kstat != NULL) {
   1700 		data = zone->zone_swapresv_kstat->ks_data;
   1701 		kstat_delete(zone->zone_swapresv_kstat);
   1702 		kmem_free(data, sizeof (zone_kstat_t));
   1703 	}
   1704 }
   1705 
   1706 /*
   1707  * Called very early on in boot to initialize the ZSD list so that
   1708  * zone_key_create() can be called before zone_init().  It also initializes
   1709  * portions of zone0 which may be used before zone_init() is called.  The
   1710  * variable "global_zone" will be set when zone0 is fully initialized by
   1711  * zone_init().
   1712  */
   1713 void
   1714 zone_zsd_init(void)
   1715 {
   1716 	mutex_init(&zonehash_lock, NULL, MUTEX_DEFAULT, NULL);
   1717 	mutex_init(&zsd_key_lock, NULL, MUTEX_DEFAULT, NULL);
   1718 	list_create(&zsd_registered_keys, sizeof (struct zsd_entry),
   1719 	    offsetof(struct zsd_entry, zsd_linkage));
   1720 	list_create(&zone_active, sizeof (zone_t),
   1721 	    offsetof(zone_t, zone_linkage));
   1722 	list_create(&zone_deathrow, sizeof (zone_t),
   1723 	    offsetof(zone_t, zone_linkage));
   1724 
   1725 	mutex_init(&zone0.zone_lock, NULL, MUTEX_DEFAULT, NULL);
   1726 	mutex_init(&zone0.zone_nlwps_lock, NULL, MUTEX_DEFAULT, NULL);
   1727 	mutex_init(&zone0.zone_mem_lock, NULL, MUTEX_DEFAULT, NULL);
   1728 	zone0.zone_shares = 1;
   1729 	zone0.zone_nlwps = 0;
   1730 	zone0.zone_nlwps_ctl = INT_MAX;
   1731 	zone0.zone_locked_mem = 0;
   1732 	zone0.zone_locked_mem_ctl = UINT64_MAX;
   1733 	ASSERT(zone0.zone_max_swap == 0);
   1734 	zone0.zone_max_swap_ctl = UINT64_MAX;
   1735 	zone0.zone_shmmax = 0;
   1736 	zone0.zone_ipc.ipcq_shmmni = 0;
   1737 	zone0.zone_ipc.ipcq_semmni = 0;
   1738 	zone0.zone_ipc.ipcq_msgmni = 0;
   1739 	zone0.zone_name = GLOBAL_ZONENAME;
   1740 	zone0.zone_nodename = utsname.nodename;
   1741 	zone0.zone_domain = srpc_domain;
   1742 	zone0.zone_hostid = HW_INVALID_HOSTID;
   1743 	zone0.zone_ref = 1;
   1744 	zone0.zone_id = GLOBAL_ZONEID;
   1745 	zone0.zone_status = ZONE_IS_RUNNING;
   1746 	zone0.zone_rootpath = "/";
   1747 	zone0.zone_rootpathlen = 2;
   1748 	zone0.zone_psetid = ZONE_PS_INVAL;
   1749 	zone0.zone_ncpus = 0;
   1750 	zone0.zone_ncpus_online = 0;
   1751 	zone0.zone_proc_initpid = 1;
   1752 	zone0.zone_initname = initname;
   1753 	zone0.zone_lockedmem_kstat = NULL;
   1754 	zone0.zone_swapresv_kstat = NULL;
   1755 	list_create(&zone0.zone_zsd, sizeof (struct zsd_entry),
   1756 	    offsetof(struct zsd_entry, zsd_linkage));
   1757 	list_insert_head(&zone_active, &zone0);
   1758 
   1759 	/*
   1760 	 * The root filesystem is not mounted yet, so zone_rootvp cannot be set
   1761 	 * to anything meaningful.  It is assigned to be 'rootdir' in
   1762 	 * vfs_mountroot().
   1763 	 */
   1764 	zone0.zone_rootvp = NULL;
   1765 	zone0.zone_vfslist = NULL;
   1766 	zone0.zone_bootargs = initargs;
   1767 	zone0.zone_privset = kmem_alloc(sizeof (priv_set_t), KM_SLEEP);
   1768 	/*
   1769 	 * The global zone has all privileges
   1770 	 */
   1771 	priv_fillset(zone0.zone_privset);
   1772 	/*
   1773 	 * Add p0 to the global zone
   1774 	 */
   1775 	zone0.zone_zsched = &p0;
   1776 	p0.p_zone = &zone0;
   1777 }
   1778 
   1779 /*
   1780  * Compute a hash value based on the contents of the label and the DOI.  The
   1781  * hash algorithm is somewhat arbitrary, but is based on the observation that
   1782  * humans will likely pick labels that differ by amounts that work out to be
   1783  * multiples of the number of hash chains, and thus stirring in some primes
   1784  * should help.
   1785  */
   1786 static uint_t
   1787 hash_bylabel(void *hdata, mod_hash_key_t key)
   1788 {
   1789 	const ts_label_t *lab = (ts_label_t *)key;
   1790 	const uint32_t *up, *ue;
   1791 	uint_t hash;
   1792 	int i;
   1793 
   1794 	_NOTE(ARGUNUSED(hdata));
   1795 
   1796 	hash = lab->tsl_doi + (lab->tsl_doi << 1);
   1797 	/* we depend on alignment of label, but not representation */
   1798 	up = (const uint32_t *)&lab->tsl_label;
   1799 	ue = up + sizeof (lab->tsl_label) / sizeof (*up);
   1800 	i = 1;
   1801 	while (up < ue) {
   1802 		/* using 2^n + 1, 1 <= n <= 16 as source of many primes */
   1803 		hash += *up + (*up << ((i % 16) + 1));
   1804 		up++;
   1805 		i++;
   1806 	}
   1807 	return (hash);
   1808 }
   1809 
   1810 /*
   1811  * All that mod_hash cares about here is zero (equal) versus non-zero (not
   1812  * equal).  This may need to be changed if less than / greater than is ever
   1813  * needed.
   1814  */
   1815 static int
   1816 hash_labelkey_cmp(mod_hash_key_t key1, mod_hash_key_t key2)
   1817 {
   1818 	ts_label_t *lab1 = (ts_label_t *)key1;
   1819 	ts_label_t *lab2 = (ts_label_t *)key2;
   1820 
   1821 	return (label_equal(lab1, lab2) ? 0 : 1);
   1822 }
   1823 
   1824 /*
   1825  * Called by main() to initialize the zones framework.
   1826  */
   1827 void
   1828 zone_init(void)
   1829 {
   1830 	rctl_dict_entry_t *rde;
   1831 	rctl_val_t *dval;
   1832 	rctl_set_t *set;
   1833 	rctl_alloc_gp_t *gp;
   1834 	rctl_entity_p_t e;
   1835 	int res;
   1836 
   1837 	ASSERT(curproc == &p0);
   1838 
   1839 	/*
   1840 	 * Create ID space for zone IDs.  ID 0 is reserved for the
   1841 	 * global zone.
   1842 	 */
   1843 	zoneid_space = id_space_create("zoneid_space", 1, MAX_ZONEID);
   1844 
   1845 	/*
   1846 	 * Initialize generic zone resource controls, if any.
   1847 	 */
   1848 	rc_zone_cpu_shares = rctl_register("zone.cpu-shares",
   1849 	    RCENTITY_ZONE, RCTL_GLOBAL_SIGNAL_NEVER | RCTL_GLOBAL_DENY_NEVER |
   1850 	    RCTL_GLOBAL_NOBASIC | RCTL_GLOBAL_COUNT | RCTL_GLOBAL_SYSLOG_NEVER,
   1851 	    FSS_MAXSHARES, FSS_MAXSHARES, &zone_cpu_shares_ops);
   1852 
   1853 	rc_zone_cpu_cap = rctl_register("zone.cpu-cap",
   1854 	    RCENTITY_ZONE, RCTL_GLOBAL_SIGNAL_NEVER | RCTL_GLOBAL_DENY_ALWAYS |
   1855 	    RCTL_GLOBAL_NOBASIC | RCTL_GLOBAL_COUNT |RCTL_GLOBAL_SYSLOG_NEVER |
   1856 	    RCTL_GLOBAL_INFINITE,
   1857 	    MAXCAP, MAXCAP, &zone_cpu_cap_ops);
   1858 
   1859 	rc_zone_nlwps = rctl_register("zone.max-lwps", RCENTITY_ZONE,
   1860 	    RCTL_GLOBAL_NOACTION | RCTL_GLOBAL_NOBASIC | RCTL_GLOBAL_COUNT,
   1861 	    INT_MAX, INT_MAX, &zone_lwps_ops);
   1862 	/*
   1863 	 * System V IPC resource controls
   1864 	 */
   1865 	rc_zone_msgmni = rctl_register("zone.max-msg-ids",
   1866 	    RCENTITY_ZONE, RCTL_GLOBAL_DENY_ALWAYS | RCTL_GLOBAL_NOBASIC |
   1867 	    RCTL_GLOBAL_COUNT, IPC_IDS_MAX, IPC_IDS_MAX, &zone_msgmni_ops);
   1868 
   1869 	rc_zone_semmni = rctl_register("zone.max-sem-ids",
   1870 	    RCENTITY_ZONE, RCTL_GLOBAL_DENY_ALWAYS | RCTL_GLOBAL_NOBASIC |
   1871 	    RCTL_GLOBAL_COUNT, IPC_IDS_MAX, IPC_IDS_MAX, &zone_semmni_ops);
   1872 
   1873 	rc_zone_shmmni = rctl_register("zone.max-shm-ids",
   1874 	    RCENTITY_ZONE, RCTL_GLOBAL_DENY_ALWAYS | RCTL_GLOBAL_NOBASIC |
   1875 	    RCTL_GLOBAL_COUNT, IPC_IDS_MAX, IPC_IDS_MAX, &zone_shmmni_ops);
   1876 
   1877 	rc_zone_shmmax = rctl_register("zone.max-shm-memory",
   1878 	    RCENTITY_ZONE, RCTL_GLOBAL_DENY_ALWAYS | RCTL_GLOBAL_NOBASIC |
   1879 	    RCTL_GLOBAL_BYTES, UINT64_MAX, UINT64_MAX, &zone_shmmax_ops);
   1880 
   1881 	/*
   1882 	 * Create a rctl_val with PRIVILEGED, NOACTION, value = 1.  Then attach
   1883 	 * this at the head of the rctl_dict_entry for ``zone.cpu-shares''.
   1884 	 */
   1885 	dval = kmem_cache_alloc(rctl_val_cache, KM_SLEEP);
   1886 	bzero(dval, sizeof (rctl_val_t));
   1887 	dval->rcv_value = 1;
   1888 	dval->rcv_privilege = RCPRIV_PRIVILEGED;
   1889 	dval->rcv_flagaction = RCTL_LOCAL_NOACTION;
   1890 	dval->rcv_action_recip_pid = -1;
   1891 
   1892 	rde = rctl_dict_lookup("zone.cpu-shares");
   1893 	(void) rctl_val_list_insert(&rde->rcd_default_value, dval);
   1894 
   1895 	rc_zone_locked_mem = rctl_register("zone.max-locked-memory",
   1896 	    RCENTITY_ZONE, RCTL_GLOBAL_NOBASIC | RCTL_GLOBAL_BYTES |
   1897 	    RCTL_GLOBAL_DENY_ALWAYS, UINT64_MAX, UINT64_MAX,
   1898 	    &zone_locked_mem_ops);
   1899 
   1900 	rc_zone_max_swap = rctl_register("zone.max-swap",
   1901 	    RCENTITY_ZONE, RCTL_GLOBAL_NOBASIC | RCTL_GLOBAL_BYTES |
   1902 	    RCTL_GLOBAL_DENY_ALWAYS, UINT64_MAX, UINT64_MAX,
   1903 	    &zone_max_swap_ops);
   1904 
   1905 	/*
   1906 	 * Initialize the ``global zone''.
   1907 	 */
   1908 	set = rctl_set_create();
   1909 	gp = rctl_set_init_prealloc(RCENTITY_ZONE);
   1910 	mutex_enter(&p0.p_lock);
   1911 	e.rcep_p.zone = &zone0;
   1912 	e.rcep_t = RCENTITY_ZONE;
   1913 	zone0.zone_rctls = rctl_set_init(RCENTITY_ZONE, &p0, &e, set,
   1914 	    gp);
   1915 
   1916 	zone0.zone_nlwps = p0.p_lwpcnt;
   1917 	zone0.zone_ntasks = 1;
   1918 	mutex_exit(&p0.p_lock);
   1919 	zone0.zone_restart_init = B_TRUE;
   1920 	zone0.zone_brand = &native_brand;
   1921 	rctl_prealloc_destroy(gp);
   1922 	/*
   1923 	 * pool_default hasn't been initialized yet, so we let pool_init()
   1924 	 * take care of making sure the global zone is in the default pool.
   1925 	 */
   1926 
   1927 	/*
   1928 	 * Initialize global zone kstats
   1929 	 */
   1930 	zone_kstat_create(&zone0);
   1931 
   1932 	/*
   1933 	 * Initialize zone label.
   1934 	 * mlp are initialized when tnzonecfg is loaded.
   1935 	 */
   1936 	zone0.zone_slabel = l_admin_low;
   1937 	rw_init(&zone0.zone_mlps.mlpl_rwlock, NULL, RW_DEFAULT, NULL);
   1938 	label_hold(l_admin_low);
   1939 
   1940 	/*
   1941 	 * Initialise the lock for the database structure used by mntfs.
   1942 	 */
   1943 	rw_init(&zone0.zone_mntfs_db_lock, NULL, RW_DEFAULT, NULL);
   1944 
   1945 	mutex_enter(&zonehash_lock);
   1946 	zone_uniqid(&zone0);
   1947 	ASSERT(zone0.zone_uniqid == GLOBAL_ZONEUNIQID);
   1948 
   1949 	zonehashbyid = mod_hash_create_idhash("zone_by_id", zone_hash_size,
   1950 	    mod_hash_null_valdtor);
   1951 	zonehashbyname = mod_hash_create_strhash("zone_by_name",
   1952 	    zone_hash_size, mod_hash_null_valdtor);
   1953 	/*
   1954 	 * maintain zonehashbylabel only for labeled systems
   1955 	 */
   1956 	if (is_system_labeled())
   1957 		zonehashbylabel = mod_hash_create_extended("zone_by_label",
   1958 		    zone_hash_size, mod_hash_null_keydtor,
   1959 		    mod_hash_null_valdtor, hash_bylabel, NULL,
   1960 		    hash_labelkey_cmp, KM_SLEEP);
   1961 	zonecount = 1;
   1962 
   1963 	(void) mod_hash_insert(zonehashbyid, (mod_hash_key_t)GLOBAL_ZONEID,
   1964 	    (mod_hash_val_t)&zone0);
   1965 	(void) mod_hash_insert(zonehashbyname, (mod_hash_key_t)zone0.zone_name,
   1966 	    (mod_hash_val_t)&zone0);
   1967 	if (is_system_labeled()) {
   1968 		zone0.zone_flags |= ZF_HASHED_LABEL;
   1969 		(void) mod_hash_insert(zonehashbylabel,
   1970 		    (mod_hash_key_t)zone0.zone_slabel, (mod_hash_val_t)&zone0);
   1971 	}
   1972 	mutex_exit(&zonehash_lock);
   1973 
   1974 	/*
   1975 	 * We avoid setting zone_kcred until now, since kcred is initialized
   1976 	 * sometime after zone_zsd_init() and before zone_init().
   1977 	 */
   1978 	zone0.zone_kcred = kcred;
   1979 	/*
   1980 	 * The global zone is fully initialized (except for zone_rootvp which
   1981 	 * will be set when the root filesystem is mounted).
   1982 	 */
   1983 	global_zone = &zone0;
   1984 
   1985 	/*
   1986 	 * Setup an event channel to send zone status change notifications on
   1987 	 */
   1988 	res = sysevent_evc_bind(ZONE_EVENT_CHANNEL, &zone_event_chan,
   1989 	    EVCH_CREAT);
   1990 
   1991 	if (res)
   1992 		panic("Sysevent_evc_bind failed during zone setup.\n");
   1993 
   1994 }
   1995 
   1996 static void
   1997 zone_free(zone_t *zone)
   1998 {
   1999 	ASSERT(zone != global_zone);
   2000 	ASSERT(zone->zone_ntasks == 0);
   2001 	ASSERT(zone->zone_nlwps == 0);
   2002 	ASSERT(zone->zone_cred_ref == 0);
   2003 	ASSERT(zone->zone_kcred == NULL);
   2004 	ASSERT(zone_status_get(zone) == ZONE_IS_DEAD ||
   2005 	    zone_status_get(zone) == ZONE_IS_UNINITIALIZED);
   2006 
   2007 	/*
   2008 	 * Remove any zone caps.
   2009 	 */
   2010 	cpucaps_zone_remove(zone);
   2011 
   2012 	ASSERT(zone->zone_cpucap == NULL);
   2013 
   2014 	/* remove from deathrow list */
   2015 	if (zone_status_get(zone) == ZONE_IS_DEAD) {
   2016 		ASSERT(zone->zone_ref == 0);
   2017 		mutex_enter(&zone_deathrow_lock);
   2018 		list_remove(&zone_deathrow, zone);
   2019 		mutex_exit(&zone_deathrow_lock);
   2020 	}
   2021 
   2022 	zone_free_zsd(zone);
   2023 	zone_free_datasets(zone);
   2024 	list_destroy(&zone->zone_dl_list);
   2025 
   2026 	if (zone->zone_rootvp != NULL)
   2027 		VN_RELE(zone->zone_rootvp);
   2028 	if (zone->zone_rootpath)
   2029 		kmem_free(zone->zone_rootpath, zone->zone_rootpathlen);
   2030 	if (zone->zone_name != NULL)
   2031 		kmem_free(zone->zone_name, ZONENAME_MAX);
   2032 	if (zone->zone_slabel != NULL)
   2033 		label_rele(zone->zone_slabel);
   2034 	if (zone->zone_nodename != NULL)
   2035 		kmem_free(zone->zone_nodename, _SYS_NMLN);
   2036 	if (zone->zone_domain != NULL)
   2037 		kmem_free(zone->zone_domain, _SYS_NMLN);
   2038 	if (zone->zone_privset != NULL)
   2039 		kmem_free(zone->zone_privset, sizeof (priv_set_t));
   2040 	if (zone->zone_rctls != NULL)
   2041 		rctl_set_free(zone->zone_rctls);
   2042 	if (zone->zone_bootargs != NULL)
   2043 		kmem_free(zone->zone_bootargs, strlen(zone->zone_bootargs) + 1);
   2044 	if (zone->zone_initname != NULL)
   2045 		kmem_free(zone->zone_initname, strlen(zone->zone_initname) + 1);
   2046 	id_free(zoneid_space, zone->zone_id);
   2047 	mutex_destroy(&zone->zone_lock);
   2048 	cv_destroy(&zone->zone_cv);
   2049 	rw_destroy(&zone->zone_mlps.mlpl_rwlock);
   2050 	rw_destroy(&zone->zone_mntfs_db_lock);
   2051 	kmem_free(zone, sizeof (zone_t));
   2052 }
   2053 
   2054 /*
   2055  * See block comment at the top of this file for information about zone
   2056  * status values.
   2057  */
   2058 /*
   2059  * Convenience function for setting zone status.
   2060  */
   2061 static void
   2062 zone_status_set(zone_t *zone, zone_status_t status)
   2063 {
   2064 
   2065 	nvlist_t *nvl = NULL;
   2066 	ASSERT(MUTEX_HELD(&zone_status_lock));
   2067 	ASSERT(status > ZONE_MIN_STATE && status <= ZONE_MAX_STATE &&
   2068 	    status >= zone_status_get(zone));
   2069 
   2070 	if (nvlist_alloc(&nvl, NV_UNIQUE_NAME, KM_SLEEP) ||
   2071 	    nvlist_add_string(nvl, ZONE_CB_NAME, zone->zone_name) ||
   2072 	    nvlist_add_string(nvl, ZONE_CB_NEWSTATE,
   2073 	    zone_status_table[status]) ||
   2074 	    nvlist_add_string(nvl, ZONE_CB_OLDSTATE,
   2075 	    zone_status_table[zone->zone_status]) ||
   2076 	    nvlist_add_int32(nvl, ZONE_CB_ZONEID, zone->zone_id) ||
   2077 	    nvlist_add_uint64(nvl, ZONE_CB_TIMESTAMP, (uint64_t)gethrtime()) ||
   2078 	    sysevent_evc_publish(zone_event_chan, ZONE_EVENT_STATUS_CLASS,
   2079 	    ZONE_EVENT_STATUS_SUBCLASS, "sun.com", "kernel", nvl, EVCH_SLEEP)) {
   2080 #ifdef DEBUG
   2081 		(void) printf(
   2082 		    "Failed to allocate and send zone state change event.\n");
   2083 #endif
   2084 	}
   2085 	nvlist_free(nvl);
   2086 
   2087 	zone->zone_status = status;
   2088 
   2089 	cv_broadcast(&zone->zone_cv);
   2090 }
   2091 
   2092 /*
   2093  * Public function to retrieve the zone status.  The zone status may
   2094  * change after it is retrieved.
   2095  */
   2096 zone_status_t
   2097 zone_status_get(zone_t *zone)
   2098 {
   2099 	return (zone->zone_status);
   2100 }
   2101 
   2102 static int
   2103 zone_set_bootargs(zone_t *zone, const char *zone_bootargs)
   2104 {
   2105 	char *bootargs = kmem_zalloc(BOOTARGS_MAX, KM_SLEEP);
   2106 	int err = 0;
   2107 
   2108 	ASSERT(zone != global_zone);
   2109 	if ((err = copyinstr(zone_bootargs, bootargs, BOOTARGS_MAX, NULL)) != 0)
   2110 		goto done;	/* EFAULT or ENAMETOOLONG */
   2111 
   2112 	if (zone->zone_bootargs != NULL)
   2113 		kmem_free(zone->zone_bootargs, strlen(zone->zone_bootargs) + 1);
   2114 
   2115 	zone->zone_bootargs = kmem_alloc(strlen(bootargs) + 1, KM_SLEEP);
   2116 	(void) strcpy(zone->zone_bootargs, bootargs);
   2117 
   2118 done:
   2119 	kmem_free(bootargs, BOOTARGS_MAX);
   2120 	return (err);
   2121 }
   2122 
   2123 static int
   2124 zone_set_brand(zone_t *zone, const char *brand)
   2125 {
   2126 	struct brand_attr *attrp;
   2127 	brand_t *bp;
   2128 
   2129 	attrp = kmem_alloc(sizeof (struct brand_attr), KM_SLEEP);
   2130 	if (copyin(brand, attrp, sizeof (struct brand_attr)) != 0) {
   2131 		kmem_free(attrp, sizeof (struct brand_attr));
   2132 		return (EFAULT);
   2133 	}
   2134 
   2135 	bp = brand_register_zone(attrp);
   2136 	kmem_free(attrp, sizeof (struct brand_attr));
   2137 	if (bp == NULL)
   2138 		return (EINVAL);
   2139 
   2140 	/*
   2141 	 * This is the only place where a zone can change it's brand.
   2142 	 * We already need to hold zone_status_lock to check the zone
   2143 	 * status, so we'll just use that lock to serialize zone
   2144 	 * branding requests as well.
   2145 	 */
   2146 	mutex_enter(&zone_status_lock);
   2147 
   2148 	/* Re-Branding is not allowed and the zone can't be booted yet */
   2149 	if ((ZONE_IS_BRANDED(zone)) ||
   2150 	    (zone_status_get(zone) >= ZONE_IS_BOOTING)) {
   2151 		mutex_exit(&zone_status_lock);
   2152 		brand_unregister_zone(bp);
   2153 		return (EINVAL);
   2154 	}
   2155 
   2156 	/* set up the brand specific data */
   2157 	zone->zone_brand = bp;
   2158 	ZBROP(zone)->b_init_brand_data(zone);
   2159 
   2160 	mutex_exit(&zone_status_lock);
   2161 	return (0);
   2162 }
   2163 
   2164 static int
   2165 zone_set_initname(zone_t *zone, const char *zone_initname)
   2166 {
   2167 	char initname[INITNAME_SZ];
   2168 	size_t len;
   2169 	int err = 0;
   2170 
   2171 	ASSERT(zone != global_zone);
   2172 	if ((err = copyinstr(zone_initname, initname, INITNAME_SZ, &len)) != 0)
   2173 		return (err);	/* EFAULT or ENAMETOOLONG */
   2174 
   2175 	if (zone->zone_initname != NULL)
   2176 		kmem_free(zone->zone_initname, strlen(zone->zone_initname) + 1);
   2177 
   2178 	zone->zone_initname = kmem_alloc(strlen(initname) + 1, KM_SLEEP);
   2179 	(void) strcpy(zone->zone_initname, initname);
   2180 	return (0);
   2181 }
   2182 
   2183 static int
   2184 zone_set_phys_mcap(zone_t *zone, const uint64_t *zone_mcap)
   2185 {
   2186 	uint64_t mcap;
   2187 	int err = 0;
   2188 
   2189 	if ((err = copyin(zone_mcap, &mcap, sizeof (uint64_t))) == 0)
   2190 		zone->zone_phys_mcap = mcap;
   2191 
   2192 	return (err);
   2193 }
   2194 
   2195 static int
   2196 zone_set_sched_class(zone_t *zone, const char *new_class)
   2197 {
   2198 	char sched_class[PC_CLNMSZ];
   2199 	id_t classid;
   2200 	int err;
   2201 
   2202 	ASSERT(zone != global_zone);
   2203 	if ((err = copyinstr(new_class, sched_class, PC_CLNMSZ, NULL)) != 0)
   2204 		return (err);	/* EFAULT or ENAMETOOLONG */
   2205 
   2206 	if (getcid(sched_class, &classid) != 0 || classid == syscid)
   2207 		return (set_errno(EINVAL));
   2208 	zone->zone_defaultcid = classid;
   2209 	ASSERT(zone->zone_defaultcid > 0 &&
   2210 	    zone->zone_defaultcid < loaded_classes);
   2211 
   2212 	return (0);
   2213 }
   2214 
   2215 /*
   2216  * Block indefinitely waiting for (zone_status >= status)
   2217  */
   2218 void
   2219 zone_status_wait(zone_t *zone, zone_status_t status)
   2220 {
   2221 	ASSERT(status > ZONE_MIN_STATE && status <= ZONE_MAX_STATE);
   2222 
   2223 	mutex_enter(&zone_status_lock);
   2224 	while (zone->zone_status < status) {
   2225 		cv_wait(&zone->zone_cv, &zone_status_lock);
   2226 	}
   2227 	mutex_exit(&zone_status_lock);
   2228 }
   2229 
   2230 /*
   2231  * Private CPR-safe version of zone_status_wait().
   2232  */
   2233 static void
   2234 zone_status_wait_cpr(zone_t *zone, zone_status_t status, char *str)
   2235 {
   2236 	callb_cpr_t cprinfo;
   2237 
   2238 	ASSERT(status > ZONE_MIN_STATE && status <= ZONE_MAX_STATE);
   2239 
   2240 	CALLB_CPR_INIT(&cprinfo, &zone_status_lock, callb_generic_cpr,
   2241 	    str);
   2242 	mutex_enter(&zone_status_lock);
   2243 	while (zone->zone_status < status) {
   2244 		CALLB_CPR_SAFE_BEGIN(&cprinfo);
   2245 		cv_wait(&zone->zone_cv, &zone_status_lock);
   2246 		CALLB_CPR_SAFE_END(&cprinfo, &zone_status_lock);
   2247 	}
   2248 	/*
   2249 	 * zone_status_lock is implicitly released by the following.
   2250 	 */
   2251 	CALLB_CPR_EXIT(&cprinfo);
   2252 }
   2253 
   2254 /*
   2255  * Block until zone enters requested state or signal is received.  Return (0)
   2256  * if signaled, non-zero otherwise.
   2257  */
   2258 int
   2259 zone_status_wait_sig(zone_t *zone, zone_status_t status)
   2260 {
   2261 	ASSERT(status > ZONE_MIN_STATE && status <= ZONE_MAX_STATE);
   2262 
   2263 	mutex_enter(&zone_status_lock);
   2264 	while (zone->zone_status < status) {
   2265 		if (!cv_wait_sig(&zone->zone_cv, &zone_status_lock)) {
   2266 			mutex_exit(&zone_status_lock);
   2267 			return (0);
   2268 		}
   2269 	}
   2270 	mutex_exit(&zone_status_lock);
   2271 	return (1);
   2272 }
   2273 
   2274 /*
   2275  * Block until the zone enters the requested state or the timeout expires,
   2276  * whichever happens first.  Return (-1) if operation timed out, time remaining
   2277  * otherwise.
   2278  */
   2279 clock_t
   2280 zone_status_timedwait(zone_t *zone, clock_t tim, zone_status_t status)
   2281 {
   2282 	clock_t timeleft = 0;
   2283 
   2284 	ASSERT(status > ZONE_MIN_STATE && status <= ZONE_MAX_STATE);
   2285 
   2286 	mutex_enter(&zone_status_lock);
   2287 	while (zone->zone_status < status && timeleft != -1) {
   2288 		timeleft = cv_timedwait(&zone->zone_cv, &zone_status_lock, tim);
   2289 	}
   2290 	mutex_exit(&zone_status_lock);
   2291 	return (timeleft);
   2292 }
   2293 
   2294 /*
   2295  * Block until the zone enters the requested state, the current process is
   2296  * signaled,  or the timeout expires, whichever happens first.  Return (-1) if
   2297  * operation timed out, 0 if signaled, time remaining otherwise.
   2298  */
   2299 clock_t
   2300 zone_status_timedwait_sig(zone_t *zone, clock_t tim, zone_status_t status)
   2301 {
   2302 	clock_t timeleft = tim - ddi_get_lbolt();
   2303 
   2304 	ASSERT(status > ZONE_MIN_STATE && status <= ZONE_MAX_STATE);
   2305 
   2306 	mutex_enter(&zone_status_lock);
   2307 	while (zone->zone_status < status) {
   2308 		timeleft = cv_timedwait_sig(&zone->zone_cv, &zone_status_lock,
   2309 		    tim);
   2310 		if (timeleft <= 0)
   2311 			break;
   2312 	}
   2313 	mutex_exit(&zone_status_lock);
   2314 	return (timeleft);
   2315 }
   2316 
   2317 /*
   2318  * Zones have two reference counts: one for references from credential
   2319  * structures (zone_cred_ref), and one (zone_ref) for everything else.
   2320  * This is so we can allow a zone to be rebooted while there are still
   2321  * outstanding cred references, since certain drivers cache dblks (which
   2322  * implicitly results in cached creds).  We wait for zone_ref to drop to
   2323  * 0 (actually 1), but not zone_cred_ref.  The zone structure itself is
   2324  * later freed when the zone_cred_ref drops to 0, though nothing other
   2325  * than the zone id and privilege set should be accessed once the zone
   2326  * is "dead".
   2327  *
   2328  * A debugging flag, zone_wait_for_cred, can be set to a non-zero value
   2329  * to force halt/reboot to block waiting for the zone_cred_ref to drop
   2330  * to 0.  This can be useful to flush out other sources of cached creds
   2331  * that may be less innocuous than the driver case.
   2332  */
   2333 
/*
 * Debugging flag, off by default.  When non-zero, ZONE_IS_UNREF() also
 * requires zone_cred_ref to be 0, and zone_cred_rele() signals
 * zone_destroy_cv once the cred references have drained.
 */
int zone_wait_for_cred = 0;
   2335 
   2336 static void
   2337 zone_hold_locked(zone_t *z)
   2338 {
   2339 	ASSERT(MUTEX_HELD(&z->zone_lock));
   2340 	z->zone_ref++;
   2341 	ASSERT(z->zone_ref != 0);
   2342 }
   2343 
   2344 void
   2345 zone_hold(zone_t *z)
   2346 {
   2347 	mutex_enter(&z->zone_lock);
   2348 	zone_hold_locked(z);
   2349 	mutex_exit(&z->zone_lock);
   2350 }
   2351 
   2352 /*
   2353  * If the non-cred ref count drops to 1 and either the cred ref count
   2354  * is 0 or we aren't waiting for cred references, the zone is ready to
   2355  * be destroyed.
   2356  */
   2357 #define	ZONE_IS_UNREF(zone)	((zone)->zone_ref == 1 && \
   2358 	    (!zone_wait_for_cred || (zone)->zone_cred_ref == 0))
   2359 
   2360 void
   2361 zone_rele(zone_t *z)
   2362 {
   2363 	boolean_t wakeup;
   2364 
   2365 	mutex_enter(&z->zone_lock);
   2366 	ASSERT(z->zone_ref != 0);
   2367 	z->zone_ref--;
   2368 	if (z->zone_ref == 0 && z->zone_cred_ref == 0) {
   2369 		/* no more refs, free the structure */
   2370 		mutex_exit(&z->zone_lock);
   2371 		zone_free(z);
   2372 		return;
   2373 	}
   2374 	/* signal zone_destroy so the zone can finish halting */
   2375 	wakeup = (ZONE_IS_UNREF(z) && zone_status_get(z) >= ZONE_IS_DEAD);
   2376 	mutex_exit(&z->zone_lock);
   2377 
   2378 	if (wakeup) {
   2379 		/*
   2380 		 * Grabbing zonehash_lock here effectively synchronizes with
   2381 		 * zone_destroy() to avoid missed signals.
   2382 		 */
   2383 		mutex_enter(&zonehash_lock);
   2384 		cv_broadcast(&zone_destroy_cv);
   2385 		mutex_exit(&zonehash_lock);
   2386 	}
   2387 }
   2388 
   2389 void
   2390 zone_cred_hold(zone_t *z)
   2391 {
   2392 	mutex_enter(&z->zone_lock);
   2393 	z->zone_cred_ref++;
   2394 	ASSERT(z->zone_cred_ref != 0);
   2395 	mutex_exit(&z->zone_lock);
   2396 }
   2397 
   2398 void
   2399 zone_cred_rele(zone_t *z)
   2400 {
   2401 	boolean_t wakeup;
   2402 
   2403 	mutex_enter(&z->zone_lock);
   2404 	ASSERT(z->zone_cred_ref != 0);
   2405 	z->zone_cred_ref--;
   2406 	if (z->zone_ref == 0 && z->zone_cred_ref == 0) {
   2407 		/* no more refs, free the structure */
   2408 		mutex_exit(&z->zone_lock);
   2409 		zone_free(z);
   2410 		return;
   2411 	}
   2412 	/*
   2413 	 * If zone_destroy is waiting for the cred references to drain
   2414 	 * out, and they have, signal it.
   2415 	 */
   2416 	wakeup = (zone_wait_for_cred && ZONE_IS_UNREF(z) &&
   2417 	    zone_status_get(z) >= ZONE_IS_DEAD);
   2418 	mutex_exit(&z->zone_lock);
   2419 
   2420 	if (wakeup) {
   2421 		/*
   2422 		 * Grabbing zonehash_lock here effectively synchronizes with
   2423 		 * zone_destroy() to avoid missed signals.
   2424 		 */
   2425 		mutex_enter(&zonehash_lock);
   2426 		cv_broadcast(&zone_destroy_cv);
   2427 		mutex_exit(&zonehash_lock);
   2428 	}
   2429 }
   2430 
   2431 void
   2432 zone_task_hold(zone_t *z)
   2433 {
   2434 	mutex_enter(&z->zone_lock);
   2435 	z->zone_ntasks++;
   2436 	ASSERT(z->zone_ntasks != 0);
   2437 	mutex_exit(&z->zone_lock);
   2438 }
   2439 
   2440 void
   2441 zone_task_rele(zone_t *zone)
   2442 {
   2443 	uint_t refcnt;
   2444 
   2445 	mutex_enter(&zone->zone_lock);
   2446 	ASSERT(zone->zone_ntasks != 0);
   2447 	refcnt = --zone->zone_ntasks;
   2448 	if (refcnt > 1)	{	/* Common case */
   2449 		mutex_exit(&zone->zone_lock);
   2450 		return;
   2451 	}
   2452 	zone_hold_locked(zone);	/* so we can use the zone_t later */
   2453 	mutex_exit(&zone->zone_lock);
   2454 	if (refcnt == 1) {
   2455 		/*
   2456 		 * See if the zone is shutting down.
   2457 		 */
   2458 		mutex_enter(&zone_status_lock);
   2459 		if (zone_status_get(zone) != ZONE_IS_SHUTTING_DOWN) {
   2460 			goto out;
   2461 		}
   2462 
   2463 		/*
   2464 		 * Make sure the ntasks didn't change since we
   2465 		 * dropped zone_lock.
   2466 		 */
   2467 		mutex_enter(&zone->zone_lock);
   2468 		if (refcnt != zone->zone_ntasks) {
   2469 			mutex_exit(&zone->zone_lock);
   2470 			goto out;
   2471 		}
   2472 		mutex_exit(&zone->zone_lock);
   2473 
   2474 		/*
   2475 		 * No more user processes in the zone.  The zone is empty.
   2476 		 */
   2477 		zone_status_set(zone, ZONE_IS_EMPTY);
   2478 		goto out;
   2479 	}
   2480 
   2481 	ASSERT(refcnt == 0);
   2482 	/*
   2483 	 * zsched has exited; the zone is dead.
   2484 	 */
   2485 	zone->zone_zsched = NULL;		/* paranoia */
   2486 	mutex_enter(&zone_status_lock);
   2487 	zone_status_set(zone, ZONE_IS_DEAD);
   2488 out:
   2489 	mutex_exit(&zone_status_lock);
   2490 	zone_rele(zone);
   2491 }
   2492 
   2493 zoneid_t
   2494 getzoneid(void)
   2495 {
   2496 	return (curproc->p_zone->zone_id);
   2497 }
   2498 
   2499 /*
   2500  * Internal versions of zone_find_by_*().  These don't zone_hold() or
   2501  * check the validity of a zone's state.
   2502  */
   2503 static zone_t *
   2504 zone_find_all_by_id(zoneid_t zoneid)
   2505 {
   2506 	mod_hash_val_t hv;
   2507 	zone_t *zone = NULL;
   2508 
   2509 	ASSERT(MUTEX_HELD(&zonehash_lock));
   2510 
   2511 	if (mod_hash_find(zonehashbyid,
   2512 	    (mod_hash_key_t)(uintptr_t)zoneid, &hv) == 0)
   2513 		zone = (zone_t *)hv;
   2514 	return (zone);
   2515 }
   2516 
   2517 static zone_t *
   2518 zone_find_all_by_label(const ts_label_t *label)
   2519 {
   2520 	mod_hash_val_t hv;
   2521 	zone_t *zone = NULL;
   2522 
   2523 	ASSERT(MUTEX_HELD(&zonehash_lock));
   2524 
   2525 	/*
   2526 	 * zonehashbylabel is not maintained for unlabeled systems
   2527 	 */
   2528 	if (!is_system_labeled())
   2529 		return (NULL);
   2530 	if (mod_hash_find(zonehashbylabel, (mod_hash_key_t)label, &hv) == 0)
   2531 		zone = (zone_t *)hv;
   2532 	return (zone);
   2533 }
   2534 
   2535 static zone_t *
   2536 zone_find_all_by_name(char *name)
   2537 {
   2538 	mod_hash_val_t hv;
   2539 	zone_t *zone = NULL;
   2540 
   2541 	ASSERT(MUTEX_HELD(&zonehash_lock));
   2542 
   2543 	if (mod_hash_find(zonehashbyname, (mod_hash_key_t)name, &hv) == 0)
   2544 		zone = (zone_t *)hv;
   2545 	return (zone);
   2546 }
   2547 
   2548 /*
   2549  * Public interface for looking up a zone by zoneid.  Only returns the zone if
   2550  * it is fully initialized, and has not yet begun the zone_destroy() sequence.
   2551  * Caller must call zone_rele() once it is done with the zone.
   2552  *
   2553  * The zone may begin the zone_destroy() sequence immediately after this
   2554  * function returns, but may be safely used until zone_rele() is called.
   2555  */
   2556 zone_t *
   2557 zone_find_by_id(zoneid_t zoneid)
   2558 {
   2559 	zone_t *zone;
   2560 	zone_status_t status;
   2561 
   2562 	mutex_enter(&zonehash_lock);
   2563 	if ((zone = zone_find_all_by_id(zoneid)) == NULL) {
   2564 		mutex_exit(&zonehash_lock);
   2565 		return (NULL);
   2566 	}
   2567 	status = zone_status_get(zone);
   2568 	if (status < ZONE_IS_READY || status > ZONE_IS_DOWN) {
   2569 		/*
   2570 		 * For all practical purposes the zone doesn't exist.
   2571 		 */
   2572 		mutex_exit(&zonehash_lock);
   2573 		return (NULL);
   2574 	}
   2575 	zone_hold(zone);
   2576 	mutex_exit(&zonehash_lock);
   2577 	return (zone);
   2578 }
   2579 
   2580 /*
   2581  * Similar to zone_find_by_id, but using zone label as the key.
   2582  */
   2583 zone_t *
   2584 zone_find_by_label(const ts_label_t *label)
   2585 {
   2586 	zone_t *zone;
   2587 	zone_status_t status;
   2588 
   2589 	mutex_enter(&zonehash_lock);
   2590 	if ((zone = zone_find_all_by_label(label)) == NULL) {
   2591 		mutex_exit(&zonehash_lock);
   2592 		return (NULL);
   2593 	}
   2594 
   2595 	status = zone_status_get(zone);
   2596 	if (status > ZONE_IS_DOWN) {
   2597 		/*
   2598 		 * For all practical purposes the zone doesn't exist.
   2599 		 */
   2600 		mutex_exit(&zonehash_lock);
   2601 		return (NULL);
   2602 	}
   2603 	zone_hold(zone);
   2604 	mutex_exit(&zonehash_lock);
   2605 	return (zone);
   2606 }
   2607 
   2608 /*
   2609  * Similar to zone_find_by_id, but using zone name as the key.
   2610  */
   2611 zone_t *
   2612 zone_find_by_name(char *name)
   2613 {
   2614 	zone_t *zone;
   2615 	zone_status_t status;
   2616 
   2617 	mutex_enter(&zonehash_lock);
   2618 	if ((zone = zone_find_all_by_name(name)) == NULL) {
   2619 		mutex_exit(&zonehash_lock);
   2620 		return (NULL);
   2621 	}
   2622 	status = zone_status_get(zone);
   2623 	if (status < ZONE_IS_READY || status > ZONE_IS_DOWN) {
   2624 		/*
   2625 		 * For all practical purposes the zone doesn't exist.
   2626 		 */
   2627 		mutex_exit(&zonehash_lock);
   2628 		return (NULL);
   2629 	}
   2630 	zone_hold(zone);
   2631 	mutex_exit(&zonehash_lock);
   2632 	return (zone);
   2633 }
   2634 
   2635 /*
   2636  * Similar to zone_find_by_id(), using the path as a key.  For instance,
   2637  * if there is a zone "foo" rooted at /foo/root, and the path argument
   2638  * is "/foo/root/proc", it will return the held zone_t corresponding to
   2639  * zone "foo".
   2640  *
   2641  * zone_find_by_path() always returns a non-NULL value, since at the
   2642  * very least every path will be contained in the global zone.
   2643  *
   2644  * As with the other zone_find_by_*() functions, the caller is
   2645  * responsible for zone_rele()ing the return value of this function.
   2646  */
   2647 zone_t *
   2648 zone_find_by_path(const char *path)
   2649 {
   2650 	zone_t *zone;
   2651 	zone_t *zret = NULL;
   2652 	zone_status_t status;
   2653 
   2654 	if (path == NULL) {
   2655 		/*
   2656 		 * Call from rootconf().
   2657 		 */
   2658 		zone_hold(global_zone);
   2659 		return (global_zone);
   2660 	}
   2661 	ASSERT(*path == '/');
   2662 	mutex_enter(&zonehash_lock);
   2663 	for (zone = list_head(&zone_active); zone != NULL;
   2664 	    zone = list_next(&zone_active, zone)) {
   2665 		if (ZONE_PATH_VISIBLE(path, zone))
   2666 			zret = zone;
   2667 	}
   2668 	ASSERT(zret != NULL);
   2669 	status = zone_status_get(zret);
   2670 	if (status < ZONE_IS_READY || status > ZONE_IS_DOWN) {
   2671 		/*
   2672 		 * Zone practically doesn't exist.
   2673 		 */
   2674 		zret = global_zone;
   2675 	}
   2676 	zone_hold(zret);
   2677 	mutex_exit(&zonehash_lock);
   2678 	return (zret);
   2679 }
   2680 
   2681 /*
   2682  * Get the number of cpus visible to this zone.  The system-wide global
   2683  * 'ncpus' is returned if pools are disabled, the caller is in the
   2684  * global zone, or a NULL zone argument is passed in.
   2685  */
   2686 int
   2687 zone_ncpus_get(zone_t *zone)
   2688 {
   2689 	int myncpus = zone == NULL ? 0 : zone->zone_ncpus;
   2690 
   2691 	return (myncpus != 0 ? myncpus : ncpus);
   2692 }
   2693 
   2694 /*
   2695  * Get the number of online cpus visible to this zone.  The system-wide
   2696  * global 'ncpus_online' is returned if pools are disabled, the caller
   2697  * is in the global zone, or a NULL zone argument is passed in.
   2698  */
   2699 int
   2700 zone_ncpus_online_get(zone_t *zone)
   2701 {
   2702 	int myncpus_online = zone == NULL ? 0 : zone->zone_ncpus_online;
   2703 
   2704 	return (myncpus_online != 0 ? myncpus_online : ncpus_online);
   2705 }
   2706 
   2707 /*
   2708  * Return the pool to which the zone is currently bound.
   2709  */
   2710 pool_t *
   2711 zone_pool_get(zone_t *zone)
   2712 {
   2713 	ASSERT(pool_lock_held());
   2714 
   2715 	return (zone->zone_pool);
   2716 }
   2717 
   2718 /*
   2719  * Set the zone's pool pointer and update the zone's visibility to match
   2720  * the resources in the new pool.
   2721  */
   2722 void
   2723 zone_pool_set(zone_t *zone, pool_t *pool)
   2724 {
   2725 	ASSERT(pool_lock_held());
   2726 	ASSERT(MUTEX_HELD(&cpu_lock));
   2727 
   2728 	zone->zone_pool = pool;
   2729 	zone_pset_set(zone, pool->pool_pset->pset_id);
   2730 }
   2731 
   2732 /*
   2733  * Return the cached value of the id of the processor set to which the
   2734  * zone is currently bound.  The value will be ZONE_PS_INVAL if the pools
   2735  * facility is disabled.
   2736  */
   2737 psetid_t
   2738 zone_pset_get(zone_t *zone)
   2739 {
   2740 	ASSERT(MUTEX_HELD(&cpu_lock));
   2741 
   2742 	return (zone->zone_psetid);
   2743 }
   2744 
   2745 /*
   2746  * Set the cached value of the id of the processor set to which the zone
   2747  * is currently bound.  Also update the zone's visibility to match the
   2748  * resources in the new processor set.
   2749  */
   2750 void
   2751 zone_pset_set(zone_t *zone, psetid_t newpsetid)
   2752 {
   2753 	psetid_t oldpsetid;
   2754 
   2755 	ASSERT(MUTEX_HELD(&cpu_lock));
   2756 	oldpsetid = zone_pset_get(zone);
   2757 
   2758 	if (oldpsetid == newpsetid)
   2759 		return;
   2760 	/*
   2761 	 * Global zone sees all.
   2762 	 */
   2763 	if (zone != global_zone) {
   2764 		zone->zone_psetid = newpsetid;
   2765 		if (newpsetid != ZONE_PS_INVAL)
   2766 			pool_pset_visibility_add(newpsetid, zone);
   2767 		if (oldpsetid != ZONE_PS_INVAL)
   2768 			pool_pset_visibility_remove(oldpsetid, zone);
   2769 	}
   2770 	/*
   2771 	 * Disabling pools, so we should start using the global values
   2772 	 * for ncpus and ncpus_online.
   2773 	 */
   2774 	if (newpsetid == ZONE_PS_INVAL) {
   2775 		zone->zone_ncpus = 0;
   2776 		zone->zone_ncpus_online = 0;
   2777 	}
   2778 }
   2779 
   2780 /*
   2781  * Walk the list of active zones and issue the provided callback for
   2782  * each of them.
   2783  *
   2784  * Caller must not be holding any locks that may be acquired under
   2785  * zonehash_lock.  See comment at the beginning of the file for a list of
   2786  * common locks and their interactions with zones.
   2787  */
   2788 int
   2789 zone_walk(int (*cb)(zone_t *, void *), void *data)
   2790 {
   2791 	zone_t *zone;
   2792 	int ret = 0;
   2793 	zone_status_t status;
   2794 
   2795 	mutex_enter(&zonehash_lock);
   2796 	for (zone = list_head(&zone_active); zone != NULL;
   2797 	    zone = list_next(&zone_active, zone)) {
   2798 		/*
   2799 		 * Skip zones that shouldn't be externally visible.
   2800 		 */
   2801 		status = zone_status_get(zone);
   2802 		if (status < ZONE_IS_READY || status > ZONE_IS_DOWN)
   2803 			continue;
   2804 		/*
   2805 		 * Bail immediately if any callback invocation returns a
   2806 		 * non-zero value.
   2807 		 */
   2808 		ret = (*cb)(zone, data);
   2809 		if (ret != 0)
   2810 			break;
   2811 	}
   2812 	mutex_exit(&zonehash_lock);
   2813 	return (ret);
   2814 }
   2815 
   2816 static int
   2817 zone_set_root(zone_t *zone, const char *upath)
   2818 {
   2819 	vnode_t *vp;
   2820 	int trycount;
   2821 	int error = 0;
   2822 	char *path;
   2823 	struct pathname upn, pn;
   2824 	size_t pathlen;
   2825 
   2826 	if ((error = pn_get((char *)upath, UIO_USERSPACE, &upn)) != 0)
   2827 		return (error);
   2828 
   2829 	pn_alloc(&pn);
   2830 
   2831 	/* prevent infinite loop */
   2832 	trycount = 10;
   2833 	for (;;) {
   2834 		if (--trycount <= 0) {
   2835 			error = ESTALE;
   2836 			goto out;
   2837 		}
   2838 
   2839 		if ((error = lookuppn(&upn, &pn, FOLLOW, NULLVPP, &vp)) == 0) {
   2840 			/*
   2841 			 * VOP_ACCESS() may cover 'vp' with a new
   2842 			 * filesystem, if 'vp' is an autoFS vnode.
   2843 			 * Get the new 'vp' if so.
   2844 			 */
   2845 			if ((error =
   2846 			    VOP_ACCESS(vp, VEXEC, 0, CRED(), NULL)) == 0 &&
   2847 			    (!vn_ismntpt(vp) ||
   2848 			    (error = traverse(&vp)) == 0)) {
   2849 				pathlen = pn.pn_pathlen + 2;
   2850 				path = kmem_alloc(pathlen, KM_SLEEP);
   2851 				(void) strncpy(path, pn.pn_path,
   2852 				    pn.pn_pathlen + 1);
   2853 				path[pathlen - 2] = '/';
   2854 				path[pathlen - 1] = '\0';
   2855 				pn_free(&pn);
   2856 				pn_free(&upn);
   2857 
   2858 				/* Success! */
   2859 				break;
   2860 			}
   2861 			VN_RELE(vp);
   2862 		}
   2863 		if (error != ESTALE)
   2864 			goto out;
   2865 	}
   2866 
   2867 	ASSERT(error == 0);
   2868 	zone->zone_rootvp = vp;		/* we hold a reference to vp */
   2869 	zone->zone_rootpath = path;
   2870 	zone->zone_rootpathlen = pathlen;
   2871 	if (pathlen > 5 && strcmp(path + pathlen - 5, "/lu/") == 0)
   2872 		zone->zone_flags |= ZF_IS_SCRATCH;
   2873 	return (0);
   2874 
   2875 out:
   2876 	pn_free(&pn);
   2877 	pn_free(&upn);
   2878 	return (error);
   2879 }
   2880 
   2881 #define	isalnum(c)	(((c) >= '0' && (c) <= '9') || \
   2882 			((c) >= 'a' && (c) <= 'z') || \
   2883 			((c) >= 'A' && (c) <= 'Z'))
   2884 
   2885 static int
   2886 zone_set_name(zone_t *zone, const char *uname)
   2887 {
   2888 	char *kname = kmem_zalloc(ZONENAME_MAX, KM_SLEEP);
   2889 	size_t len;
   2890 	int i, err;
   2891 
   2892 	if ((err = copyinstr(uname, kname, ZONENAME_MAX, &len)) != 0) {
   2893 		kmem_free(kname, ZONENAME_MAX);
   2894 		return (err);	/* EFAULT or ENAMETOOLONG */
   2895 	}
   2896 
   2897 	/* must be less than ZONENAME_MAX */
   2898 	if (len == ZONENAME_MAX && kname[ZONENAME_MAX - 1] != '\0') {
   2899 		kmem_free(kname, ZONENAME_MAX);
   2900 		return (EINVAL);
   2901 	}
   2902 
   2903 	/*
   2904 	 * Name must start with an alphanumeric and must contain only
   2905 	 * alphanumerics, '-', '_' and '.'.
   2906 	 */
   2907 	if (!isalnum(kname[0])) {
   2908 		kmem_free(kname, ZONENAME_MAX);
   2909 		return (EINVAL);
   2910 	}
   2911 	for (i = 1; i < len - 1; i++) {
   2912 		if (!isalnum(kname[i]) && kname[i] != '-' && kname[i] != '_' &&
   2913 		    kname[i] != '.') {
   2914 			kmem_free(kname, ZONENAME_MAX);
   2915 			return (EINVAL);
   2916 		}
   2917 	}
   2918 
   2919 	zone->zone_name = kname;
   2920 	return (0);
   2921 }
   2922 
   2923 /*
   2924  * Gets the 32-bit hostid of the specified zone as an unsigned int.  If 'zonep'
   2925  * is NULL or it points to a zone with no hostid emulation, then the machine's
   2926  * hostid (i.e., the global zone's hostid) is returned.  This function returns
   2927  * zero if neither the zone nor the host machine (global zone) have hostids.  It
   2928  * returns HW_INVALID_HOSTID if the function attempts to return the machine's
   2929  * hostid and the machine's hostid is invalid.
   2930  */
   2931 uint32_t
   2932 zone_get_hostid(zone_t *zonep)
   2933 {
   2934 	unsigned long machine_hostid;
   2935 
   2936 	if (zonep == NULL || zonep->zone_hostid == HW_INVALID_HOSTID) {
   2937 		if (ddi_strtoul(hw_serial, NULL, 10, &machine_hostid) != 0)
   2938 			return (HW_INVALID_HOSTID);
   2939 		return ((uint32_t)machine_hostid);
   2940 	}
   2941 	return (zonep->zone_hostid);
   2942 }
   2943 
   2944 /*
   2945  * Similar to thread_create(), but makes sure the thread is in the appropriate
   2946  * zone's zsched process (curproc->p_zone->zone_zsched) before returning.
   2947  */
   2948 /*ARGSUSED*/
   2949 kthread_t *
   2950 zthread_create(
   2951     caddr_t stk,
   2952     size_t stksize,
   2953     void (*proc)(),
   2954     void *arg,
   2955     size_t len,
   2956     pri_t pri)
   2957 {
   2958 	kthread_t *t;
   2959 	zone_t *zone = curproc->p_zone;
   2960 	proc_t *pp = zone->zone_zsched;
   2961 
   2962 	zone_hold(zone);	/* Reference to be dropped when thread exits */
   2963 
   2964 	/*
   2965 	 * No-one should be trying to create threads if the zone is shutting
   2966 	 * down and there aren't any kernel threads around.  See comment
   2967 	 * in zthread_exit().
   2968 	 */
   2969 	ASSERT(!(zone->zone_kthreads == NULL &&
   2970 	    zone_status_get(zone) >= ZONE_IS_EMPTY));
   2971 	/*
   2972 	 * Create a thread, but don't let it run until we've finished setting
   2973 	 * things up.
   2974 	 */
   2975 	t = thread_create(stk, stksize, proc, arg, len, pp, TS_STOPPED, pri);
   2976 	ASSERT(t->t_forw == NULL);
   2977 	mutex_enter(&zone_status_lock);
   2978 	if (zone->zone_kthreads == NULL) {
   2979 		t->t_forw = t->t_back = t;
   2980 	} else {
   2981 		kthread_t *tx = zone->zone_kthreads;
   2982 
   2983 		t->t_forw = tx;
   2984 		t->t_back = tx->t_back;
   2985 		tx->t_back->t_forw = t;
   2986 		tx->t_back = t;
   2987 	}
   2988 	zone->zone_kthreads = t;
   2989 	mutex_exit(&zone_status_lock);
   2990 
   2991 	mutex_enter(&pp->p_lock);
   2992 	t->t_proc_flag |= TP_ZTHREAD;
   2993 	project_rele(t->t_proj);
   2994 	t->t_proj = project_hold(pp->p_task->tk_proj);
   2995 
   2996 	/*
   2997 	 * Setup complete, let it run.
   2998 	 */
   2999 	thread_lock(t);
   3000 	t->t_schedflag |= TS_ALLSTART;
   3001 	setrun_locked(t);
   3002 	thread_unlock(t);
   3003 
   3004 	mutex_exit(&pp->p_lock);
   3005 
   3006 	return (t);
   3007 }
   3008 
/*
 * Similar to thread_exit().  Must be called by threads created via
 * zthread_create().
 *
 * Under zone_status_lock the calling thread is reparented to p0 and
 * unlinked from its zone's circular zone_kthreads list.  If it was the
 * last thread on the list and the zone is ZONE_IS_EMPTY, the zone is moved
 * to ZONE_IS_DOWN and its CPU caps are removed.  Finally the zone hold
 * taken in zthread_create() is released and the thread exits; this
 * function does not return.
 */
void
zthread_exit(void)
{
	kthread_t *t = curthread;
	proc_t *pp = curproc;
	zone_t *zone = pp->p_zone;

	mutex_enter(&zone_status_lock);

	/*
	 * Reparent to p0
	 */
	kpreempt_disable();
	mutex_enter(&pp->p_lock);
	t->t_proc_flag &= ~TP_ZTHREAD;
	t->t_procp = &p0;
	hat_thread_exit(t);
	mutex_exit(&pp->p_lock);
	kpreempt_enable();

	/* A thread linked only to itself is the last one on the list. */
	if (t->t_back == t) {
		ASSERT(t->t_forw == t);
		/*
		 * If the zone is empty, once the thread count
		 * goes to zero no further kernel threads can be
		 * created.  This is because if the creator is a process
		 * in the zone, then it must have exited before the zone
		 * state could be set to ZONE_IS_EMPTY.
		 * Otherwise, if the creator is a kernel thread in the
		 * zone, the thread count is non-zero.
		 *
		 * This really means that non-zone kernel threads should
		 * not create zone kernel threads.
		 */
		zone->zone_kthreads = NULL;
		if (zone_status_get(zone) == ZONE_IS_EMPTY) {
			zone_status_set(zone, ZONE_IS_DOWN);
			/*
			 * Remove any CPU caps on this zone.
			 */
			cpucaps_zone_remove(zone);
		}
	} else {
		/* Unlink; move the list head along if it pointed at us. */
		t->t_forw->t_back = t->t_back;
		t->t_back->t_forw = t->t_forw;
		if (zone->zone_kthreads == t)
			zone->zone_kthreads = t->t_forw;
	}
	mutex_exit(&zone_status_lock);
	zone_rele(zone);
	thread_exit();
	/* NOTREACHED */
}
   3066 
   3067 static void
   3068 zone_chdir(vnode_t *vp, vnode_t **vpp, proc_t *pp)
   3069 {
   3070 	vnode_t *oldvp;
   3071 
   3072 	/* we're going to hold a reference here to the directory */
   3073 	VN_HOLD(vp);
   3074 
   3075 	if (audit_active)	/* update abs cwd/root path see c2audit.c */
   3076 		audit_chdirec(vp, vpp);
   3077 
   3078 	mutex_enter(&pp->p_lock);
   3079 	oldvp = *vpp;
   3080 	*vpp = vp;
   3081 	mutex_exit(&pp->p_lock);
   3082 	if (oldvp != NULL)
   3083 		VN_RELE(oldvp);
   3084 }
   3085 
   3086 /*
   3087  * Convert an rctl value represented by an nvlist_t into an rctl_val_t.
   3088  */
   3089 static int
   3090 nvlist2rctlval(nvlist_t *nvl, rctl_val_t *rv)
   3091 {
   3092 	nvpair_t *nvp = NULL;
   3093 	boolean_t priv_set = B_FALSE;
   3094 	boolean_t limit_set = B_FALSE;
   3095 	boolean_t action_set = B_FALSE;
   3096 
   3097 	while ((nvp = nvlist_next_nvpair(nvl, nvp)) != NULL) {
   3098 		const char *name;
   3099 		uint64_t ui64;
   3100 
   3101 		name = nvpair_name(nvp);
   3102 		if (nvpair_type(nvp) != DATA_TYPE_UINT64)
   3103 			return (EINVAL);
   3104 		(void) nvpair_value_uint64(nvp, &ui64);
   3105 		if (strcmp(name, "privilege") == 0) {
   3106 			/*
   3107 			 * Currently only privileged values are allowed, but
   3108 			 * this may change in the future.
   3109 			 */
   3110 			if (ui64 != RCPRIV_PRIVILEGED)
   3111 				return (EINVAL);
   3112 			rv->rcv_privilege = ui64;
   3113 			priv_set = B_TRUE;
   3114 		} else if (strcmp(name, "limit") == 0) {
   3115 			rv->rcv_value = ui64;
   3116 			limit_set = B_TRUE;
   3117 		} else if (strcmp(name, "action") == 0) {
   3118 			if (ui64 != RCTL_LOCAL_NOACTION &&
   3119 			    ui64 != RCTL_LOCAL_DENY)
   3120 				return (EINVAL);
   3121 			rv->rcv_flagaction = ui64;
   3122 			action_set = B_TRUE;
   3123 		} else {
   3124 			return (EINVAL);
   3125 		}
   3126 	}
   3127 
   3128 	if (!(priv_set && limit_set && action_set))
   3129 		return (EINVAL);
   3130 	rv->rcv_action_signal = 0;
   3131 	rv->rcv_action_recipient = NULL;
   3132 	rv->rcv_action_recip_pid = -1;
   3133 	rv->rcv_firing_time = 0;
   3134 
   3135 	return (0);
   3136 }
   3137 
   3138 /*
   3139  * Non-global zone version of start_init.
   3140  */
   3141 void
   3142 zone_start_init(void)
   3143 {
   3144 	proc_t *p = ttoproc(curthread);
   3145 	zone_t *z = p->p_zone;
   3146 
   3147 	ASSERT(!INGLOBALZONE(curproc));
   3148 
   3149 	/*
   3150 	 * For all purposes (ZONE_ATTR_INITPID and restart_init),
   3151 	 * storing just the pid of init is sufficient.
   3152 	 */
   3153 	z->zone_proc_initpid = p->p_pid;
   3154 
   3155 	/*
   3156 	 * We maintain zone_boot_err so that we can return the cause of the
   3157 	 * failure back to the caller of the zone_boot syscall.
   3158 	 */
   3159 	p->p_zone->zone_boot_err = start_init_common();
   3160 
   3161 	/*
   3162 	 * We will prevent booting zones from becoming running zones if the
   3163 	 * global zone is shutting down.
   3164 	 */
   3165 	mutex_enter(&zone_status_lock);
   3166 	if (z->zone_boot_err != 0 || zone_status_get(global_zone) >=
   3167 	    ZONE_IS_SHUTTING_DOWN) {
   3168 		/*
   3169 		 * Make sure we are still in the booting state-- we could have
   3170 		 * raced and already be shutting down, or even further along.
   3171 		 */
   3172 		if (zone_status_get(z) == ZONE_IS_BOOTING) {
   3173 			zone_status_set(z, ZONE_IS_SHUTTING_DOWN);
   3174 		}
   3175 		mutex_exit(&zone_status_lock);
   3176 		/* It's gone bad, dispose of the process */
   3177 		if (proc_exit(CLD_EXITED, z->zone_boot_err) != 0) {
   3178 			mutex_enter(&p->p_lock);
   3179 			ASSERT(p->p_flag & SEXITLWPS);
   3180 			lwp_exit();
   3181 		}
   3182 	} else {
   3183 		if (zone_status_get(z) == ZONE_IS_BOOTING)
   3184 			zone_status_set(z, ZONE_IS_RUNNING);
   3185 		mutex_exit(&zone_status_lock);
   3186 		/* cause the process to return to userland. */
   3187 		lwp_rtt();
   3188 	}
   3189 }
   3190 
/*
 * Argument bundle that zsched() receives through its void * parameter.
 */
struct zsched_arg {
	zone_t *zone;		/* zone that the zsched process adopts */
	nvlist_t *nvlist;	/* rctls that zsched() applies to the zone */
};
   3195 
/*
 * Per-zone "sched" workalike.  The similarity to "sched" doesn't have
 * anything to do with scheduling, but rather with the fact that
 * per-zone kernel threads are parented to zsched, just like regular
 * kernel threads are parented to sched (p0).
 *
 * zsched is also responsible for launching init for the zone.
 *
 * 'arg' points to a struct zsched_arg naming the zone to adopt and the
 * nvlist of rctls to apply.  In order, the function: moves the calling
 * process into the zone (parentage, upcounts, lwp and locked-memory
 * accounting, task, credentials, cwd/root), builds the zone's rctl set,
 * walks the zone through ZONE_IS_INITIALIZED and ZONE_IS_READY, starts
 * init once the zone reaches ZONE_IS_BOOTING, and then sleeps until the
 * zone is ZONE_IS_DYING before exiting.
 */
static void
zsched(void *arg)
{
	struct zsched_arg *za = arg;
	proc_t *pp = curproc;
	proc_t *initp = proc_init;
	zone_t *zone = za->zone;
	cred_t *cr, *oldcred;
	rctl_set_t *set;
	rctl_alloc_gp_t *gp;
	contract_t *ct = NULL;
	task_t *tk, *oldtk;
	rctl_entity_p_t e;
	kproject_t *pj;

	nvlist_t *nvl = za->nvlist;
	nvpair_t *nvp = NULL;

	/* Present ourselves as "zsched", with no arguments and no open files. */
	bcopy("zsched", PTOU(pp)->u_psargs, sizeof ("zsched"));
	bcopy("zsched", PTOU(pp)->u_comm, sizeof ("zsched"));
	PTOU(pp)->u_argc = 0;
	PTOU(pp)->u_argv = NULL;
	PTOU(pp)->u_envp = NULL;
	closeall(P_FINFO(pp));

	/*
	 * We are this zone's "zsched" process.  As the zone isn't generally
	 * visible yet we don't need to grab any locks before initializing its
	 * zone_zsched pointer.
	 */
	zone_hold(zone);  /* this hold is released by zone_destroy() */
	zone->zone_zsched = pp;
	mutex_enter(&pp->p_lock);
	pp->p_zone = zone;
	mutex_exit(&pp->p_lock);

	/*
	 * Disassociate process from its 'parent'; parent ourselves to init
	 * (pid 1) and change other values as needed.
	 */
	sess_create();

	mutex_enter(&pidlock);
	proc_detach(pp);
	pp->p_ppid = 1;
	pp->p_flag |= SZONETOP;
	pp->p_ancpid = 1;
	pp->p_parent = initp;
	/* Insert ourselves at the head of init's child list. */
	pp->p_psibling = NULL;
	if (initp->p_child)
		initp->p_child->p_psibling = pp;
	pp->p_sibling = initp->p_child;
	initp->p_child = pp;

	/* Decrement what newproc() incremented. */
	upcount_dec(crgetruid(CRED()), GLOBAL_ZONEID);
	/*
	 * Our credentials are about to become kcred-like, so we don't care
	 * about the caller's ruid.
	 */
	upcount_inc(crgetruid(kcred), zone->zone_id);
	mutex_exit(&pidlock);

	/*
	 * getting out of global zone, so decrement lwp counts
	 */
	pj = pp->p_task->tk_proj;
	mutex_enter(&global_zone->zone_nlwps_lock);
	pj->kpj_nlwps -= pp->p_lwpcnt;
	global_zone->zone_nlwps -= pp->p_lwpcnt;
	mutex_exit(&global_zone->zone_nlwps_lock);

	/*
	 * Decrement locked memory counts on old zone and project.
	 */
	mutex_enter(&global_zone->zone_mem_lock);
	global_zone->zone_locked_mem -= pp->p_locked_mem;
	pj->kpj_data.kpd_locked_mem -= pp->p_locked_mem;
	mutex_exit(&global_zone->zone_mem_lock);

	/*
	 * Create and join a new task in project '0' of this zone.
	 *
	 * We don't need to call holdlwps() since we know we're the only lwp in
	 * this process.
	 *
	 * task_join() returns with p_lock held.
	 */
	tk = task_create(0, zone);
	mutex_enter(&cpu_lock);
	oldtk = task_join(tk, 0);

	/* Re-read the project: we now belong to the new task's project. */
	pj = pp->p_task->tk_proj;

	mutex_enter(&zone->zone_mem_lock);
	zone->zone_locked_mem += pp->p_locked_mem;
	pj->kpj_data.kpd_locked_mem += pp->p_locked_mem;
	mutex_exit(&zone->zone_mem_lock);

	/*
	 * add lwp counts to zsched's zone, and increment project's task count
	 * due to the task created by the task_create() call above
	 */

	mutex_enter(&zone->zone_nlwps_lock);
	pj->kpj_nlwps += pp->p_lwpcnt;
	pj->kpj_ntasks += 1;
	zone->zone_nlwps += pp->p_lwpcnt;
	mutex_exit(&zone->zone_nlwps_lock);

	/* Drop the p_lock that task_join() returned with, then cpu_lock. */
	mutex_exit(&curproc->p_lock);
	mutex_exit(&cpu_lock);
	task_rele(oldtk);

	/*
	 * The process was created by a process in the global zone, hence the
	 * credentials are wrong.  We might as well have kcred-ish credentials.
	 */
	cr = zone->zone_kcred;
	crhold(cr);
	mutex_enter(&pp->p_crlock);
	oldcred = pp->p_cred;
	pp->p_cred = cr;
	mutex_exit(&pp->p_crlock);
	crfree(oldcred);

	/*
	 * Hold credentials again (for thread)
	 */
	crhold(cr);

	/*
	 * p_lwpcnt can't change since this is a kernel process.
	 */
	crset(pp, cr);

	/*
	 * Chroot: both the current and the root directory become the zone's
	 * root vnode.
	 */
	zone_chdir(zone->zone_rootvp, &PTOU(pp)->u_cdir, pp);
	zone_chdir(zone->zone_rootvp, &PTOU(pp)->u_rdir, pp);

	/*
	 * Initialize zone's rctl set.
	 */
	set = rctl_set_create();
	gp = rctl_set_init_prealloc(RCENTITY_ZONE);
	mutex_enter(&pp->p_lock);
	e.rcep_p.zone = zone;
	e.rcep_t = RCENTITY_ZONE;
	zone->zone_rctls = rctl_set_init(RCENTITY_ZONE, pp, &e, set, gp);
	mutex_exit(&pp->p_lock);
	rctl_prealloc_destroy(gp);

	/*
	 * Apply the rctls passed in to zone_create().  This is basically a list
	 * assignment: all of the old values are removed and the new ones
	 * inserted.  That is, if an empty list is passed in, all values are
	 * removed.
	 */
	while ((nvp = nvlist_next_nvpair(nvl, nvp)) != NULL) {
		rctl_dict_entry_t *rde;
		rctl_hndl_t hndl;
		char *name;
		nvlist_t **nvlarray;
		uint_t i, nelem;
		int error;	/* For ASSERT()s */

		name = nvpair_name(nvp);
		hndl = rctl_hndl_lookup(name);
		ASSERT(hndl != -1);
		rde = rctl_dict_lookup_hndl(hndl);
		ASSERT(rde != NULL);

		/*
		 * Delete values from the front of the list until the first
		 * one is the RCPRIV_SYSTEM value.
		 */
		for (; /* ever */; ) {
			rctl_val_t oval;

			mutex_enter(&pp->p_lock);
			error = rctl_local_get(hndl, NULL, &oval, pp);
			mutex_exit(&pp->p_lock);
			ASSERT(error == 0);	/* Can't fail for RCTL_FIRST */
			ASSERT(oval.rcv_privilege != RCPRIV_BASIC);
			if (oval.rcv_privilege == RCPRIV_SYSTEM)
				break;
			mutex_enter(&pp->p_lock);
			error = rctl_local_delete(hndl, &oval, pp);
			mutex_exit(&pp->p_lock);
			ASSERT(error == 0);
		}
		error = nvpair_value_nvlist_array(nvp, &nvlarray, &nelem);
		ASSERT(error == 0);
		for (i = 0; i < nelem; i++) {
			rctl_val_t *nvalp;

			nvalp = kmem_cache_alloc(rctl_val_cache, KM_SLEEP);
			error = nvlist2rctlval(nvlarray[i], nvalp);
			ASSERT(error == 0);
			/*
			 * rctl_local_insert can fail if the value being
			 * inserted is a duplicate; this is OK.
			 */
			mutex_enter(&pp->p_lock);
			if (rctl_local_insert(hndl, nvalp, pp) != 0)
				kmem_cache_free(rctl_val_cache, nvalp);
			mutex_exit(&pp->p_lock);
		}
	}
	/*
	 * Tell the world that we're done setting up.
	 *
	 * At this point we want to set the zone status to ZONE_IS_INITIALIZED
	 * and atomically set the zone's processor set visibility.  Once
	 * we drop pool_lock() this zone will automatically get updated
	 * to reflect any future changes to the pools configuration.
	 *
	 * Note that after we drop the locks below (zonehash_lock in
	 * particular) other operations such as a zone_getattr call can
	 * now proceed and observe the zone. That is the reason for doing a
	 * state transition to the INITIALIZED state.
	 */
	pool_lock();
	mutex_enter(&cpu_lock);
	mutex_enter(&zonehash_lock);
	zone_uniqid(zone);
	zone_zsd_configure(zone);
	if (pool_state == POOL_ENABLED)
		zone_pset_set(zone, pool_default->pool_pset->pset_id);
	mutex_enter(&zone_status_lock);
	ASSERT(zone_status_get(zone) == ZONE_IS_UNINITIALIZED);
	zone_status_set(zone, ZONE_IS_INITIALIZED);
	mutex_exit(&zone_status_lock);
	mutex_exit(&zonehash_lock);
	mutex_exit(&cpu_lock);
	pool_unlock();

	/* Now run the create callbacks of all ZSD keys for this zone */
	zsd_apply_all_keys(zsd_apply_create, zone);

	/* The callbacks are complete. Mark ZONE_IS_READY */
	mutex_enter(&zone_status_lock);
	ASSERT(zone_status_get(zone) == ZONE_IS_INITIALIZED);
	zone_status_set(zone, ZONE_IS_READY);
	mutex_exit(&zone_status_lock);

	/*
	 * Once we see the zone transition to the ZONE_IS_BOOTING state,
	 * we launch init, and set the state to running.
	 */
	zone_status_wait_cpr(zone, ZONE_IS_BOOTING, "zsched");

	/*
	 * The status is re-checked because the wait may have ended with the
	 * zone in some state other than ZONE_IS_BOOTING; init is only
	 * started for a zone that is actually booting.
	 */
	if (zone_status_get(zone) == ZONE_IS_BOOTING) {
		id_t cid;

		/*
		 * Ok, this is a little complicated.  We need to grab the
		 * zone's pool's scheduling class ID; note that by now, we
		 * are already bound to a pool if we need to be (zoneadmd
		 * will have done that to us while we're in the READY
		 * state).  *But* the scheduling class for the zone's 'init'
		 * must be explicitly passed to newproc, which doesn't
		 * respect pool bindings.
		 *
		 * We hold the pool_lock across the call to newproc() to
		 * close the obvious race: the pool's scheduling class
		 * could change before we manage to create the LWP with
		 * classid 'cid'.
		 */
		pool_lock();
		if (zone->zone_defaultcid > 0)
			cid = zone->zone_defaultcid;
		else
			cid = pool_get_class(zone->zone_pool);
		if (cid == -1)
			cid = defaultcid;

		/*
		 * If this fails, zone_boot will ultimately fail.  The
		 * state of the zone will be set to SHUTTING_DOWN-- userland
		 * will have to tear down the zone, and fail, or try again.
		 */
		if ((zone->zone_boot_err = newproc(zone_start_init, NULL, cid,
		    minclsyspri - 1, &ct)) != 0) {
			mutex_enter(&zone_status_lock);
			zone_status_set(zone, ZONE_IS_SHUTTING_DOWN);
			mutex_exit(&zone_status_lock);
		}
		pool_unlock();
	}

	/*
	 * Wait for zone_destroy() to be called.  This is what we spend
	 * most of our life doing.
	 */
	zone_status_wait_cpr(zone, ZONE_IS_DYING, "zsched");

	/* 'ct' is only non-NULL if newproc() above handed back a contract. */
	if (ct)
		/*
		 * At this point the process contract should be empty.
		 * (Though if it isn't, it's not the end of the world.)
		 */
		VERIFY(contract_abandon(ct, curproc, B_TRUE) == 0);

	/*
	 * Allow kcred to be freed when all referring processes
	 * (including this one) go away.  We can't just do this in
	 * zone_free because we need to wait for the zone_cred_ref to
	 * drop to 0 before calling zone_free, and the existence of
	 * zone_kcred will prevent that.  Thus, we call crfree here to
	 * balance the crdup in zone_create.  The crhold calls earlier
	 * in zsched will be dropped when the thread and process exit.
	 */
	crfree(zone->zone_kcred);
	zone->zone_kcred = NULL;

	exit(CLD_EXITED, 0);
}
   3520 
   3521 /*
   3522  * Helper function to determine if there are any submounts of the
   3523  * provided path.  Used to make sure the zone doesn't "inherit" any
   3524  * mounts from before it is created.
   3525  */
   3526 static uint_t
   3527 zone_mount_count(const char *rootpath)
   3528 {
   3529 	vfs_t *vfsp;
   3530 	uint_t count = 0;
   3531 	size_t rootpathlen = strlen(rootpath);
   3532 
   3533 	/*
   3534 	 * Holding zonehash_lock prevents race conditions with
   3535 	 * vfs_list_add()/vfs_list_remove() since we serialize with
   3536 	 * zone_find_by_path().
   3537 	 */
   3538 	ASSERT(MUTEX_HELD(&zonehash_lock));
   3539 	/*
   3540 	 * The rootpath must end with a '/'
   3541 	 */
   3542 	ASSERT(rootpath[rootpathlen - 1] == '/');
   3543 
   3544 	/*
   3545 	 * This intentionally does not count the rootpath itself if that
   3546 	 * happens to be a mount point.
   3547 	 */
   3548 	vfs_list_read_lock();
   3549 	vfsp = rootvfs;
   3550 	do {
   3551 		if (strncmp(rootpath, refstr_value(vfsp->vfs_mntpt),
   3552 		    rootpathlen) == 0)
   3553 			count++;
   3554 		vfsp = vfsp->vfs_next;
   3555 	} while (vfsp != rootvfs);
   3556 	vfs_list_unlock();
   3557 	return (count);
   3558 }
   3559 
   3560 /*
   3561  * Helper function to make sure that a zone created on 'rootpath'
   3562  * wouldn't end up containing other zones' rootpaths.
   3563  */
   3564 static boolean_t
   3565 zone_is_nested(const char *rootpath)
   3566 {
   3567 	zone_t *zone;
   3568 	size_t rootpathlen = strlen(rootpath);
   3569 	size_t len;
   3570 
   3571 	ASSERT(MUTEX_HELD(&zonehash_lock));
   3572 
   3573 	/*
   3574 	 * zone_set_root() appended '/' and '\0' at the end of rootpath
   3575 	 */
   3576 	if ((rootpathlen <= 3) && (rootpath[0] == '/') &&
   3577 	    (rootpath[1] == '/') && (rootpath[2] == '\0'))
   3578 		return (B_TRUE);
   3579 
   3580 	for (zone = list_head(&zone_active); zone != NULL;
   3581 	    zone = list_next(&zone_active, zone)) {
   3582 		if (zone == global_zone)
   3583 			continue;
   3584 		len = strlen(zone->zone_rootpath);
   3585 		if (strncmp(rootpath, zone->zone_rootpath,
   3586 		    MIN(rootpathlen, len)) == 0)
   3587 			return (B_TRUE);
   3588 	}
   3589 	return (B_FALSE);
   3590 }
   3591 
   3592 static int
   3593 zone_set_privset(zone_t *zone, const priv_set_t *zone_privs,
   3594     size_t zone_privssz)
   3595 {
   3596 	priv_set_t *privs = kmem_alloc(sizeof (priv_set_t), KM_SLEEP);
   3597 
   3598 	if (zone_privssz < sizeof (priv_set_t))
   3599 		return (set_errno(ENOMEM));
   3600 
   3601 	if (copyin(zone_privs, privs, sizeof (priv_set_t))) {
   3602 		kmem_free(privs, sizeof (priv_set_t));
   3603 		return (EFAULT);
   3604 	}
   3605 
   3606 	zone->zone_privset = privs;
   3607 	return (0);
   3608 }
   3609 
   3610 /*
   3611  * We make creative use of nvlists to pass in rctls from userland.  The list is
   3612  * a list of the following structures:
   3613  *
   3614  * (name = rctl_name, value = nvpair_list_array)
   3615  *
   3616  * Where each element of the nvpair_list_array is of the form:
   3617  *
   3618  * [(name = "privilege", value = RCPRIV_PRIVILEGED),
   3619  * 	(name = "limit", value = uint64_t),
   3620  * 	(name = "action", value = (RCTL_LOCAL_NOACTION || RCTL_LOCAL_DENY))]
   3621  */
   3622 static int
   3623 parse_rctls(caddr_t ubuf, size_t buflen, nvlist_t **nvlp)
   3624 {
   3625 	nvpair_t *nvp = NULL;
   3626 	nvlist_t *nvl = NULL;
   3627 	char *kbuf;
   3628 	int error;
   3629 	rctl_val_t rv;
   3630 
   3631 	*nvlp = NULL;
   3632 
   3633 	if (buflen == 0)
   3634 		return (0);
   3635 
   3636 	if ((kbuf = kmem_alloc(buflen, KM_NOSLEEP)) == NULL)
   3637 		return (ENOMEM);
   3638 	if (copyin(ubuf, kbuf, buflen)) {
   3639 		error = EFAULT;
   3640 		goto out;
   3641 	}
   3642 	if (nvlist_unpack(kbuf, buflen, &nvl, KM_SLEEP) != 0) {
   3643 		/*
   3644 		 * nvl may have been allocated/free'd, but the value set to
   3645 		 * non-NULL, so we reset it here.
   3646 		 */
   3647 		nvl = NULL;
   3648 		error = EINVAL;
   3649 		goto out;
   3650 	}
   3651 	while ((nvp = nvlist_next_nvpair(nvl, nvp)) != NULL) {
   3652 		rctl_dict_entry_t *rde;
   3653 		rctl_hndl_t hndl;
   3654 		nvlist_t **nvlarray;
   3655 		uint_t i, nelem;
   3656 		char *name;
   3657 
   3658 		error = EINVAL;
   3659 		name = nvpair_name(nvp);
   3660 		if (strncmp(nvpair_name(nvp), "zone.", sizeof ("zone.") - 1)
   3661 		    != 0 || nvpair_type(nvp) != DATA_TYPE_NVLIST_ARRAY) {
   3662 			goto out;
   3663 		}
   3664 		if ((hndl = rctl_hndl_lookup(name)) == -1) {
   3665 			goto out;
   3666 		}
   3667 		rde = rctl_dict_lookup_hndl(hndl);
   3668 		error = nvpair_value_nvlist_array(nvp, &nvlarray, &nelem);
   3669 		ASSERT(error == 0);
   3670 		for (i = 0; i < nelem; i++) {
   3671 			if (error = nvlist2rctlval(nvlarray[i], &rv))
   3672 				goto out;
   3673 		}
   3674 		if (rctl_invalid_value(rde, &rv)) {
   3675 			error = EINVAL;
   3676 			goto out;
   3677 		}
   3678 	}
   3679 	error = 0;
   3680 	*nvlp = nvl;
   3681 out:
   3682 	kmem_free(kbuf, buflen);
   3683 	if (error && nvl != NULL)
   3684 		nvlist_free(nvl);
   3685 	return (error);
   3686 }
   3687 
   3688 int
   3689 zone_create_error(int er_error, int er_ext, int *er_out) {
   3690 	if (er_out != NULL) {
   3691 		if (copyout(&er_ext, er_out, sizeof (int))) {
   3692 			return (set_errno(EFAULT));
   3693 		}
   3694 	}
   3695 	return (set_errno(er_error));
   3696 }
   3697 
   3698 static int
   3699 zone_set_label(zone_t *zone, const bslabel_t *lab, uint32_t doi)
   3700 {
   3701 	ts_label_t *tsl;
   3702 	bslabel_t blab;
   3703 
   3704 	/* Get label from user */
   3705 	if (copyin(lab, &blab, sizeof (blab)) != 0)
   3706 		return (EFAULT);
   3707 	tsl = labelalloc(&blab, doi, KM_NOSLEEP);
   3708 	if (tsl == NULL)
   3709 		return (ENOMEM);
   3710 
   3711 	zone->zone_slabel = tsl;
   3712 	return (0);
   3713 }
   3714 
   3715 /*
   3716  * Parses a comma-separated list of ZFS datasets into a per-zone dictionary.
   3717  */
   3718 static int
   3719 parse_zfs(zone_t *zone, caddr_t ubuf, size_t buflen)
   3720 {
   3721 	char *kbuf;
   3722 	char *dataset, *next;
   3723 	zone_dataset_t *zd;
   3724 	size_t len;
   3725 
   3726 	if (ubuf == NULL || buflen == 0)
   3727 		return (0);
   3728 
   3729 	if ((kbuf = kmem_alloc(buflen, KM_NOSLEEP)) == NULL)
   3730 		return (ENOMEM);
   3731 
   3732 	if (copyin(ubuf, kbuf, buflen) != 0) {
   3733 		kmem_free(kbuf, buflen);
   3734 		return (EFAULT);
   3735 	}
   3736 
   3737 	dataset = next = kbuf;
   3738 	for (;;) {
   3739 		zd = kmem_alloc(sizeof (zone_dataset_t), KM_SLEEP);
   3740 
   3741 		next = strchr(dataset, ',');
   3742 
   3743 		if (next == NULL)
   3744 			len = strlen(dataset);
   3745 		else
   3746 			len = next - dataset;
   3747 
   3748 		zd->zd_dataset = kmem_alloc(len + 1, KM_SLEEP);
   3749 		bcopy(dataset, zd->zd_dataset, len);
   3750 		zd->zd_dataset[len] = '\0';
   3751 
   3752 		list_insert_head(&zone->zone_datasets, zd);
   3753 
   3754 		if (next == NULL)
   3755 			break;
   3756 
   3757 		dataset = next + 1;
   3758 	}
   3759 
   3760 	kmem_free(kbuf, buflen);
   3761 	return (0);
   3762 }
   3763 
   3764 /*
   3765  * System call to create/initialize a new zone named 'zone_name', rooted
   3766  * at 'zone_root', with a zone-wide privilege limit set of 'zone_privs',
   3767  * and initialized with the zone-wide rctls described in 'rctlbuf', and
   3768  * with labeling set by 'match', 'doi', and 'label'.
   3769  *
   3770  * If extended error is non-null, we may use it to return more detailed
   3771  * error information.
   3772  */
   3773 static zoneid_t
   3774 zone_create(const char *zone_name, const char *zone_root,
   3775     const priv_set_t *zone_privs, size_t zone_privssz,
   3776     caddr_t rctlbuf, size_t rctlbufsz,
   3777     caddr_t zfsbuf, size_t zfsbufsz, int *extended_error,
   3778     int match, uint32_t doi, const bslabel_t *label,
   3779     int flags)
   3780 {
   3781 	struct zsched_arg zarg;
   3782 	nvlist_t *rctls = NULL;
   3783 	proc_t *pp = curproc;
   3784 	zone_t *zone, *ztmp;
   3785 	zoneid_t zoneid;
   3786 	int error;
   3787 	int error2 = 0;
   3788 	char *str;
   3789 	cred_t *zkcr;
   3790 	boolean_t insert_label_hash;
   3791 
   3792 	if (secpolicy_zone_config(CRED()) != 0)
   3793 		return (set_errno(EPERM));
   3794 
   3795 	/* can't boot zone from within chroot environment */
   3796 	if (PTOU(pp)->u_rdir != NULL && PTOU(pp)->u_rdir != rootdir)
   3797 		return (zone_create_error(ENOTSUP, ZE_CHROOTED,
   3798 		    extended_error));
   3799 
   3800 	zone = kmem_zalloc(sizeof (zone_t), KM_SLEEP);
   3801 	zoneid = zone->zone_id = id_alloc(zoneid_space);
   3802 	zone->zone_status = ZONE_IS_UNINITIALIZED;
   3803 	zone->zone_pool = pool_default;
   3804 	zone->zone_pool_mod = gethrtime();
   3805 	zone->zone_psetid = ZONE_PS_INVAL;
   3806 	zone->zone_ncpus = 0;
   3807 	zone->zone_ncpus_online = 0;
   3808 	zone->zone_restart_init = B_TRUE;
   3809 	zone->zone_brand = &native_brand;
   3810 	zone->zone_initname = NULL;
   3811 	mutex_init(&zone->zone_lock, NULL, MUTEX_DEFAULT, NULL);
   3812 	mutex_init(&zone->zone_nlwps_lock, NULL, MUTEX_DEFAULT, NULL);
   3813 	mutex_init(&zone->zone_mem_lock, NULL, MUTEX_DEFAULT, NULL);
   3814 	cv_init(&zone->zone_cv, NULL, CV_DEFAULT, NULL);
   3815 	list_create(&zone->zone_zsd, sizeof (struct zsd_entry),
   3816 	    offsetof(struct zsd_entry, zsd_linkage));
   3817 	list_create(&zone->zone_datasets, sizeof (zone_dataset_t),
   3818 	    offsetof(zone_dataset_t, zd_linkage));
   3819 	list_create(&zone->zone_dl_list, sizeof (zone_dl_t),
   3820 	    offsetof(zone_dl_t, zdl_linkage));
   3821 	rw_init(&zone->zone_mlps.mlpl_rwlock, NULL, RW_DEFAULT, NULL);
   3822 	rw_init(&zone->zone_mntfs_db_lock, NULL, RW_DEFAULT, NULL);
   3823 
   3824 	if (flags & ZCF_NET_EXCL) {
   3825 		zone->zone_flags |= ZF_NET_EXCL;
   3826 	}
   3827 
   3828 	if ((error = zone_set_name(zone, zone_name)) != 0) {
   3829 		zone_free(zone);
   3830 		return (zone_create_error(error, 0, extended_error));
   3831 	}
   3832 
   3833 	if ((error = zone_set_root(zone, zone_root)) != 0) {
   3834 		zone_free(zone);
   3835 		return (zone_create_error(error, 0, extended_error));
   3836 	}
   3837 	if ((error = zone_set_privset(zone, zone_privs, zone_privssz)) != 0) {
   3838 		zone_free(zone);
   3839 		return (zone_create_error(error, 0, extended_error));
   3840 	}
   3841 
   3842 	/* initialize node name to be the same as zone name */
   3843 	zone->zone_nodename = kmem_alloc(_SYS_NMLN, KM_SLEEP);
   3844 	(void) strncpy(zone->zone_nodename, zone->zone_name, _SYS_NMLN);
   3845 	zone->zone_nodename[_SYS_NMLN - 1] = '\0';
   3846 
   3847 	zone->zone_domain = kmem_alloc(_SYS_NMLN, KM_SLEEP);
   3848 	zone->zone_domain[0] = '\0';
   3849 	zone->zone_hostid = HW_INVALID_HOSTID;
   3850 	zone->zone_shares = 1;
   3851 	zone->zone_shmmax = 0;
   3852 	zone->zone_ipc.ipcq_shmmni = 0;
   3853 	zone->zone_ipc.ipcq_semmni = 0;
   3854 	zone->zone_ipc.ipcq_msgmni = 0;
   3855 	zone->zone_bootargs = NULL;
   3856 	zone->zone_initname =
   3857 	    kmem_alloc(strlen(zone_default_initname) + 1, KM_SLEEP);
   3858 	(void) strcpy(zone->zone_initname, zone_default_initname);
   3859 	zone->zone_nlwps = 0;
   3860 	zone->zone_nlwps_ctl = INT_MAX;
   3861 	zone->zone_locked_mem = 0;
   3862 	zone->zone_locked_mem_ctl = UINT64_MAX;
   3863 	zone->zone_max_swap = 0;
   3864 	zone->zone_max_swap_ctl = UINT64_MAX;
   3865 	zone0.zone_lockedmem_kstat = NULL;
   3866 	zone0.zone_swapresv_kstat = NULL;
   3867 
   3868 	/*
   3869 	 * Zsched initializes the rctls.
   3870 	 */
   3871 	zone->zone_rctls = NULL;
   3872 
   3873 	if ((error = parse_rctls(rctlbuf, rctlbufsz, &rctls)) != 0) {
   3874 		zone_free(zone);
   3875 		return (zone_create_error(error, 0, extended_error));
   3876 	}
   3877 
   3878 	if ((error = parse_zfs(zone, zfsbuf, zfsbufsz)) != 0) {
   3879 		zone_free(zone);
   3880 		return (set_errno(error));
   3881 	}
   3882 
   3883 	/*
   3884 	 * Read in the trusted system parameters:
   3885 	 * match flag and sensitivity label.
   3886 	 */
   3887 	zone->zone_match = match;
   3888 	if (is_system_labeled() && !(zone->zone_flags & ZF_IS_SCRATCH)) {
   3889 		/* Fail if requested to set doi to anything but system's doi */
   3890 		if (doi != 0 && doi != default_doi) {
   3891 			zone_free(zone);
   3892 			return (set_errno(EINVAL));
   3893 		}
   3894 		/* Always apply system's doi to the zone */
   3895 		error = zone_set_label(zone, label, default_doi);
   3896 		if (error != 0) {
   3897 			zone_free(zone);
   3898 			return (set_errno(error));
   3899 		}
   3900 		insert_label_hash = B_TRUE;
   3901 	} else {
   3902 		/* all zones get an admin_low label if system is not labeled */
   3903 		zone->zone_slabel = l_admin_low;
   3904 		label_hold(l_admin_low);
   3905 		insert_label_hash = B_FALSE;
   3906 	}
   3907 
   3908 	/*
   3909 	 * Stop all lwps since that's what normally happens as part of fork().
   3910 	 * This needs to happen before we grab any locks to avoid deadlock
   3911 	 * (another lwp in the process could be waiting for the held lock).
   3912 	 */
   3913 	if (curthread != pp->p_agenttp && !holdlwps(SHOLDFORK)) {
   3914 		zone_free(zone);
   3915 		if (rctls)
   3916 			nvlist_free(rctls);
   3917 		return (zone_create_error(error, 0, extended_error));
   3918 	}
   3919 
   3920 	if (block_mounts() == 0) {
   3921 		mutex_enter(&pp->p_lock);
   3922 		if (curthread != pp->p_agenttp)
   3923 			continuelwps(pp);
   3924 		mutex_exit(&pp->p_lock);
   3925 		zone_free(zone);
   3926 		if (rctls)
   3927 			nvlist_free(rctls);
   3928 		return (zone_create_error(error, 0, extended_error));
   3929 	}
   3930 
   3931 	/*
   3932 	 * Set up credential for kernel access.  After this, any errors
   3933 	 * should go through the dance in errout rather than calling
   3934 	 * zone_free directly.
   3935 	 */
   3936 	zone->zone_kcred = crdup(kcred);
   3937 	crsetzone(zone->zone_kcred, zone);
   3938 	priv_intersect(zone->zone_privset, &CR_PPRIV(zone->zone_kcred));
   3939 	priv_intersect(zone->zone_privset, &CR_EPRIV(zone->zone_kcred));
   3940 	priv_intersect(zone->zone_privset, &CR_IPRIV(zone->zone_kcred));
   3941 	priv_intersect(zone->zone_privset, &CR_LPRIV(zone->zone_kcred));
   3942 
   3943 	mutex_enter(&zonehash_lock);
   3944 	/*
   3945 	 * Make sure zone doesn't already exist.
   3946 	 *
   3947 	 * If the system and zone are labeled,
   3948 	 * make sure no other zone exists that has the same label.
   3949 	 */
   3950 	if ((ztmp = zone_find_all_by_name(zone->zone_name)) != NULL ||
   3951 	    (insert_label_hash &&
   3952 	    (ztmp = zone_find_all_by_label(zone->zone_slabel)) != NULL)) {
   3953 		zone_status_t status;
   3954 
   3955 		status = zone_status_get(ztmp);
   3956 		if (status == ZONE_IS_READY || status == ZONE_IS_RUNNING)
   3957 			error = EEXIST;
   3958 		else
   3959 			error = EBUSY;
   3960 
   3961 		if (insert_label_hash)
   3962 			error2 = ZE_LABELINUSE;
   3963 
   3964 		goto errout;
   3965 	}
   3966 
   3967 	/*
   3968 	 * Don't allow zone creations which would cause one zone's rootpath to
   3969 	 * be accessible from that of another (non-global) zone.
   3970 	 */
   3971 	if (zone_is_nested(zone->zone_rootpath)) {
   3972 		error = EBUSY;
   3973 		goto errout;
   3974 	}
   3975 
   3976 	ASSERT(zonecount != 0);		/* check for leaks */
   3977 	if (zonecount + 1 > maxzones) {
   3978 		error = ENOMEM;
   3979 		goto errout;
   3980 	}
   3981 
   3982 	if (zone_mount_count(zone->zone_rootpath) != 0) {
   3983 		error = EBUSY;
   3984 		error2 = ZE_AREMOUNTS;
   3985 		goto errout;
   3986 	}
   3987 
   3988 	/*
   3989 	 * Zone is still incomplete, but we need to drop all locks while
   3990 	 * zsched() initializes this zone's kernel process.  We
   3991 	 * optimistically add the zone to the hashtable and associated
   3992 	 * lists so a parallel zone_create() doesn't try to create the
   3993 	 * same zone.
   3994 	 */
   3995 	zonecount++;
   3996 	(void) mod_hash_insert(zonehashbyid,
   3997 	    (mod_hash_key_t)(uintptr_t)zone->zone_id,
   3998 	    (mod_hash_val_t)(uintptr_t)zone);
   3999 	str = kmem_alloc(strlen(zone->zone_name) + 1, KM_SLEEP);
   4000 	(void) strcpy(str, zone->zone_name);
   4001 	(void) mod_hash_insert(zonehashbyname, (mod_hash_key_t)str,
   4002 	    (mod_hash_val_t)(uintptr_t)zone);
   4003 	if (insert_label_hash) {
   4004 		(void) mod_hash_insert(zonehashbylabel,
   4005 		    (mod_hash_key_t)zone->zone_slabel, (mod_hash_val_t)zone);
   4006 		zone->zone_flags |= ZF_HASHED_LABEL;
   4007 	}
   4008 
   4009 	/*
   4010 	 * Insert into active list.  At this point there are no 'hold's
   4011 	 * on the zone, but everyone else knows not to use it, so we can
   4012 	 * continue to use it.  zsched() will do a zone_hold() if the
   4013 	 * newproc() is successful.
   4014 	 */
   4015 	list_insert_tail(&zone_active, zone);
   4016 	mutex_exit(&zonehash_lock);
   4017 
   4018 	zarg.zone = zone;
   4019 	zarg.nvlist = rctls;
   4020 	/*
   4021 	 * The process, task, and project rctls are probably wrong;
   4022 	 * we need an interface to get the default values of all rctls,
   4023 	 * and initialize zsched appropriately.  I'm not sure that that
   4024 	 * makes much of a difference, though.
   4025 	 */
   4026 	if (error = newproc(zsched, (void *)&zarg, syscid, minclsyspri, NULL)) {
   4027 		/*
   4028 		 * We need to undo all globally visible state.
   4029 		 */
   4030 		mutex_enter(&zonehash_lock);
   4031 		list_remove(&zone_active, zone);
   4032 		if (zone->zone_flags & ZF_HASHED_LABEL) {
   4033 			ASSERT(zone->zone_slabel != NULL);
   4034 			(void) mod_hash_destroy(zonehashbylabel,
   4035 			    (mod_hash_key_t)zone->zone_slabel);
   4036 		}
   4037 		(void) mod_hash_destroy(zonehashbyname,
   4038 		    (mod_hash_key_t)(uintptr_t)zone->zone_name);
   4039 		(void) mod_hash_destroy(zonehashbyid,
   4040 		    (mod_hash_key_t)(uintptr_t)zone->zone_id);
   4041 		ASSERT(zonecount > 1);
   4042 		zonecount--;
   4043 		goto errout;
   4044 	}
   4045 
   4046 	/*
   4047 	 * Zone creation can't fail from now on.
   4048 	 */
   4049 
   4050 	/*
   4051 	 * Create zone kstats
   4052 	 */
   4053 	zone_kstat_create(zone);
   4054 
   4055 	/*
   4056 	 * Let the other lwps continue.
   4057 	 */
   4058 	mutex_enter(&pp->p_lock);
   4059 	if (curthread != pp->p_agenttp)
   4060 		continuelwps(pp);
   4061 	mutex_exit(&pp->p_lock);
   4062 
   4063 	/*
   4064 	 * Wait for zsched to finish initializing the zone.
   4065 	 */
   4066 	zone_status_wait(zone, ZONE_IS_READY);
   4067 	/*
   4068 	 * The zone is fully visible, so we can let mounts progress.
   4069 	 */
   4070 	resume_mounts();
   4071 	if (rctls)
   4072 		nvlist_free(rctls);
   4073 
   4074 	return (zoneid);
   4075 
   4076 errout:
   4077 	mutex_exit(&zonehash_lock);
   4078 	/*
   4079 	 * Let the other lwps continue.
   4080 	 */
   4081 	mutex_enter(&pp->p_lock);
   4082 	if (curthread != pp->p_agenttp)
   4083 		continuelwps(pp);
   4084 	mutex_exit(&pp->p_lock);
   4085 
   4086 	resume_mounts();
   4087 	if (rctls)
   4088 		nvlist_free(rctls);
   4089 	/*
   4090 	 * There is currently one reference to the zone, a cred_ref from
   4091 	 * zone_kcred.  To free the zone, we call crfree, which will call
   4092 	 * zone_cred_rele, which will call zone_free.
   4093 	 */
   4094 	ASSERT(zone->zone_cred_ref == 1);	/* for zone_kcred */
   4095 	ASSERT(zone->zone_kcred->cr_ref == 1);
   4096 	ASSERT(zone->zone_ref == 0);
   4097 	zkcr = zone->zone_kcred;
   4098 	zone->zone_kcred = NULL;
   4099 	crfree(zkcr);				/* triggers call to zone_free */
   4100 	return (zone_create_error(error, error2, extended_error));
   4101 }
   4102 
   4103 /*
   4104  * Cause the zone to boot.  This is pretty simple, since we let zoneadmd do
   4105  * the heavy lifting.  initname is the path to the program to launch
   4106  * at the "top" of the zone; if this is NULL, we use the system default,
   4107  * which is stored at zone_default_initname.
   4108  */
   4109 static int
   4110 zone_boot(zoneid_t zoneid)
   4111 {
   4112 	int err;
   4113 	zone_t *zone;
   4114 
   4115 	if (secpolicy_zone_config(CRED()) != 0)
   4116 		return (set_errno(EPERM));
   4117 	if (zoneid < MIN_USERZONEID || zoneid > MAX_ZONEID)
   4118 		return (set_errno(EINVAL));
   4119 
   4120 	mutex_enter(&zonehash_lock);
   4121 	/*
   4122 	 * Look for zone under hash lock to prevent races with calls to
   4123 	 * zone_shutdown, zone_destroy, etc.
   4124 	 */
   4125 	if ((zone = zone_find_all_by_id(zoneid)) == NULL) {
   4126 		mutex_exit(&zonehash_lock);
   4127 		return (set_errno(EINVAL));
   4128 	}
   4129 
   4130 	mutex_enter(&zone_status_lock);
   4131 	if (zone_status_get(zone) != ZONE_IS_READY) {
   4132 		mutex_exit(&zone_status_lock);
   4133 		mutex_exit(&zonehash_lock);
   4134 		return (set_errno(EINVAL));
   4135 	}
   4136 	zone_status_set(zone, ZONE_IS_BOOTING);
   4137 	mutex_exit(&zone_status_lock);
   4138 
   4139 	zone_hold(zone);	/* so we can use the zone_t later */
   4140 	mutex_exit(&zonehash_lock);
   4141 
   4142 	if (zone_status_wait_sig(zone, ZONE_IS_RUNNING) == 0) {
   4143 		zone_rele(zone);
   4144 		return (set_errno(EINTR));
   4145 	}
   4146 
   4147 	/*
   4148 	 * Boot (starting init) might have failed, in which case the zone
   4149 	 * will go to the SHUTTING_DOWN state; an appropriate errno will
   4150 	 * be placed in zone->zone_boot_err, and so we return that.
   4151 	 */
   4152 	err = zone->zone_boot_err;
   4153 	zone_rele(zone);
   4154 	return (err ? set_errno(err) : 0);
   4155 }
   4156 
   4157 /*
   4158  * Kills all user processes in the zone, waiting for them all to exit
   4159  * before returning.
   4160  */
   4161 static int
   4162 zone_empty(zone_t *zone)
   4163 {
   4164 	int waitstatus;
   4165 
   4166 	/*
   4167 	 * We need to drop zonehash_lock before killing all
   4168 	 * processes, otherwise we'll deadlock with zone_find_*
   4169 	 * which can be called from the exit path.
   4170 	 */
   4171 	ASSERT(MUTEX_NOT_HELD(&zonehash_lock));
   4172 	while ((waitstatus = zone_status_timedwait_sig(zone,
   4173 	    ddi_get_lbolt() + hz, ZONE_IS_EMPTY)) == -1) {
   4174 		killall(zone->zone_id);
   4175 	}
   4176 	/*
   4177 	 * return EINTR if we were signaled
   4178 	 */
   4179 	if (waitstatus == 0)
   4180 		return (EINTR);
   4181 	return (0);
   4182 }
   4183 
   4184 /*
   4185  * This function implements the policy for zone visibility.
   4186  *
   4187  * In standard Solaris, a non-global zone can only see itself.
   4188  *
   4189  * In Trusted Extensions, a labeled zone can lookup any zone whose label
   4190  * it dominates. For this test, the label of the global zone is treated as
   4191  * admin_high so it is special-cased instead of being checked for dominance.
   4192  *
   4193  * Returns true if zone attributes are viewable, false otherwise.
   4194  */
   4195 static boolean_t
   4196 zone_list_access(zone_t *zone)
   4197 {
   4198 
   4199 	if (curproc->p_zone == global_zone ||
   4200 	    curproc->p_zone == zone) {
   4201 		return (B_TRUE);
   4202 	} else if (is_system_labeled() && !(zone->zone_flags & ZF_IS_SCRATCH)) {
   4203 		bslabel_t *curproc_label;
   4204 		bslabel_t *zone_label;
   4205 
   4206 		curproc_label = label2bslabel(curproc->p_zone->zone_slabel);
   4207 		zone_label = label2bslabel(zone->zone_slabel);
   4208 
   4209 		if (zone->zone_id != GLOBAL_ZONEID &&
   4210 		    bldominates(curproc_label, zone_label)) {
   4211 			return (B_TRUE);
   4212 		} else {
   4213 			return (B_FALSE);
   4214 		}
   4215 	} else {
   4216 		return (B_FALSE);
   4217 	}
   4218 }
   4219 
   4220 /*
   4221  * Systemcall to start the zone's halt sequence.  By the time this
   4222  * function successfully returns, all user processes and kernel threads
   4223  * executing in it will have exited, ZSD shutdown callbacks executed,
   4224  * and the zone status set to ZONE_IS_DOWN.
   4225  *
   4226  * It is possible that the call will interrupt itself if the caller is the
   4227  * parent of any process running in the zone, and doesn't have SIGCHLD blocked.
   4228  */
   4229 static int
   4230 zone_shutdown(zoneid_t zoneid)
   4231 {
   4232 	int error;
   4233 	zone_t *zone;
   4234 	zone_status_t status;
   4235 
   4236 	if (secpolicy_zone_config(CRED()) != 0)
   4237 		return (set_errno(EPERM));
   4238 	if (zoneid < MIN_USERZONEID || zoneid > MAX_ZONEID)
   4239 		return (set_errno(EINVAL));
   4240 
   4241 	/*
   4242 	 * Block mounts so that VFS_MOUNT() can get an accurate view of
   4243 	 * the zone's status with regards to ZONE_IS_SHUTTING down.
   4244 	 *
   4245 	 * e.g. NFS can fail the mount if it determines that the zone
   4246 	 * has already begun the shutdown sequence.
   4247 	 */
   4248 	if (block_mounts() == 0)
   4249 		return (set_errno(EINTR));
   4250 	mutex_enter(&zonehash_lock);
   4251 	/*
   4252 	 * Look for zone under hash lock to prevent races with other
   4253 	 * calls to zone_shutdown and zone_destroy.
   4254 	 */
   4255 	if ((zone = zone_find_all_by_id(zoneid)) == NULL) {
   4256 		mutex_exit(&zonehash_lock);
   4257 		resume_mounts();
   4258 		return (set_errno(EINVAL));
   4259 	}
   4260 	mutex_enter(&zone_status_lock);
   4261 	status = zone_status_get(zone);
   4262 	/*
   4263 	 * Fail if the zone isn't fully initialized yet.
   4264 	 */
   4265 	if (status < ZONE_IS_READY) {
   4266 		mutex_exit(&zone_status_lock);
   4267 		mutex_exit(&zonehash_lock);
   4268 		resume_mounts();
   4269 		return (set_errno(EINVAL));
   4270 	}
   4271 	/*
   4272 	 * If conditions required for zone_shutdown() to return have been met,
   4273 	 * return success.
   4274 	 */
   4275 	if (status >= ZONE_IS_DOWN) {
   4276 		mutex_exit(&zone_status_lock);
   4277 		mutex_exit(&zonehash_lock);
   4278 		resume_mounts();
   4279 		return (0);
   4280 	}
   4281 	/*
   4282 	 * If zone_shutdown() hasn't been called before, go through the motions.
   4283 	 * If it has, there's nothing to do but wait for the kernel threads to
   4284 	 * drain.
   4285 	 */
   4286 	if (status < ZONE_IS_EMPTY) {
   4287 		uint_t ntasks;
   4288 
   4289 		mutex_enter(&zone->zone_lock);
   4290 		if ((ntasks = zone->zone_ntasks) != 1) {
   4291 			/*
   4292 			 * There's still stuff running.
   4293 			 */
   4294 			zone_status_set(zone, ZONE_IS_SHUTTING_DOWN);
   4295 		}
   4296 		mutex_exit(&zone->zone_lock);
   4297 		if (ntasks == 1) {
   4298 			/*
   4299 			 * The only way to create another task is through
   4300 			 * zone_enter(), which will block until we drop
   4301 			 * zonehash_lock.  The zone is empty.
   4302 			 */
   4303 			if (zone->zone_kthreads == NULL) {
   4304 				/*
   4305 				 * Skip ahead to ZONE_IS_DOWN
   4306 				 */
   4307 				zone_status_set(zone, ZONE_IS_DOWN);
   4308 			} else {
   4309 				zone_status_set(zone, ZONE_IS_EMPTY);
   4310 			}
   4311 		}
   4312 	}
   4313 	zone_hold(zone);	/* so we can use the zone_t later */
   4314 	mutex_exit(&zone_status_lock);
   4315 	mutex_exit(&zonehash_lock);
   4316 	resume_mounts();
   4317 
   4318 	if (error = zone_empty(zone)) {
   4319 		zone_rele(zone);
   4320 		return (set_errno(error));
   4321 	}
   4322 	/*
   4323 	 * After the zone status goes to ZONE_IS_DOWN this zone will no
   4324 	 * longer be notified of changes to the pools configuration, so
   4325 	 * in order to not end up with a stale pool pointer, we point
   4326 	 * ourselves at the default pool and remove all resource
   4327 	 * visibility.  This is especially important as the zone_t may
   4328 	 * languish on the deathrow for a very long time waiting for
   4329 	 * cred's to drain out.
   4330 	 *
   4331 	 * This rebinding of the zone can happen multiple times
   4332 	 * (presumably due to interrupted or parallel systemcalls)
   4333 	 * without any adverse effects.
   4334 	 */
   4335 	if (pool_lock_intr() != 0) {
   4336 		zone_rele(zone);
   4337 		return (set_errno(EINTR));
   4338 	}
   4339 	if (pool_state == POOL_ENABLED) {
   4340 		mutex_enter(&cpu_lock);
   4341 		zone_pool_set(zone, pool_default);
   4342 		/*
   4343 		 * The zone no longer needs to be able to see any cpus.
   4344 		 */
   4345 		zone_pset_set(zone, ZONE_PS_INVAL);
   4346 		mutex_exit(&cpu_lock);
   4347 	}
   4348 	pool_unlock();
   4349 
   4350 	/*
   4351 	 * ZSD shutdown callbacks can be executed multiple times, hence
   4352 	 * it is safe to not be holding any locks across this call.
   4353 	 */
   4354 	zone_zsd_callbacks(zone, ZSD_SHUTDOWN);
   4355 
   4356 	mutex_enter(&zone_status_lock);
   4357 	if (zone->zone_kthreads == NULL && zone_status_get(zone) < ZONE_IS_DOWN)
   4358 		zone_status_set(zone, ZONE_IS_DOWN);
   4359 	mutex_exit(&zone_status_lock);
   4360 
   4361 	/*
   4362 	 * Wait for kernel threads to drain.
   4363 	 */
   4364 	if (!zone_status_wait_sig(zone, ZONE_IS_DOWN)) {
   4365 		zone_rele(zone);
   4366 		return (set_errno(EINTR));
   4367 	}
   4368 
   4369 	/*
   4370 	 * Zone can be become down/destroyable even if the above wait
   4371 	 * returns EINTR, so any code added here may never execute.
   4372 	 * (i.e. don't add code here)
   4373 	 */
   4374 
   4375 	zone_rele(zone);
   4376 	return (0);
   4377 }
   4378 
   4379 /*
   4380  * Systemcall entry point to finalize the zone halt process.  The caller
   4381  * must have already successfully called zone_shutdown().
   4382  *
   4383  * Upon successful completion, the zone will have been fully destroyed:
   4384  * zsched will have exited, destructor callbacks executed, and the zone
   4385  * removed from the list of active zones.
   4386  */
   4387 static int
   4388 zone_destroy(zoneid_t zoneid)
   4389 {
   4390 	uint64_t uniqid;
   4391 	zone_t *zone;
   4392 	zone_status_t status;
   4393 
   4394 	if (secpolicy_zone_config(CRED()) != 0)
   4395 		return (set_errno(EPERM));
   4396 	if (zoneid < MIN_USERZONEID || zoneid > MAX_ZONEID)
   4397 		return (set_errno(EINVAL));
   4398 
   4399 	mutex_enter(&zonehash_lock);
   4400 	/*
   4401 	 * Look for zone under hash lock to prevent races with other
   4402 	 * calls to zone_destroy.
   4403 	 */
   4404 	if ((zone = zone_find_all_by_id(zoneid)) == NULL) {
   4405 		mutex_exit(&zonehash_lock);
   4406 		return (set_errno(EINVAL));
   4407 	}
   4408 
   4409 	if (zone_mount_count(zone->zone_rootpath) != 0) {
   4410 		mutex_exit(&zonehash_lock);
   4411 		return (set_errno(EBUSY));
   4412 	}
   4413 	mutex_enter(&zone_status_lock);
   4414 	status = zone_status_get(zone);
   4415 	if (status < ZONE_IS_DOWN) {
   4416 		mutex_exit(&zone_status_lock);
   4417 		mutex_exit(&zonehash_lock);
   4418 		return (set_errno(EBUSY));
   4419 	} else if (status == ZONE_IS_DOWN) {
   4420 		zone_status_set(zone, ZONE_IS_DYING); /* Tell zsched to exit */
   4421 	}
   4422 	mutex_exit(&zone_status_lock);
   4423 	zone_hold(zone);
   4424 	mutex_exit(&zonehash_lock);
   4425 
   4426 	/*
   4427 	 * wait for zsched to exit
   4428 	 */
   4429 	zone_status_wait(zone, ZONE_IS_DEAD);
   4430 	zone_zsd_callbacks(zone, ZSD_DESTROY);
   4431 	zone->zone_netstack = NULL;
   4432 	uniqid = zone->zone_uniqid;
   4433 	zone_rele(zone);
   4434 	zone = NULL;	/* potentially free'd */
   4435 
   4436 	mutex_enter(&zonehash_lock);
   4437 	for (; /* ever */; ) {
   4438 		boolean_t unref;
   4439 
   4440 		if ((zone = zone_find_all_by_id(zoneid)) == NULL ||
   4441 		    zone->zone_uniqid != uniqid) {
   4442 			/*
   4443 			 * The zone has gone away.  Necessary conditions
   4444 			 * are met, so we return success.
   4445 			 */
   4446 			mutex_exit(&zonehash_lock);
   4447 			return (0);
   4448 		}
   4449 		mutex_enter(&zone->zone_lock);
   4450 		unref = ZONE_IS_UNREF(zone);
   4451 		mutex_exit(&zone->zone_lock);
   4452 		if (unref) {
   4453 			/*
   4454 			 * There is only one reference to the zone -- that
   4455 			 * added when the zone was added to the hashtables --
   4456 			 * and things will remain this way until we drop
   4457 			 * zonehash_lock... we can go ahead and cleanup the
   4458 			 * zone.
   4459 			 */
   4460 			break;
   4461 		}
   4462 
   4463 		if (cv_wait_sig(&zone_destroy_cv, &zonehash_lock) == 0) {
   4464 			/* Signaled */
   4465 			mutex_exit(&zonehash_lock);
   4466 			return (set_errno(EINTR));
   4467 		}
   4468 
   4469 	}
   4470 
   4471 	/*
   4472 	 * Remove CPU cap for this zone now since we're not going to
   4473 	 * fail below this point.
   4474 	 */
   4475 	cpucaps_zone_remove(zone);
   4476 
   4477 	/* Get rid of the zone's kstats */
   4478 	zone_kstat_delete(zone);
   4479 
   4480 	/* free brand specific data */
   4481 	if (ZONE_IS_BRANDED(zone))
   4482 		ZBROP(zone)->b_free_brand_data(zone);
   4483 
   4484 	/* Say goodbye to brand framework. */
   4485 	brand_unregister_zone(zone->zone_brand);
   4486 
   4487 	/*
   4488 	 * It is now safe to let the zone be recreated; remove it from the
   4489 	 * lists.  The memory will not be freed until the last cred
   4490 	 * reference goes away.
   4491 	 */
   4492 	ASSERT(zonecount > 1);	/* must be > 1; can't destroy global zone */
   4493 	zonecount--;
   4494 	/* remove from active list and hash tables */
   4495 	list_remove(&zone_active, zone);
   4496 	(void) mod_hash_destroy(zonehashbyname,
   4497 	    (mod_hash_key_t)zone->zone_name);
   4498 	(void) mod_hash_destroy(zonehashbyid,
   4499 	    (mod_hash_key_t)(uintptr_t)zone->zone_id);
   4500 	if (zone->zone_flags & ZF_HASHED_LABEL)
   4501 		(void) mod_hash_destroy(zonehashbylabel,
   4502 		    (mod_hash_key_t)zone->zone_slabel);
   4503 	mutex_exit(&zonehash_lock);
   4504 
   4505 	/*
   4506 	 * Release the root vnode; we're not using it anymore.  Nor should any
   4507 	 * other thread that might access it exist.
   4508 	 */
   4509 	if (zone->zone_rootvp != NULL) {
   4510 		VN_RELE(zone->zone_rootvp);
   4511 		zone->zone_rootvp = NULL;
   4512 	}
   4513 
   4514 	/* add to deathrow list */
   4515 	mutex_enter(&zone_deathrow_lock);
   4516 	list_insert_tail(&zone_deathrow, zone);
   4517 	mutex_exit(&zone_deathrow_lock);
   4518 
   4519 	/*
   4520 	 * Drop last reference (which was added by zsched()), this will
   4521 	 * free the zone unless there are outstanding cred references.
   4522 	 */
   4523 	zone_rele(zone);
   4524 	return (0);
   4525 }
   4526 
   4527 /*
   4528  * Systemcall entry point for zone_getattr(2).
   4529  */
   4530 static ssize_t
   4531 zone_getattr(zoneid_t zoneid, int attr, void *buf, size_t bufsize)
   4532 {
   4533 	size_t size;
   4534 	int error = 0, err;
   4535 	zone_t *zone;
   4536 	char *zonepath;
   4537 	char *outstr;
   4538 	zone_status_t zone_status;
   4539 	pid_t initpid;
   4540 	boolean_t global = (curzone == global_zone);
   4541 	boolean_t inzone = (curzone->zone_id == zoneid);
   4542 	ushort_t flags;
   4543 
   4544 	mutex_enter(&zonehash_lock);
   4545 	if ((zone = zone_find_all_by_id(zoneid)) == NULL) {
   4546 		mutex_exit(&zonehash_lock);
   4547 		return (set_errno(EINVAL));
   4548 	}
   4549 	zone_status = zone_status_get(zone);
   4550 	if (zone_status < ZONE_IS_INITIALIZED) {
   4551 		mutex_exit(&zonehash_lock);
   4552 		return (set_errno(EINVAL));
   4553 	}
   4554 	zone_hold(zone);
   4555 	mutex_exit(&zonehash_lock);
   4556 
   4557 	/*
   4558 	 * If not in the global zone, don't show information about other zones,
   4559 	 * unless the system is labeled and the local zone's label dominates
   4560 	 * the other zone.
   4561 	 */
   4562 	if (!zone_list_access(zone)) {
   4563 		zone_rele(zone);
   4564 		return (set_errno(EINVAL));
   4565 	}
   4566 
   4567 	switch (attr) {
   4568 	case ZONE_ATTR_ROOT:
   4569 		if (global) {
   4570 			/*
   4571 			 * Copy the path to trim the trailing "/" (except for
   4572 			 * the global zone).
   4573 			 */
   4574 			if (zone != global_zone)
   4575 				size = zone->zone_rootpathlen - 1;
   4576 			else
   4577 				size = zone->zone_rootpathlen;
   4578 			zonepath = kmem_alloc(size, KM_SLEEP);
   4579 			bcopy(zone->zone_rootpath, zonepath, size);
   4580 			zonepath[size - 1] = '\0';
   4581 		} else {
   4582 			if (inzone || !is_system_labeled()) {
   4583 				/*
   4584 				 * Caller is not in the global zone.
   4585 				 * if the query is on the current zone
   4586 				 * or the system is not labeled,
   4587 				 * just return faked-up path for current zone.
   4588 				 */
   4589 				zonepath = "/";
   4590 				size = 2;
   4591 			} else {
   4592 				/*
   4593 				 * Return related path for current zone.
   4594 				 */
   4595 				int prefix_len = strlen(zone_prefix);
   4596 				int zname_len = strlen(zone->zone_name);
   4597 
   4598 				size = prefix_len + zname_len + 1;
   4599 				zonepath = kmem_alloc(size, KM_SLEEP);
   4600 				bcopy(zone_prefix, zonepath, prefix_len);
   4601 				bcopy(zone->zone_name, zonepath +
   4602 				    prefix_len, zname_len);
   4603 				zonepath[size - 1] = '\0';
   4604 			}
   4605 		}
   4606 		if (bufsize > size)
   4607 			bufsize = size;
   4608 		if (buf != NULL) {
   4609 			err = copyoutstr(zonepath, buf, bufsize, NULL);
   4610 			if (err != 0 && err != ENAMETOOLONG)
   4611 				error = EFAULT;
   4612 		}
   4613 		if (global || (is_system_labeled() && !inzone))
   4614 			kmem_free(zonepath, size);
   4615 		break;
   4616 
   4617 	case ZONE_ATTR_NAME:
   4618 		size = strlen(zone->zone_name) + 1;
   4619 		if (bufsize > size)
   4620 			bufsize = size;
   4621 		if (buf != NULL) {
   4622 			err = copyoutstr(zone->zone_name, buf, bufsize, NULL);
   4623 			if (err != 0 && err != ENAMETOOLONG)
   4624 				error = EFAULT;
   4625 		}
   4626 		break;
   4627 
   4628 	case ZONE_ATTR_STATUS:
   4629 		/*
   4630 		 * Since we're not holding zonehash_lock, the zone status
   4631 		 * may be anything; leave it up to userland to sort it out.
   4632 		 */
   4633 		size = sizeof (zone_status);
   4634 		if (bufsize > size)
   4635 			bufsize = size;
   4636 		zone_status = zone_status_get(zone);
   4637 		if (buf != NULL &&
   4638 		    copyout(&zone_status, buf, bufsize) != 0)
   4639 			error = EFAULT;
   4640 		break;
   4641 	case ZONE_ATTR_FLAGS:
   4642 		size = sizeof (zone->zone_flags);
   4643 		if (bufsize > size)
   4644 			bufsize = size;
   4645 		flags = zone->zone_flags;
   4646 		if (buf != NULL &&
   4647 		    copyout(&flags, buf, bufsize) != 0)
   4648 			error = EFAULT;
   4649 		break;
   4650 	case ZONE_ATTR_PRIVSET:
   4651 		size = sizeof (priv_set_t);
   4652 		if (bufsize > size)
   4653 			bufsize = size;
   4654 		if (buf != NULL &&
   4655 		    copyout(zone->zone_privset, buf, bufsize) != 0)
   4656 			error = EFAULT;
   4657 		break;
   4658 	case ZONE_ATTR_UNIQID:
   4659 		size = sizeof (zone->zone_uniqid);
   4660 		if (bufsize > size)
   4661 			bufsize = size;
   4662 		if (buf != NULL &&
   4663 		    copyout(&zone->zone_uniqid, buf, bufsize) != 0)
   4664 			error = EFAULT;
   4665 		break;
   4666 	case ZONE_ATTR_POOLID:
   4667 		{
   4668 			pool_t *pool;
   4669 			poolid_t poolid;
   4670 
   4671 			if (pool_lock_intr() != 0) {
   4672 				error = EINTR;
   4673 				break;
   4674 			}
   4675 			pool = zone_pool_get(zone);
   4676 			poolid = pool->pool_id;
   4677 			pool_unlock();
   4678 			size = sizeof (poolid);
   4679 			if (bufsize > size)
   4680 				bufsize = size;
   4681 			if (buf != NULL && copyout(&poolid, buf, size) != 0)
   4682 				error = EFAULT;
   4683 		}
   4684 		break;
   4685 	case ZONE_ATTR_SLBL:
   4686 		size = sizeof (bslabel_t);
   4687 		if (bufsize > size)
   4688 			bufsize = size;
   4689 		if (zone->zone_slabel == NULL)
   4690 			error = EINVAL;
   4691 		else if (buf != NULL &&
   4692 		    copyout(label2bslabel(zone->zone_slabel), buf,
   4693 		    bufsize) != 0)
   4694 			error = EFAULT;
   4695 		break;
   4696 	case ZONE_ATTR_INITPID:
   4697 		size = sizeof (initpid);
   4698 		if (bufsize > size)
   4699 			bufsize = size;
   4700 		initpid = zone->zone_proc_initpid;
   4701 		if (initpid == -1) {
   4702 			error = ESRCH;
   4703 			break;
   4704 		}
   4705 		if (buf != NULL &&
   4706 		    copyout(&initpid, buf, bufsize) != 0)
   4707 			error = EFAULT;
   4708 		break;
   4709 	case ZONE_ATTR_BRAND:
   4710 		size = strlen(zone->zone_brand->b_name) + 1;
   4711 
   4712 		if (bufsize > size)
   4713 			bufsize = size;
   4714 		if (buf != NULL) {
   4715 			err = copyoutstr(zone->zone_brand->b_name, buf,
   4716 			    bufsize, NULL);
   4717 			if (err != 0 && err != ENAMETOOLONG)
   4718 				error = EFAULT;
   4719 		}
   4720 		break;
   4721 	case ZONE_ATTR_INITNAME:
   4722 		size = strlen(zone->zone_initname) + 1;
   4723 		if (bufsize > size)
   4724 			bufsize = size;
   4725 		if (buf != NULL) {
   4726 			err = copyoutstr(zone->zone_initname, buf, bufsize,
   4727 			    NULL);
   4728 			if (err != 0 && err != ENAMETOOLONG)
   4729 				error = EFAULT;
   4730 		}
   4731 		break;
   4732 	case ZONE_ATTR_BOOTARGS:
   4733 		if (zone->zone_bootargs == NULL)
   4734 			outstr = "";
   4735 		else
   4736 			outstr = zone->zone_bootargs;
   4737 		size = strlen(outstr) + 1;
   4738 		if (bufsize > size)
   4739 			bufsize = size;
   4740 		if (buf != NULL) {
   4741 			err = copyoutstr(outstr, buf, bufsize, NULL);
   4742 			if (err != 0 && err != ENAMETOOLONG)
   4743 				error = EFAULT;
   4744 		}
   4745 		break;
   4746 	case ZONE_ATTR_PHYS_MCAP:
   4747 		size = sizeof (zone->zone_phys_mcap);
   4748 		if (bufsize > size)
   4749 			bufsize = size;
   4750 		if (buf != NULL &&
   4751 		    copyout(&zone->zone_phys_mcap, buf, bufsize) != 0)
   4752 			error = EFAULT;
   4753 		break;
   4754 	case ZONE_ATTR_SCHED_CLASS:
   4755 		mutex_enter(&class_lock);
   4756 
   4757 		if (zone->zone_defaultcid >= loaded_classes)
   4758 			outstr = "";
   4759 		else
   4760 			outstr = sclass[zone->zone_defaultcid].cl_name;
   4761 		size = strlen(outstr) + 1;
   4762 		if (bufsize > size)
   4763 			bufsize = size;
   4764 		if (buf != NULL) {
   4765 			err = copyoutstr(outstr, buf, bufsize, NULL);
   4766 			if (err != 0 && err != ENAMETOOLONG)
   4767 				error = EFAULT;
   4768 		}
   4769 
   4770 		mutex_exit(&class_lock);
   4771 		break;
   4772 	case ZONE_ATTR_HOSTID:
   4773 		if (zone->zone_hostid != HW_INVALID_HOSTID &&
   4774 		    bufsize == sizeof (zone->zone_hostid)) {
   4775 			size = sizeof (zone->zone_hostid);
   4776 			if (buf != NULL && copyout(&zone->zone_hostid, buf,
   4777 			    bufsize) != 0)
   4778 				error = EFAULT;
   4779 		} else {
   4780 			error = EINVAL;
   4781 		}
   4782 		break;
   4783 	default:
   4784 		if ((attr >= ZONE_ATTR_BRAND_ATTRS) && ZONE_IS_BRANDED(zone)) {
   4785 			size = bufsize;
   4786 			error = ZBROP(zone)->b_getattr(zone, attr, buf, &size);
   4787 		} else {
   4788 			error = EINVAL;
   4789 		}
   4790 	}
   4791 	zone_rele(zone);
   4792 
   4793 	if (error)
   4794 		return (set_errno(error));
   4795 	return ((ssize_t)size);
   4796 }
   4797 
   4798 /*
   4799  * Systemcall entry point for zone_setattr(2).
   4800  */
   4801 /*ARGSUSED*/
   4802 static int
   4803 zone_setattr(zoneid_t zoneid, int attr, void *buf, size_t bufsize)
   4804 {
   4805 	zone_t *zone;
   4806 	zone_status_t zone_status;
   4807 	int err;
   4808 
   4809 	if (secpolicy_zone_config(CRED()) != 0)
   4810 		return (set_errno(EPERM));
   4811 
   4812 	/*
   4813 	 * Only the ZONE_ATTR_PHYS_MCAP attribute can be set on the
   4814 	 * global zone.
   4815 	 */
   4816 	if (zoneid == GLOBAL_ZONEID && attr != ZONE_ATTR_PHYS_MCAP) {
   4817 		return (set_errno(EINVAL));
   4818 	}
   4819 
   4820 	mutex_enter(&zonehash_lock);
   4821 	if ((zone = zone_find_all_by_id(zoneid)) == NULL) {
   4822 		mutex_exit(&zonehash_lock);
   4823 		return (set_errno(EINVAL));
   4824 	}
   4825 	zone_hold(zone);
   4826 	mutex_exit(&zonehash_lock);
   4827 
   4828 	/*
   4829 	 * At present most attributes can only be set on non-running,
   4830 	 * non-global zones.
   4831 	 */
   4832 	zone_status = zone_status_get(zone);
   4833 	if (attr != ZONE_ATTR_PHYS_MCAP && zone_status > ZONE_IS_READY)
   4834 		goto done;
   4835 
   4836 	switch (attr) {
   4837 	case ZONE_ATTR_INITNAME:
   4838 		err = zone_set_initname(zone, (const char *)buf);
   4839 		break;
   4840 	case ZONE_ATTR_BOOTARGS:
   4841 		err = zone_set_bootargs(zone, (const char *)buf);
   4842 		break;
   4843 	case ZONE_ATTR_BRAND:
   4844 		err = zone_set_brand(zone, (const char *)buf);
   4845 		break;
   4846 	case ZONE_ATTR_PHYS_MCAP:
   4847 		err = zone_set_phys_mcap(zone, (const uint64_t *)buf);
   4848 		break;
   4849 	case ZONE_ATTR_SCHED_CLASS:
   4850 		err = zone_set_sched_class(zone, (const char *)buf);
   4851 		break;
   4852 	case ZONE_ATTR_HOSTID:
   4853 		if (bufsize == sizeof (zone->zone_hostid)) {
   4854 			if (copyin(buf, &zone->zone_hostid, bufsize) == 0)
   4855 				err = 0;
   4856 			else
   4857 				err = EFAULT;
   4858 		} else {
   4859 			err = EINVAL;
   4860 		}
   4861 		break;
   4862 	default:
   4863 		if ((attr >= ZONE_ATTR_BRAND_ATTRS) && ZONE_IS_BRANDED(zone))
   4864 			err = ZBROP(zone)->b_setattr(zone, attr, buf, bufsize);
   4865 		else
   4866 			err = EINVAL;
   4867 	}
   4868 
   4869 done:
   4870 	zone_rele(zone);
   4871 	return (err != 0 ? set_errno(err) : 0);
   4872 }
   4873 
   4874 /*
   4875  * Return zero if the process has at least one vnode mapped in to its
   4876  * address space which shouldn't be allowed to change zones.
   4877  *
   4878  * Also return zero if the process has any shared mappings which reserve
   4879  * swap.  This is because the counting for zone.max-swap does not allow swap
   4880  * reservation to be shared between zones.  zone swap reservation is counted
   4881  * on zone->zone_max_swap.
   4882  */
   4883 static int
   4884 as_can_change_zones(void)
   4885 {
   4886 	proc_t *pp = curproc;
   4887 	struct seg *seg;
   4888 	struct as *as = pp->p_as;
   4889 	vnode_t *vp;
   4890 	int allow = 1;
   4891 
   4892 	ASSERT(pp->p_as != &kas);
   4893 	AS_LOCK_ENTER(as, &as->a_lock, RW_READER);
   4894 	for (seg = AS_SEGFIRST(as); seg != NULL; seg = AS_SEGNEXT(as, seg)) {
   4895 
   4896 		/*
   4897 		 * Cannot enter zone with shared anon memory which
   4898 		 * reserves swap.  See comment above.
   4899 		 */
   4900 		if (seg_can_change_zones(seg) == B_FALSE) {
   4901 			allow = 0;
   4902 			break;
   4903 		}
   4904 		/*
   4905 		 * if we can't get a backing vnode for this segment then skip
   4906 		 * it.
   4907 		 */
   4908 		vp = NULL;
   4909 		if (SEGOP_GETVP(seg, seg->s_base, &vp) != 0 || vp == NULL)
   4910 			continue;
   4911 		if (!vn_can_change_zones(vp)) { /* bail on first match */
   4912 			allow = 0;
   4913 			break;
   4914 		}
   4915 	}
   4916 	AS_LOCK_EXIT(as, &as->a_lock);
   4917 	return (allow);
   4918 }
   4919 
   4920 /*
   4921  * Count swap reserved by curproc's address space
   4922  */
   4923 static size_t
   4924 as_swresv(void)
   4925 {
   4926 	proc_t *pp = curproc;
   4927 	struct seg *seg;
   4928 	struct as *as = pp->p_as;
   4929 	size_t swap = 0;
   4930 
   4931 	ASSERT(pp->p_as != &kas);
   4932 	ASSERT(AS_WRITE_HELD(as, &as->a_lock));
   4933 	for (seg = AS_SEGFIRST(as); seg != NULL; seg = AS_SEGNEXT(as, seg))
   4934 		swap += seg_swresv(seg);
   4935 
   4936 	return (swap);
   4937 }
   4938 
   4939 /*
   4940  * Systemcall entry point for zone_enter().
   4941  *
   4942  * The current process is injected into said zone.  In the process
   4943  * it will change its project membership, privileges, rootdir/cwd,
   4944  * zone-wide rctls, and pool association to match those of the zone.
   4945  *
   4946  * The first zone_enter() called while the zone is in the ZONE_IS_READY
   4947  * state will transition it to ZONE_IS_RUNNING.  Processes may only
   4948  * enter a zone that is "ready" or "running".
   4949  */
   4950 static int
   4951 zone_enter(zoneid_t zoneid)
   4952 {
   4953 	zone_t *zone;
   4954 	vnode_t *vp;
   4955 	proc_t *pp = curproc;
   4956 	contract_t *ct;
   4957 	cont_process_t *ctp;
   4958 	task_t *tk, *oldtk;
   4959 	kproject_t *zone_proj0;
   4960 	cred_t *cr, *newcr;
   4961 	pool_t *oldpool, *newpool;
   4962 	sess_t *sp;
   4963 	uid_t uid;
   4964 	zone_status_t status;
   4965 	int err = 0;
   4966 	rctl_entity_p_t e;
   4967 	size_t swap;
   4968 	kthread_id_t t;
   4969 
   4970 	if (secpolicy_zone_config(CRED()) != 0)
   4971 		return (set_errno(EPERM));
   4972 	if (zoneid < MIN_USERZONEID || zoneid > MAX_ZONEID)
   4973 		return (set_errno(EINVAL));
   4974 
   4975 	/*
   4976 	 * Stop all lwps so we don't need to hold a lock to look at
   4977 	 * curproc->p_zone.  This needs to happen before we grab any
   4978 	 * locks to avoid deadlock (another lwp in the process could
   4979 	 * be waiting for the held lock).
   4980 	 */
   4981 	if (curthread != pp->p_agenttp && !holdlwps(SHOLDFORK))
   4982 		return (set_errno(EINTR));
   4983 
   4984 	/*
   4985 	 * Make sure we're not changing zones with files open or mapped in
   4986 	 * to our address space which shouldn't be changing zones.
   4987 	 */
   4988 	if (!files_can_change_zones()) {
   4989 		err = EBADF;
   4990 		goto out;
   4991 	}
   4992 	if (!as_can_change_zones()) {
   4993 		err = EFAULT;
   4994 		goto out;
   4995 	}
   4996 
   4997 	mutex_enter(&zonehash_lock);
   4998 	if (pp->p_zone != global_zone) {
   4999 		mutex_exit(&zonehash_lock);
   5000 		err = EINVAL;
   5001 		goto out;
   5002 	}
   5003 
   5004 	zone = zone_find_all_by_id(zoneid);
   5005 	if (zone == NULL) {
   5006 		mutex_exit(&zonehash_lock);
   5007 		err = EINVAL;
   5008 		goto out;
   5009 	}
   5010 
   5011 	/*
   5012 	 * To prevent processes in a zone from holding contracts on
   5013 	 * extrazonal resources, and to avoid process contract
   5014 	 * memberships which span zones, contract holders and processes
   5015 	 * which aren't the sole members of their encapsulating process
   5016 	 * contracts are not allowed to zone_enter.
   5017 	 */
   5018 	ctp = pp->p_ct_process;
   5019 	ct = &ctp->conp_contract;
   5020 	mutex_enter(&ct->ct_lock);
   5021 	mutex_enter(&pp->p_lock);
   5022 	if ((avl_numnodes(&pp->p_ct_held) != 0) || (ctp->conp_nmembers != 1)) {
   5023 		mutex_exit(&pp->p_lock);
   5024 		mutex_exit(&ct->ct_lock);
   5025 		mutex_exit(&zonehash_lock);
   5026 		err = EINVAL;
   5027 		goto out;
   5028 	}
   5029 
   5030 	/*
   5031 	 * Moreover, we don't allow processes whose encapsulating
   5032 	 * process contracts have inherited extrazonal contracts.
   5033 	 * While it would be easier to eliminate all process contracts
   5034 	 * with inherited contracts, we need to be able to give a
   5035 	 * restarted init (or other zone-penetrating process) its
   5036 	 * predecessor's contracts.
   5037 	 */
   5038 	if (ctp->conp_ninherited != 0) {
   5039 		contract_t *next;
   5040 		for (next = list_head(&ctp->conp_inherited); next;
   5041 		    next = list_next(&ctp->conp_inherited, next)) {
   5042 			if (contract_getzuniqid(next) != zone->zone_uniqid) {
   5043 				mutex_exit(&pp->p_lock);
   5044 				mutex_exit(&ct->ct_lock);
   5045 				mutex_exit(&zonehash_lock);
   5046 				err = EINVAL;
   5047 				goto out;
   5048 			}
   5049 		}
   5050 	}
   5051 
   5052 	mutex_exit(&pp->p_lock);
   5053 	mutex_exit(&ct->ct_lock);
   5054 
   5055 	status = zone_status_get(zone);
   5056 	if (status < ZONE_IS_READY || status >= ZONE_IS_SHUTTING_DOWN) {
   5057 		/*
   5058 		 * Can't join
   5059 		 */
   5060 		mutex_exit(&zonehash_lock);
   5061 		err = EINVAL;
   5062 		goto out;
   5063 	}
   5064 
   5065 	/*
   5066 	 * Make sure new priv set is within the permitted set for caller
   5067 	 */
   5068 	if (!priv_issubset(zone->zone_privset, &CR_OPPRIV(CRED()))) {
   5069 		mutex_exit(&zonehash_lock);
   5070 		err = EPERM;
   5071 		goto out;
   5072 	}
   5073 	/*
   5074 	 * We want to momentarily drop zonehash_lock while we optimistically
   5075 	 * bind curproc to the pool it should be running in.  This is safe
   5076 	 * since the zone can't disappear (we have a hold on it).
   5077 	 */
   5078 	zone_hold(zone);
   5079 	mutex_exit(&zonehash_lock);
   5080 
   5081 	/*
   5082 	 * Grab pool_lock to keep the pools configuration from changing
   5083 	 * and to stop ourselves from getting rebound to another pool
   5084 	 * until we join the zone.
   5085 	 */
   5086 	if (pool_lock_intr() != 0) {
   5087 		zone_rele(zone);
   5088 		err = EINTR;
   5089 		goto out;
   5090 	}
   5091 	ASSERT(secpolicy_pool(CRED()) == 0);
   5092 	/*
   5093 	 * Bind ourselves to the pool currently associated with the zone.
   5094 	 */
   5095 	oldpool = curproc->p_pool;
   5096 	newpool = zone_pool_get(zone);
   5097 	if (pool_state == POOL_ENABLED && newpool != oldpool &&
   5098 	    (err = pool_do_bind(newpool, P_PID, P_MYID,
   5099 	    POOL_BIND_ALL)) != 0) {
   5100 		pool_unlock();
   5101 		zone_rele(zone);
   5102 		goto out;
   5103 	}
   5104 
   5105 	/*
   5106 	 * Grab cpu_lock now; we'll need it later when we call
   5107 	 * task_join().
   5108 	 */
   5109 	mutex_enter(&cpu_lock);
   5110 	mutex_enter(&zonehash_lock);
   5111 	/*
   5112 	 * Make sure the zone hasn't moved on since we dropped zonehash_lock.
   5113 	 */
   5114 	if (zone_status_get(zone) >= ZONE_IS_SHUTTING_DOWN) {
   5115 		/*
   5116 		 * Can't join anymore.
   5117 		 */
   5118 		mutex_exit(&zonehash_lock);
   5119 		mutex_exit(&cpu_lock);
   5120 		if (pool_state == POOL_ENABLED &&
   5121 		    newpool != oldpool)
   5122 			(void) pool_do_bind(oldpool, P_PID, P_MYID,
   5123 			    POOL_BIND_ALL);
   5124 		pool_unlock();
   5125 		zone_rele(zone);
   5126 		err = EINVAL;
   5127 		goto out;
   5128 	}
   5129 
   5130 	/*
   5131 	 * a_lock must be held while transfering locked memory and swap
   5132 	 * reservation from the global zone to the non global zone because
   5133 	 * asynchronous faults on the processes' address space can lock
   5134 	 * memory and reserve swap via MCL_FUTURE and MAP_NORESERVE
   5135 	 * segments respectively.
   5136 	 */
   5137 	AS_LOCK_ENTER(pp->as, &pp->p_as->a_lock, RW_WRITER);
   5138 	swap = as_swresv();
   5139 	mutex_enter(&pp->p_lock);
   5140 	zone_proj0 = zone->zone_zsched->p_task->tk_proj;
   5141 	/* verify that we do not exceed and task or lwp limits */
   5142 	mutex_enter(&zone->zone_nlwps_lock);
   5143 	/* add new lwps to zone and zone's proj0 */
   5144 	zone_proj0->kpj_nlwps += pp->p_lwpcnt;
   5145 	zone->zone_nlwps += pp->p_lwpcnt;
   5146 	/* add 1 task to zone's proj0 */
   5147 	zone_proj0->kpj_ntasks += 1;
   5148 	mutex_exit(&zone->zone_nlwps_lock);
   5149 
   5150 	mutex_enter(&zone->zone_mem_lock);
   5151 	zone->zone_locked_mem += pp->p_locked_mem;
   5152 	zone_proj0->kpj_data.kpd_locked_mem += pp->p_locked_mem;
   5153 	zone->zone_max_swap += swap;
   5154 	mutex_exit(&zone->zone_mem_lock);
   5155 
   5156 	mutex_enter(&(zone_proj0->kpj_data.kpd_crypto_lock));
   5157 	zone_proj0->kpj_data.kpd_crypto_mem += pp->p_crypto_mem;
   5158 	mutex_exit(&(zone_proj0->kpj_data.kpd_crypto_lock));
   5159 
   5160 	/* remove lwps from proc's old zone and old project */
   5161 	mutex_enter(&pp->p_zone->zone_nlwps_lock);
   5162 	pp->p_zone->zone_nlwps -= pp->p_lwpcnt;
   5163 	pp->p_task->tk_proj->kpj_nlwps -= pp->p_lwpcnt;
   5164 	mutex_exit(&pp->p_zone->zone_nlwps_lock);
   5165 
   5166 	mutex_enter(&pp->p_zone->zone_mem_lock);
   5167 	pp->p_zone->zone_locked_mem -= pp->p_locked_mem;
   5168 	pp->p_task->tk_proj->kpj_data.kpd_locked_mem -= pp->p_locked_mem;
   5169 	pp->p_zone->zone_max_swap -= swap;
   5170 	mutex_exit(&pp->p_zone->zone_mem_lock);
   5171 
   5172 	mutex_enter(&(pp->p_task->tk_proj->kpj_data.kpd_crypto_lock));
   5173 	pp->p_task->tk_proj->kpj_data.kpd_crypto_mem -= pp->p_crypto_mem;
   5174 	mutex_exit(&(pp->p_task->tk_proj->kpj_data.kpd_crypto_lock));
   5175 
   5176 	pp->p_flag |= SZONETOP;
   5177 	pp->p_zone = zone;
   5178 	mutex_exit(&pp->p_lock);
   5179 	AS_LOCK_EXIT(pp->p_as, &pp->p_as->a_lock);
   5180 
   5181 	/*
   5182 	 * Joining the zone cannot fail from now on.
   5183 	 *
   5184 	 * This means that a lot of the following code can be commonized and
   5185 	 * shared with zsched().
   5186 	 */
   5187 
   5188 	/*
   5189 	 * If the process contract fmri was inherited, we need to
   5190 	 * flag this so that any contract status will not leak
   5191 	 * extra zone information, svc_fmri in this case
   5192 	 */
   5193 	if (ctp->conp_svc_ctid != ct->ct_id) {
   5194 		mutex_enter(&ct->ct_lock);
   5195 		ctp->conp_svc_zone_enter = ct->ct_id;
   5196 		mutex_exit(&ct->ct_lock);
   5197 	}
   5198 
   5199 	/*
   5200 	 * Reset the encapsulating process contract's zone.
   5201 	 */
   5202 	ASSERT(ct->ct_mzuniqid == GLOBAL_ZONEUNIQID);
   5203 	contract_setzuniqid(ct, zone->zone_uniqid);
   5204 
   5205 	/*
   5206 	 * Create a new task and associate the process with the project keyed
   5207 	 * by (projid,zoneid).
   5208 	 *
   5209 	 * We might as well be in project 0; the global zone's projid doesn't
   5210 	 * make much sense in a zone anyhow.
   5211 	 *
   5212 	 * This also increments zone_ntasks, and returns with p_lock held.
   5213 	 */
   5214 	tk = task_create(0, zone);
   5215 	oldtk = task_join(tk, 0);
   5216 	mutex_exit(&cpu_lock);
   5217 
   5218 	/*
   5219 	 * call RCTLOP_SET functions on this proc
   5220 	 */
   5221 	e.rcep_p.zone = zone;
   5222 	e.rcep_t = RCENTITY_ZONE;
   5223 	(void) rctl_set_dup(NULL, NULL, pp, &e, zone->zone_rctls, NULL,
   5224 	    RCD_CALLBACK);
   5225 	mutex_exit(&pp->p_lock);
   5226 
   5227 	/*
   5228 	 * We don't need to hold any of zsched's locks here; not only do we know
   5229 	 * the process and zone aren't going away, we know its session isn't
   5230 	 * changing either.
   5231 	 *
   5232 	 * By joining zsched's session here, we mimic the behavior in the
   5233 	 * global zone of init's sid being the pid of sched.  We extend this
   5234 	 * to all zlogin-like zone_enter()'ing processes as well.
   5235 	 */
   5236 	mutex_enter(&pidlock);
   5237 	sp = zone->zone_zsched->p_sessp;
   5238 	sess_hold(zone->zone_zsched);
   5239 	mutex_enter(&pp->p_lock);
   5240 	pgexit(pp);
   5241 	sess_rele(pp->p_sessp, B_TRUE);
   5242 	pp->p_sessp = sp;
   5243 	pgjoin(pp, zone->zone_zsched->p_pidp);
   5244 
   5245 	/*
   5246 	 * If any threads are scheduled to be placed on zone wait queue they
   5247 	 * should abandon the idea since the wait queue is changing.
   5248 	 * We need to be holding pidlock & p_lock to do this.
   5249 	 */
   5250 	if ((t = pp->p_tlist) != NULL) {
   5251 		do {
   5252 			thread_lock(t);
   5253 			/*
   5254 			 * Kick this thread so that he doesn't sit
   5255 			 * on a wrong wait queue.
   5256 			 */
   5257 			if (ISWAITING(t))
   5258 				setrun_locked(t);
   5259 
   5260 			if (t->t_schedflag & TS_ANYWAITQ)
   5261 				t->t_schedflag &= ~ TS_ANYWAITQ;
   5262 
   5263 			thread_unlock(t);
   5264 		} while ((t = t->t_forw) != pp->p_tlist);
   5265 	}
   5266 
   5267 	/*
   5268 	 * If there is a default scheduling class for the zone and it is not
   5269 	 * the class we are currently in, change all of the threads in the
   5270 	 * process to the new class.  We need to be holding pidlock & p_lock
   5271 	 * when we call parmsset so this is a good place to do it.
   5272 	 */
   5273 	if (zone->zone_defaultcid > 0 &&
   5274 	    zone->zone_defaultcid != curthread->t_cid) {
   5275 		pcparms_t pcparms;
   5276 
   5277 		pcparms.pc_cid = zone->zone_defaultcid;
   5278 		pcparms.pc_clparms[0] = 0;
   5279 
   5280 		/*
   5281 		 * If setting the class fails, we still want to enter the zone.
   5282 		 */
   5283 		if ((t = pp->p_tlist) != NULL) {
   5284 			do {
   5285 				(void) parmsset(&pcparms, t);
   5286 			} while ((t = t->t_forw) != pp->p_tlist);
   5287 		}
   5288 	}
   5289 
   5290 	mutex_exit(&pp->p_lock);
   5291 	mutex_exit(&pidlock);
   5292 
   5293 	mutex_exit(&zonehash_lock);
   5294 	/*
   5295 	 * We're firmly in the zone; let pools progress.
   5296 	 */
   5297 	pool_unlock();
   5298 	task_rele(oldtk);
   5299 	/*
   5300 	 * We don't need to retain a hold on the zone since we already
   5301 	 * incremented zone_ntasks, so the zone isn't going anywhere.
   5302 	 */
   5303 	zone_rele(zone);
   5304 
   5305 	/*
   5306 	 * Chroot
   5307 	 */
   5308 	vp = zone->zone_rootvp;
   5309 	zone_chdir(vp, &PTOU(pp)->u_cdir, pp);
   5310 	zone_chdir(vp, &PTOU(pp)->u_rdir, pp);
   5311 
   5312 	/*
   5313 	 * Change process credentials
   5314 	 */
   5315 	newcr = cralloc();
   5316 	mutex_enter(&pp->p_crlock);
   5317 	cr = pp->p_cred;
   5318 	crcopy_to(cr, newcr);
   5319 	crsetzone(newcr, zone);
   5320 	pp->p_cred = newcr;
   5321 
   5322 	/*
   5323 	 * Restrict all process privilege sets to zone limit
   5324 	 */
   5325 	priv_intersect(zone->zone_privset, &CR_PPRIV(newcr));
   5326 	priv_intersect(zone->zone_privset, &CR_EPRIV(newcr));
   5327 	priv_intersect(zone->zone_privset, &CR_IPRIV(newcr));
   5328 	priv_intersect(zone->zone_privset, &CR_LPRIV(newcr));
   5329 	mutex_exit(&pp->p_crlock);
   5330 	crset(pp, newcr);
   5331 
   5332 	/*
   5333 	 * Adjust upcount to reflect zone entry.
   5334 	 */
   5335 	uid = crgetruid(newcr);
   5336 	mutex_enter(&pidlock);
   5337 	upcount_dec(uid, GLOBAL_ZONEID);
   5338 	upcount_inc(uid, zoneid);
   5339 	mutex_exit(&pidlock);
   5340 
   5341 	/*
   5342 	 * Set up core file path and content.
   5343 	 */
   5344 	set_core_defaults();
   5345 
   5346 out:
   5347 	/*
   5348 	 * Let the other lwps continue.
   5349 	 */
   5350 	mutex_enter(&pp->p_lock);
   5351 	if (curthread != pp->p_agenttp)
   5352 		continuelwps(pp);
   5353 	mutex_exit(&pp->p_lock);
   5354 
   5355 	return (err != 0 ? set_errno(err) : 0);
   5356 }
   5357 
   5358 /*
   5359  * Systemcall entry point for zone_list(2).
   5360  *
   5361  * Processes running in a (non-global) zone only see themselves.
   5362  * On labeled systems, they see all zones whose label they dominate.
   5363  */
   5364 static int
   5365 zone_list(zoneid_t *zoneidlist, uint_t *numzones)
   5366 {
   5367 	zoneid_t *zoneids;
   5368 	zone_t *zone, *myzone;
   5369 	uint_t user_nzones, real_nzones;
   5370 	uint_t domi_nzones;
   5371 	int error;
   5372 
   5373 	if (copyin(numzones, &user_nzones, sizeof (uint_t)) != 0)
   5374 		return (set_errno(EFAULT));
   5375 
   5376 	myzone = curproc->p_zone;
   5377 	if (myzone != global_zone) {
   5378 		bslabel_t *mybslab;
   5379 
   5380 		if (!is_system_labeled()) {
   5381 			/* just return current zone */
   5382 			real_nzones = domi_nzones = 1;
   5383 			zoneids = kmem_alloc(sizeof (zoneid_t), KM_SLEEP);
   5384 			zoneids[0] = myzone->zone_id;
   5385 		} else {
   5386 			/* return all zones that are dominated */
   5387 			mutex_enter(&zonehash_lock);
   5388 			real_nzones = zonecount;
   5389 			domi_nzones = 0;
   5390 			if (real_nzones > 0) {
   5391 				zoneids = kmem_alloc(real_nzones *
   5392 				    sizeof (zoneid_t), KM_SLEEP);
   5393 				mybslab = label2bslabel(myzone->zone_slabel);
   5394 				for (zone = list_head(&zone_active);
   5395 				    zone != NULL;
   5396 				    zone = list_next(&zone_active, zone)) {
   5397 					if (zone->zone_id == GLOBAL_ZONEID)
   5398 						continue;
   5399 					if (zone != myzone &&
   5400 					    (zone->zone_flags & ZF_IS_SCRATCH))
   5401 						continue;
   5402 					/*
   5403 					 * Note that a label always dominates
   5404 					 * itself, so myzone is always included
   5405 					 * in the list.
   5406 					 */
   5407 					if (bldominates(mybslab,
   5408 					    label2bslabel(zone->zone_slabel))) {
   5409 						zoneids[domi_nzones++] =
   5410 						    zone->zone_id;
   5411 					}
   5412 				}
   5413 			}
   5414 			mutex_exit(&zonehash_lock);
   5415 		}
   5416 	} else {
   5417 		mutex_enter(&zonehash_lock);
   5418 		real_nzones = zonecount;
   5419 		domi_nzones = 0;
   5420 		if (real_nzones > 0) {
   5421 			zoneids = kmem_alloc(real_nzones * sizeof (zoneid_t),
   5422 			    KM_SLEEP);
   5423 			for (zone = list_head(&zone_active); zone != NULL;
   5424 			    zone = list_next(&zone_active, zone))
   5425 				zoneids[domi_nzones++] = zone->zone_id;
   5426 			ASSERT(domi_nzones == real_nzones);
   5427 		}
   5428 		mutex_exit(&zonehash_lock);
   5429 	}
   5430 
   5431 	/*
   5432 	 * If user has allocated space for fewer entries than we found, then
   5433 	 * return only up to his limit.  Either way, tell him exactly how many
   5434 	 * we found.
   5435 	 */
   5436 	if (domi_nzones < user_nzones)
   5437 		user_nzones = domi_nzones;
   5438 	error = 0;
   5439 	if (copyout(&domi_nzones, numzones, sizeof (uint_t)) != 0) {
   5440 		error = EFAULT;
   5441 	} else if (zoneidlist != NULL && user_nzones != 0) {
   5442 		if (copyout(zoneids, zoneidlist,
   5443 		    user_nzones * sizeof (zoneid_t)) != 0)
   5444 			error = EFAULT;
   5445 	}
   5446 
   5447 	if (real_nzones > 0)
   5448 		kmem_free(zoneids, real_nzones * sizeof (zoneid_t));
   5449 
   5450 	if (error != 0)
   5451 		return (set_errno(error));
   5452 	else
   5453 		return (0);
   5454 }
   5455 
   5456 /*
   5457  * Systemcall entry point for zone_lookup(2).
   5458  *
   5459  * Non-global zones are only able to see themselves and (on labeled systems)
   5460  * the zones they dominate.
   5461  */
   5462 static zoneid_t
   5463 zone_lookup(const char *zone_name)
   5464 {
   5465 	char *kname;
   5466 	zone_t *zone;
   5467 	zoneid_t zoneid;
   5468 	int err;
   5469 
   5470 	if (zone_name == NULL) {
   5471 		/* return caller's zone id */
   5472 		return (getzoneid());
   5473 	}
   5474 
   5475 	kname = kmem_zalloc(ZONENAME_MAX, KM_SLEEP);
   5476 	if ((err = copyinstr(zone_name, kname, ZONENAME_MAX, NULL)) != 0) {
   5477 		kmem_free(kname, ZONENAME_MAX);
   5478 		return (set_errno(err));
   5479 	}
   5480 
   5481 	mutex_enter(&zonehash_lock);
   5482 	zone = zone_find_all_by_name(kname);
   5483 	kmem_free(kname, ZONENAME_MAX);
   5484 	/*
   5485 	 * In a non-global zone, can only lookup global and own name.
   5486 	 * In Trusted Extensions zone label dominance rules apply.
   5487 	 */
   5488 	if (zone == NULL ||
   5489 	    zone_status_get(zone) < ZONE_IS_READY ||
   5490 	    !zone_list_access(zone)) {
   5491 		mutex_exit(&zonehash_lock);
   5492 		return (set_errno(EINVAL));
   5493 	} else {
   5494 		zoneid = zone->zone_id;
   5495 		mutex_exit(&zonehash_lock);
   5496 		return (zoneid);
   5497 	}
   5498 }
   5499 
   5500 static int
   5501 zone_version(int *version_arg)
   5502 {
   5503 	int version = ZONE_SYSCALL_API_VERSION;
   5504 
   5505 	if (copyout(&version, version_arg, sizeof (int)) != 0)
   5506 		return (set_errno(EFAULT));
   5507 	return (0);
   5508 }
   5509 
   5510 /* ARGSUSED */
   5511 long
   5512 zone(int cmd, void *arg1, void *arg2, void *arg3, void *arg4)
   5513 {
   5514 	zone_def zs;
   5515 	int err;
   5516 
   5517 	switch (cmd) {
   5518 	case ZONE_CREATE:
   5519 		if (get_udatamodel() == DATAMODEL_NATIVE) {
   5520 			if (copyin(arg1, &zs, sizeof (zone_def))) {
   5521 				return (set_errno(EFAULT));
   5522 			}
   5523 		} else {
   5524 #ifdef _SYSCALL32_IMPL
   5525 			zone_def32 zs32;
   5526 
   5527 			if (copyin(arg1, &zs32, sizeof (zone_def32))) {
   5528 				return (set_errno(EFAULT));
   5529 			}
   5530 			zs.zone_name =
   5531 			    (const char *)(unsigned long)zs32.zone_name;
   5532 			zs.zone_root =
   5533 			    (const char *)(unsigned long)zs32.zone_root;
   5534 			zs.zone_privs =
   5535 			    (const struct priv_set *)
   5536 			    (unsigned long)zs32.zone_privs;
   5537 			zs.zone_privssz = zs32.zone_privssz;
   5538 			zs.rctlbuf = (caddr_t)(unsigned long)zs32.rctlbuf;
   5539 			zs.rctlbufsz = zs32.rctlbufsz;
   5540 			zs.zfsbuf = (caddr_t)(unsigned long)zs32.zfsbuf;
   5541 			zs.zfsbufsz = zs32.zfsbufsz;
   5542 			zs.extended_error =
   5543 			    (int *)(unsigned long)zs32.extended_error;
   5544 			zs.match = zs32.match;
   5545 			zs.doi = zs32.doi;
   5546 			zs.label = (const bslabel_t *)(uintptr_t)zs32.label;
   5547 			zs.flags = zs32.flags;
   5548 #else
   5549 			panic("get_udatamodel() returned bogus result\n");
   5550 #endif
   5551 		}
   5552 
   5553 		return (zone_create(zs.zone_name, zs.zone_root,
   5554 		    zs.zone_privs, zs.zone_privssz,
   5555 		    (caddr_t)zs.rctlbuf, zs.rctlbufsz,
   5556 		    (caddr_t)zs.zfsbuf, zs.zfsbufsz,
   5557 		    zs.extended_error, zs.match, zs.doi,
   5558 		    zs.label, zs.flags));
   5559 	case ZONE_BOOT:
   5560 		return (zone_boot((zoneid_t)(uintptr_t)arg1));
   5561 	case ZONE_DESTROY:
   5562 		return (zone_destroy((zoneid_t)(uintptr_t)arg1));
   5563 	case ZONE_GETATTR:
   5564 		return (zone_getattr((zoneid_t)(uintptr_t)arg1,
   5565 		    (int)(uintptr_t)arg2, arg3, (size_t)arg4));
   5566 	case ZONE_SETATTR:
   5567 		return (zone_setattr((zoneid_t)(uintptr_t)arg1,
   5568 		    (int)(uintptr_t)arg2, arg3, (size_t)arg4));
   5569 	case ZONE_ENTER:
   5570 		return (zone_enter((zoneid_t)(uintptr_t)arg1));
   5571 	case ZONE_LIST:
   5572 		return (zone_list((zoneid_t *)arg1, (uint_t *)arg2));
   5573 	case ZONE_SHUTDOWN:
   5574 		return (zone_shutdown((zoneid_t)(uintptr_t)arg1));
   5575 	case ZONE_LOOKUP:
   5576 		return (zone_lookup((const char *)arg1));
   5577 	case ZONE_VERSION:
   5578 		return (zone_version((int *)arg1));
   5579 	case ZONE_ADD_DATALINK:
   5580 		return (zone_add_datalink((zoneid_t)(uintptr_t)arg1,
   5581 		    (datalink_id_t)(uintptr_t)arg2));
   5582 	case ZONE_DEL_DATALINK:
   5583 		return (zone_remove_datalink((zoneid_t)(uintptr_t)arg1,
   5584 		    (datalink_id_t)(uintptr_t)arg2));
   5585 	case ZONE_CHECK_DATALINK: {
   5586 		zoneid_t	zoneid;
   5587 		boolean_t	need_copyout;
   5588 
   5589 		if (copyin(arg1, &zoneid, sizeof (zoneid)) != 0)
   5590 			return (EFAULT);
   5591 		need_copyout = (zoneid == ALL_ZONES);
   5592 		err = zone_check_datalink(&zoneid,
   5593 		    (datalink_id_t)(uintptr_t)arg2);
   5594 		if (err == 0 && need_copyout) {
   5595 			if (copyout(&zoneid, arg1, sizeof (zoneid)) != 0)
   5596 				err = EFAULT;
   5597 		}
   5598 		return (err == 0 ? 0 : set_errno(err));
   5599 	}
   5600 	case ZONE_LIST_DATALINK:
   5601 		return (zone_list_datalink((zoneid_t)(uintptr_t)arg1,
   5602 		    (int *)arg2, (datalink_id_t *)(uintptr_t)arg3));
   5603 	default:
   5604 		return (set_errno(EINVAL));
   5605 	}
   5606 }
   5607 
   5608 struct zarg {
   5609 	zone_t *zone;
   5610 	zone_cmd_arg_t arg;
   5611 };
   5612 
   5613 static int
   5614 zone_lookup_door(const char *zone_name, door_handle_t *doorp)
   5615 {
   5616 	char *buf;
   5617 	size_t buflen;
   5618 	int error;
   5619 
   5620 	buflen = sizeof (ZONE_DOOR_PATH) + strlen(zone_name);
   5621 	buf = kmem_alloc(buflen, KM_SLEEP);
   5622 	(void) snprintf(buf, buflen, ZONE_DOOR_PATH, zone_name);
   5623 	error = door_ki_open(buf, doorp);
   5624 	kmem_free(buf, buflen);
   5625 	return (error);
   5626 }
   5627 
   5628 static void
   5629 zone_release_door(door_handle_t *doorp)
   5630 {
   5631 	door_ki_rele(*doorp);
   5632 	*doorp = NULL;
   5633 }
   5634 
   5635 static void
   5636 zone_ki_call_zoneadmd(struct zarg *zargp)
   5637 {
   5638 	door_handle_t door = NULL;
   5639 	door_arg_t darg, save_arg;
   5640 	char *zone_name;
   5641 	size_t zone_namelen;
   5642 	zoneid_t zoneid;
   5643 	zone_t *zone;
   5644 	zone_cmd_arg_t arg;
   5645 	uint64_t uniqid;
   5646 	size_t size;
   5647 	int error;
   5648 	int retry;
   5649 
   5650 	zone = zargp->zone;
   5651 	arg = zargp->arg;
   5652 	kmem_free(zargp, sizeof (*zargp));
   5653 
   5654 	zone_namelen = strlen(zone->zone_name) + 1;
   5655 	zone_name = kmem_alloc(zone_namelen, KM_SLEEP);
   5656 	bcopy(zone->zone_name, zone_name, zone_namelen);
   5657 	zoneid = zone->zone_id;
   5658 	uniqid = zone->zone_uniqid;
   5659 	/*
   5660 	 * zoneadmd may be down, but at least we can empty out the zone.
   5661 	 * We can ignore the return value of zone_empty() since we're called
   5662 	 * from a kernel thread and know we won't be delivered any signals.
   5663 	 */
   5664 	ASSERT(curproc == &p0);
   5665 	(void) zone_empty(zone);
   5666 	ASSERT(zone_status_get(zone) >= ZONE_IS_EMPTY);
   5667 	zone_rele(zone);
   5668 
   5669 	size = sizeof (arg);
   5670 	darg.rbuf = (char *)&arg;
   5671 	darg.data_ptr = (char *)&arg;
   5672 	darg.rsize = size;
   5673 	darg.data_size = size;
   5674 	darg.desc_ptr = NULL;
   5675 	darg.desc_num = 0;
   5676 
   5677 	save_arg = darg;
   5678 	/*
   5679 	 * Since we're not holding a reference to the zone, any number of
   5680 	 * things can go wrong, including the zone disappearing before we get a
   5681 	 * chance to talk to zoneadmd.
   5682 	 */
   5683 	for (retry = 0; /* forever */; retry++) {
   5684 		if (door == NULL &&
   5685 		    (error = zone_lookup_door(zone_name, &door)) != 0) {
   5686 			goto next;
   5687 		}
   5688 		ASSERT(door != NULL);
   5689 
   5690 		if ((error = door_ki_upcall_limited(door, &darg, NULL,
   5691 		    SIZE_MAX, 0)) == 0) {
   5692 			break;
   5693 		}
   5694 		switch (error) {
   5695 		case EINTR:
   5696 			/* FALLTHROUGH */
   5697 		case EAGAIN:	/* process may be forking */
   5698 			/*
   5699 			 * Back off for a bit
   5700 			 */
   5701 			break;
   5702 		case EBADF:
   5703 			zone_release_door(&door);
   5704 			if (zone_lookup_door(zone_name, &door) != 0) {
   5705 				/*
   5706 				 * zoneadmd may be dead, but it may come back to
   5707 				 * life later.
   5708 				 */
   5709 				break;
   5710 			}
   5711 			break;
   5712 		default:
   5713 			cmn_err(CE_WARN,
   5714 			    "zone_ki_call_zoneadmd: door_ki_upcall error %d\n",
   5715 			    error);
   5716 			goto out;
   5717 		}
   5718 next:
   5719 		/*
   5720 		 * If this isn't the same zone_t that we originally had in mind,
   5721 		 * then this is the same as if two kadmin requests come in at
   5722 		 * the same time: the first one wins.  This means we lose, so we
   5723 		 * bail.
   5724 		 */
   5725 		if ((zone = zone_find_by_id(zoneid)) == NULL) {
   5726 			/*
   5727 			 * Problem is solved.
   5728 			 */
   5729 			break;
   5730 		}
   5731 		if (zone->zone_uniqid != uniqid) {
   5732 			/*
   5733 			 * zoneid recycled
   5734 			 */
   5735 			zone_rele(zone);
   5736 			break;
   5737 		}
   5738 		/*
   5739 		 * We could zone_status_timedwait(), but there doesn't seem to
   5740 		 * be much point in doing that (plus, it would mean that
   5741 		 * zone_free() isn't called until this thread exits).
   5742 		 */
   5743 		zone_rele(zone);
   5744 		delay(hz);
   5745 		darg = save_arg;
   5746 	}
   5747 out:
   5748 	if (door != NULL) {
   5749 		zone_release_door(&door);
   5750 	}
   5751 	kmem_free(zone_name, zone_namelen);
   5752 	thread_exit();
   5753 }
   5754 
   5755 /*
   5756  * Entry point for uadmin() to tell the zone to go away or reboot.  Analog to
   5757  * kadmin().  The caller is a process in the zone.
   5758  *
   5759  * In order to shutdown the zone, we will hand off control to zoneadmd
   5760  * (running in the global zone) via a door.  We do a half-hearted job at
   5761  * killing all processes in the zone, create a kernel thread to contact
   5762  * zoneadmd, and make note of the "uniqid" of the zone.  The uniqid is
   5763  * a form of generation number used to let zoneadmd (as well as
   5764  * zone_destroy()) know exactly which zone they're re talking about.
   5765  */
   5766 int
   5767 zone_kadmin(int cmd, int fcn, const char *mdep, cred_t *credp)
   5768 {
   5769 	struct zarg *zargp;
   5770 	zone_cmd_t zcmd;
   5771 	zone_t *zone;
   5772 
   5773 	zone = curproc->p_zone;
   5774 	ASSERT(getzoneid() != GLOBAL_ZONEID);
   5775 
   5776 	switch (cmd) {
   5777 	case A_SHUTDOWN:
   5778 		switch (fcn) {
   5779 		case AD_HALT:
   5780 		case AD_POWEROFF:
   5781 			zcmd = Z_HALT;
   5782 			break;
   5783 		case AD_BOOT:
   5784 			zcmd = Z_REBOOT;
   5785 			break;
   5786 		case AD_IBOOT:
   5787 		case AD_SBOOT:
   5788 		case AD_SIBOOT:
   5789 		case AD_NOSYNC:
   5790 			return (ENOTSUP);
   5791 		default:
   5792 			return (EINVAL);
   5793 		}
   5794 		break;
   5795 	case A_REBOOT:
   5796 		zcmd = Z_REBOOT;
   5797 		break;
   5798 	case A_FTRACE:
   5799 	case A_REMOUNT:
   5800 	case A_FREEZE:
   5801 	case A_DUMP:
   5802 	case A_CONFIG:
   5803 		return (ENOTSUP);
   5804 	default:
   5805 		ASSERT(cmd != A_SWAPCTL);	/* handled by uadmin() */
   5806 		return (EINVAL);
   5807 	}
   5808 
   5809 	if (secpolicy_zone_admin(credp, B_FALSE))
   5810 		return (EPERM);
   5811 	mutex_enter(&zone_status_lock);
   5812 
   5813 	/*
   5814 	 * zone_status can't be ZONE_IS_EMPTY or higher since curproc
   5815 	 * is in the zone.
   5816 	 */
   5817 	ASSERT(zone_status_get(zone) < ZONE_IS_EMPTY);
   5818 	if (zone_status_get(zone) > ZONE_IS_RUNNING) {
   5819 		/*
   5820 		 * This zone is already on its way down.
   5821 		 */
   5822 		mutex_exit(&zone_status_lock);
   5823 		return (0);
   5824 	}
   5825 	/*
   5826 	 * Prevent future zone_enter()s
   5827 	 */
   5828 	zone_status_set(zone, ZONE_IS_SHUTTING_DOWN);
   5829 	mutex_exit(&zone_status_lock);
   5830 
   5831 	/*
   5832 	 * Kill everyone now and call zoneadmd later.
   5833 	 * zone_ki_call_zoneadmd() will do a more thorough job of this
   5834 	 * later.
   5835 	 */
   5836 	killall(zone->zone_id);
   5837 	/*
   5838 	 * Now, create the thread to contact zoneadmd and do the rest of the
   5839 	 * work.  This thread can't be created in our zone otherwise
   5840 	 * zone_destroy() would deadlock.
   5841 	 */
   5842 	zargp = kmem_zalloc(sizeof (*zargp), KM_SLEEP);
   5843 	zargp->arg.cmd = zcmd;
   5844 	zargp->arg.uniqid = zone->zone_uniqid;
   5845 	zargp->zone = zone;
   5846 	(void) strcpy(zargp->arg.locale, "C");
   5847 	/* mdep was already copied in for us by uadmin */
   5848 	if (mdep != NULL)
   5849 		(void) strlcpy(zargp->arg.bootbuf, mdep,
   5850 		    sizeof (zargp->arg.bootbuf));
   5851 	zone_hold(zone);
   5852 
   5853 	(void) thread_create(NULL, 0, zone_ki_call_zoneadmd, zargp, 0, &p0,
   5854 	    TS_RUN, minclsyspri);
   5855 	exit(CLD_EXITED, 0);
   5856 
   5857 	return (EINVAL);
   5858 }
   5859 
   5860 /*
   5861  * Entry point so kadmin(A_SHUTDOWN, ...) can set the global zone's
   5862  * status to ZONE_IS_SHUTTING_DOWN.
   5863  *
   5864  * This function also shuts down all running zones to ensure that they won't
   5865  * fork new processes.
   5866  */
   5867 void
   5868 zone_shutdown_global(void)
   5869 {
   5870 	zone_t *current_zonep;
   5871 
   5872 	ASSERT(INGLOBALZONE(curproc));
   5873 	mutex_enter(&zonehash_lock);
   5874 	mutex_enter(&zone_status_lock);
   5875 
   5876 	/* Modify the global zone's status first. */
   5877 	ASSERT(zone_status_get(global_zone) == ZONE_IS_RUNNING);
   5878 	zone_status_set(global_zone, ZONE_IS_SHUTTING_DOWN);
   5879 
   5880 	/*
   5881 	 * Now change the states of all running zones to ZONE_IS_SHUTTING_DOWN.
   5882 	 * We don't mark all zones with ZONE_IS_SHUTTING_DOWN because doing so
   5883 	 * could cause assertions to fail (e.g., assertions about a zone's
   5884 	 * state during initialization, readying, or booting) or produce races.
   5885 	 * We'll let threads continue to initialize and ready new zones: they'll
   5886 	 * fail to boot the new zones when they see that the global zone is
   5887 	 * shutting down.
   5888 	 */
   5889 	for (current_zonep = list_head(&zone_active); current_zonep != NULL;
   5890 	    current_zonep = list_next(&zone_active, current_zonep)) {
   5891 		if (zone_status_get(current_zonep) == ZONE_IS_RUNNING)
   5892 			zone_status_set(current_zonep, ZONE_IS_SHUTTING_DOWN);
   5893 	}
   5894 	mutex_exit(&zone_status_lock);
   5895 	mutex_exit(&zonehash_lock);
   5896 }
   5897 
   5898 /*
   5899  * Returns true if the named dataset is visible in the current zone.
   5900  * The 'write' parameter is set to 1 if the dataset is also writable.
   5901  */
   5902 int
   5903 zone_dataset_visible(const char *dataset, int *write)
   5904 {
   5905 	zone_dataset_t *zd;
   5906 	size_t len;
   5907 	zone_t *zone = curproc->p_zone;
   5908 
   5909 	if (dataset[0] == '\0')
   5910 		return (0);
   5911 
   5912 	/*
   5913 	 * Walk the list once, looking for datasets which match exactly, or
   5914 	 * specify a dataset underneath an exported dataset.  If found, return
   5915 	 * true and note that it is writable.
   5916 	 */
   5917 	for (zd = list_head(&zone->zone_datasets); zd != NULL;
   5918 	    zd = list_next(&zone->zone_datasets, zd)) {
   5919 
   5920 		len = strlen(zd->zd_dataset);
   5921 		if (strlen(dataset) >= len &&
   5922 		    bcmp(dataset, zd->zd_dataset, len) == 0 &&
   5923 		    (dataset[len] == '\0' || dataset[len] == '/' ||
   5924 		    dataset[len] == '@')) {
   5925 			if (write)
   5926 				*write = 1;
   5927 			return (1);
   5928 		}
   5929 	}
   5930 
   5931 	/*
   5932 	 * Walk the list a second time, searching for datasets which are parents
   5933 	 * of exported datasets.  These should be visible, but read-only.
   5934 	 *
   5935 	 * Note that we also have to support forms such as 'pool/dataset/', with
   5936 	 * a trailing slash.
   5937 	 */
   5938 	for (zd = list_head(&zone->zone_datasets); zd != NULL;
   5939 	    zd = list_next(&zone->zone_datasets, zd)) {
   5940 
   5941 		len = strlen(dataset);
   5942 		if (dataset[len - 1] == '/')
   5943 			len--;	/* Ignore trailing slash */
   5944 		if (len < strlen(zd->zd_dataset) &&
   5945 		    bcmp(dataset, zd->zd_dataset, len) == 0 &&
   5946 		    zd->zd_dataset[len] == '/') {
   5947 			if (write)
   5948 				*write = 0;
   5949 			return (1);
   5950 		}
   5951 	}
   5952 
   5953 	return (0);
   5954 }
   5955 
   5956 /*
   5957  * zone_find_by_any_path() -
   5958  *
   5959  * kernel-private routine similar to zone_find_by_path(), but which
   5960  * effectively compares against zone paths rather than zonerootpath
   5961  * (i.e., the last component of zonerootpaths, which should be "root/",
   5962  * are not compared.)  This is done in order to accurately identify all
   5963  * paths, whether zone-visible or not, including those which are parallel
   5964  * to /root/, such as /dev/, /home/, etc...
   5965  *
   5966  * If the specified path does not fall under any zone path then global
   5967  * zone is returned.
   5968  *
   5969  * The treat_abs parameter indicates whether the path should be treated as
   5970  * an absolute path although it does not begin with "/".  (This supports
   5971  * nfs mount syntax such as host:any/path.)
   5972  *
   5973  * The caller is responsible for zone_rele of the returned zone.
   5974  */
   5975 zone_t *
   5976 zone_find_by_any_path(const char *path, boolean_t treat_abs)
   5977 {
   5978 	zone_t *zone;
   5979 	int path_offset = 0;
   5980 
   5981 	if (path == NULL) {
   5982 		zone_hold(global_zone);
   5983 		return (global_zone);
   5984 	}
   5985 
   5986 	if (*path != '/') {
   5987 		ASSERT(treat_abs);
   5988 		path_offset = 1;
   5989 	}
   5990 
   5991 	mutex_enter(&zonehash_lock);
   5992 	for (zone = list_head(&zone_active); zone != NULL;
   5993 	    zone = list_next(&zone_active, zone)) {
   5994 		char	*c;
   5995 		size_t	pathlen;
   5996 		char *rootpath_start;
   5997 
   5998 		if (zone == global_zone)	/* skip global zone */
   5999 			continue;
   6000 
   6001 		/* scan backwards to find start of last component */
   6002 		c = zone->zone_rootpath + zone->zone_rootpathlen - 2;
   6003 		do {
   6004 			c--;
   6005 		} while (*c != '/');
   6006 
   6007 		pathlen = c - zone->zone_rootpath + 1 - path_offset;
   6008 		rootpath_start = (zone->zone_rootpath + path_offset);
   6009 		if (strncmp(path, rootpath_start, pathlen) == 0)
   6010 			break;
   6011 	}
   6012 	if (zone == NULL)
   6013 		zone = global_zone;
   6014 	zone_hold(zone);
   6015 	mutex_exit(&zonehash_lock);
   6016 	return (zone);
   6017 }
   6018 
   6019 /*
   6020  * Finds a zone_dl_t with the given linkid in the given zone.  Returns the
   6021  * zone_dl_t pointer if found, and NULL otherwise.
   6022  */
   6023 static zone_dl_t *
   6024 zone_find_dl(zone_t *zone, datalink_id_t linkid)
   6025 {
   6026 	zone_dl_t *zdl;
   6027 
   6028 	ASSERT(mutex_owned(&zone->zone_lock));
   6029 	for (zdl = list_head(&zone->zone_dl_list); zdl != NULL;
   6030 	    zdl = list_next(&zone->zone_dl_list, zdl)) {
   6031 		if (zdl->zdl_id == linkid)
   6032 			break;
   6033 	}
   6034 	return (zdl);
   6035 }
   6036 
   6037 static boolean_t
   6038 zone_dl_exists(zone_t *zone, datalink_id_t linkid)
   6039 {
   6040 	boolean_t exists;
   6041 
   6042 	mutex_enter(&zone->zone_lock);
   6043 	exists = (zone_find_dl(zone, linkid) != NULL);
   6044 	mutex_exit(&zone->zone_lock);
   6045 	return (exists);
   6046 }
   6047 
   6048 /*
   6049  * Add an data link name for the zone.
   6050  */
   6051 static int
   6052 zone_add_datalink(zoneid_t zoneid, datalink_id_t linkid)
   6053 {
   6054 	zone_dl_t *zdl;
   6055 	zone_t *zone;
   6056 	zone_t *thiszone;
   6057 
   6058 	if ((thiszone = zone_find_by_id(zoneid)) == NULL)
   6059 		return (set_errno(ENXIO));
   6060 
   6061 	/* Verify that the datalink ID doesn't already belong to a zone. */
   6062 	mutex_enter(&zonehash_lock);
   6063 	for (zone = list_head(&zone_active); zone != NULL;
   6064 	    zone = list_next(&zone_active, zone)) {
   6065 		if (zone_dl_exists(zone, linkid)) {
   6066 			mutex_exit(&zonehash_lock);
   6067 			zone_rele(thiszone);
   6068 			return (set_errno((zone == thiszone) ? EEXIST : EPERM));
   6069 		}
   6070 	}
   6071 
   6072 	zdl = kmem_zalloc(sizeof (*zdl), KM_SLEEP);
   6073 	zdl->zdl_id = linkid;
   6074 	mutex_enter(&thiszone->zone_lock);
   6075 	list_insert_head(&thiszone->zone_dl_list, zdl);
   6076 	mutex_exit(&thiszone->zone_lock);
   6077 	mutex_exit(&zonehash_lock);
   6078 	zone_rele(thiszone);
   6079 	return (0);
   6080 }
   6081 
   6082 static int
   6083 zone_remove_datalink(zoneid_t zoneid, datalink_id_t linkid)
   6084 {
   6085 	zone_dl_t *zdl;
   6086 	zone_t *zone;
   6087 	int err = 0;
   6088 
   6089 	if ((zone = zone_find_by_id(zoneid)) == NULL)
   6090 		return (set_errno(EINVAL));
   6091 
   6092 	mutex_enter(&zone->zone_lock);
   6093 	if ((zdl = zone_find_dl(zone, linkid)) == NULL) {
   6094 		err = ENXIO;
   6095 	} else {
   6096 		list_remove(&zone->zone_dl_list, zdl);
   6097 		kmem_free(zdl, sizeof (zone_dl_t));
   6098 	}
   6099 	mutex_exit(&zone->zone_lock);
   6100 	zone_rele(zone);
   6101 	return (err == 0 ? 0 : set_errno(err));
   6102 }
   6103 
   6104 /*
   6105  * Using the zoneidp as ALL_ZONES, we can lookup which zone has been assigned
   6106  * the linkid.  Otherwise we just check if the specified zoneidp has been
   6107  * assigned the supplied linkid.
   6108  */
   6109 int
   6110 zone_check_datalink(zoneid_t *zoneidp, datalink_id_t linkid)
   6111 {
   6112 	zone_t *zone;
   6113 	int err = ENXIO;
   6114 
   6115 	if (*zoneidp != ALL_ZONES) {
   6116 		if ((zone = zone_find_by_id(*zoneidp)) != NULL) {
   6117 			if (zone_dl_exists(zone, linkid))
   6118 				err = 0;
   6119 			zone_rele(zone);
   6120 		}
   6121 		return (err);
   6122 	}
   6123 
   6124 	mutex_enter(&zonehash_lock);
   6125 	for (zone = list_head(&zone_active); zone != NULL;
   6126 	    zone = list_next(&zone_active, zone)) {
   6127 		if (zone_dl_exists(zone, linkid)) {
   6128 			*zoneidp = zone->zone_id;
   6129 			err = 0;
   6130 			break;
   6131 		}
   6132 	}
   6133 	mutex_exit(&zonehash_lock);
   6134 	return (err);
   6135 }
   6136 
   6137 /*
   6138  * Get the list of datalink IDs assigned to a zone.
   6139  *
   6140  * On input, *nump is the number of datalink IDs that can fit in the supplied
   6141  * idarray.  Upon return, *nump is either set to the number of datalink IDs
   6142  * that were placed in the array if the array was large enough, or to the
   6143  * number of datalink IDs that the function needs to place in the array if the
   6144  * array is too small.
   6145  */
   6146 static int
   6147 zone_list_datalink(zoneid_t zoneid, int *nump, datalink_id_t *idarray)
   6148 {
   6149 	uint_t num, dlcount;
   6150 	zone_t *zone;
   6151 	zone_dl_t *zdl;
   6152 	datalink_id_t *idptr = idarray;
   6153 
   6154 	if (copyin(nump, &dlcount, sizeof (dlcount)) != 0)
   6155 		return (set_errno(EFAULT));
   6156 	if ((zone = zone_find_by_id(zoneid)) == NULL)
   6157 		return (set_errno(ENXIO));
   6158 
   6159 	num = 0;
   6160 	mutex_enter(&zone->zone_lock);
   6161 	for (zdl = list_head(&zone->zone_dl_list); zdl != NULL;
   6162 	    zdl = list_next(&zone->zone_dl_list, zdl)) {
   6163 		/*
   6164 		 * If the list is bigger than what the caller supplied, just
   6165 		 * count, don't do copyout.
   6166 		 */
   6167 		if (++num > dlcount)
   6168 			continue;
   6169 		if (copyout(&zdl->zdl_id, idptr, sizeof (*idptr)) != 0) {
   6170 			mutex_exit(&zone->zone_lock);
   6171 			zone_rele(zone);
   6172 			return (set_errno(EFAULT));
   6173 		}
   6174 		idptr++;
   6175 	}
   6176 	mutex_exit(&zone->zone_lock);
   6177 	zone_rele(zone);
   6178 
   6179 	/* Increased or decreased, caller should be notified. */
   6180 	if (num != dlcount) {
   6181 		if (copyout(&num, nump, sizeof (num)) != 0)
   6182 			return (set_errno(EFAULT));
   6183 	}
   6184 	return (0);
   6185 }
   6186 
   6187 /*
   6188  * Public interface for looking up a zone by zoneid. It's a customized version
   6189  * for netstack_zone_create(). It can only be called from the zsd create
   6190  * callbacks, since it doesn't have reference on the zone structure hence if
   6191  * it is called elsewhere the zone could disappear after the zonehash_lock
   6192  * is dropped.
   6193  *
   6194  * Furthermore it
   6195  * 1. Doesn't check the status of the zone.
   6196  * 2. It will be called even before zone_init is called, in that case the
   6197  *    address of zone0 is returned directly, and netstack_zone_create()
   6198  *    will only assign a value to zone0.zone_netstack, won't break anything.
   6199  * 3. Returns without the zone being held.
   6200  */
   6201 zone_t *
   6202 zone_find_by_id_nolock(zoneid_t zoneid)
   6203 {
   6204 	zone_t *zone;
   6205 
   6206 	mutex_enter(&zonehash_lock);
   6207 	if (zonehashbyid == NULL)
   6208 		zone = &zone0;
   6209 	else
   6210 		zone = zone_find_all_by_id(zoneid);
   6211 	mutex_exit(&zonehash_lock);
   6212 	return (zone);
   6213 }
   6214 
   6215 /*
   6216  * Walk the datalinks for a given zone
   6217  */
   6218 int
   6219 zone_datalink_walk(zoneid_t zoneid, int (*cb)(datalink_id_t, void *),
   6220     void *data)
   6221 {
   6222 	zone_t		*zone;
   6223 	zone_dl_t	*zdl;
   6224 	datalink_id_t	*idarray;
   6225 	uint_t		idcount = 0;
   6226 	int		i, ret = 0;
   6227 
   6228 	if ((zone = zone_find_by_id(zoneid)) == NULL)
   6229 		return (ENOENT);
   6230 
   6231 	/*
   6232 	 * We first build an array of linkid's so that we can walk these and
   6233 	 * execute the callback with the zone_lock dropped.
   6234 	 */
   6235 	mutex_enter(&zone->zone_lock);
   6236 	for (zdl = list_head(&zone->zone_dl_list); zdl != NULL;
   6237 	    zdl = list_next(&zone->zone_dl_list, zdl)) {
   6238 		idcount++;
   6239 	}
   6240 
   6241 	if (idcount == 0) {
   6242 		mutex_exit(&zone->zone_lock);
   6243 		zone_rele(zone);
   6244 		return (0);
   6245 	}
   6246 
   6247 	idarray = kmem_alloc(sizeof (datalink_id_t) * idcount, KM_NOSLEEP);
   6248 	if (idarray == NULL) {
   6249 		mutex_exit(&zone->zone_lock);
   6250 		zone_rele(zone);
   6251 		return (ENOMEM);
   6252 	}
   6253 
   6254 	for (i = 0, zdl = list_head(&zone->zone_dl_list); zdl != NULL;
   6255 	    i++, zdl = list_next(&zone->zone_dl_list, zdl)) {
   6256 		idarray[i] = zdl->zdl_id;
   6257 	}
   6258 
   6259 	mutex_exit(&zone->zone_lock);
   6260 
   6261 	for (i = 0; i < idcount && ret == 0; i++) {
   6262 		if ((ret = (*cb)(idarray[i], data)) != 0)
   6263 			break;
   6264 	}
   6265 
   6266 	zone_rele(zone);
   6267 	kmem_free(idarray, sizeof (datalink_id_t) * idcount);
   6268 	return (ret);
   6269 }
   6270