Home | History | Annotate | Download | only in rpc
      1 /*
      2  * CDDL HEADER START
      3  *
      4  * The contents of this file are subject to the terms of the
      5  * Common Development and Distribution License (the "License").
      6  * You may not use this file except in compliance with the License.
      7  *
      8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
      9  * or http://www.opensolaris.org/os/licensing.
     10  * See the License for the specific language governing permissions
     11  * and limitations under the License.
     12  *
     13  * When distributing Covered Code, include this CDDL HEADER in each
     14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
     15  * If applicable, add the following below this CDDL HEADER, with the
     16  * fields enclosed by brackets "[]" replaced with your own identifying
     17  * information: Portions Copyright [yyyy] [name of copyright owner]
     18  *
     19  * CDDL HEADER END
     20  */
     21 
     22 /*
     23  * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
     24  * Use is subject to license terms.
     25  */
     26 
     27 /*
     28  * Copyright 1993 OpenVision Technologies, Inc., All Rights Reserved.
     29  */
     30 
     31 /*	Copyright (c) 1983, 1984, 1985,  1986, 1987, 1988, 1989 AT&T	*/
     32 /*	  All Rights Reserved  	*/
     33 
     34 /*
     35  * Portions of this source code were derived from Berkeley 4.3 BSD
     36  * under license from the Regents of the University of California.
     37  */
     38 
     39 #pragma ident	"@(#)svc.c	1.104	07/06/15 SMI"
     40 
     41 /*
     42  * Server-side remote procedure call interface.
     43  *
     44  * Master transport handle (SVCMASTERXPRT).
     45  *   The master transport handle structure is shared among service
     46  *   threads processing events on the transport. Some fields in the
     47  *   master structure are protected by locks
     48  *   - xp_req_lock protects the request queue:
     49  *	xp_req_head, xp_req_tail
     50  *   - xp_thread_lock protects the thread (clone) counts
     51  *	xp_threads, xp_detached_threads, xp_wq
     52  *   Each master transport is registered to exactly one thread pool.
     53  *
     54  * Clone transport handle (SVCXPRT)
     55  *   The clone transport handle structure is a per-service-thread handle
     56  *   to the transport. The structure carries all the fields/buffers used
     57  *   for request processing. A service thread or, in other words, a clone
     58  *   structure, can be linked to an arbitrary master structure to process
     59  *   requests on this transport. The master handle keeps track of reference
     60  *   counts of threads (clones) linked to it. A service thread can switch
     61  *   to another transport by unlinking its clone handle from the current
     62  *   transport and linking to a new one. Switching is relatively inexpensive
     63  *   but it involves locking (master's xprt->xp_thread_lock).
     64  *
     65  * Pools.
     66  *   A pool represents a kernel RPC service (NFS, Lock Manager, etc.).
     67  *   Transports related to the service are registered to the service pool.
     68  *   Service threads can switch between different transports in the pool.
     69  *   Thus, each service has its own pool of service threads. The maximum
     70  *   number of threads in a pool is pool->p_maxthreads. This limit allows
     71  *   to restrict resource usage by the service. Some fields are protected
     72  *   by locks:
     73  *   - p_req_lock protects several counts and flags:
     74  *	p_reqs, p_walkers, p_asleep, p_drowsy, p_req_cv
     75  *   - p_thread_lock governs other thread counts:
     76  *	p_threads, p_detached_threads, p_reserved_threads, p_closing
     77  *
     78  *   In addition, each pool contains a doubly-linked list of transports,
     79  *   an `xprt-ready' queue and a creator thread (see below). Threads in
     80  *   the pool share some other parameters such as stack size and
     81  *   polling timeout.
     82  *
     83  *   Pools are initialized through the svc_pool_create() function called from
     84  *   the nfssys() system call. However, thread creation must be done by
     85  *   the userland agent. This is done by using SVCPOOL_WAIT and
     86  *   SVCPOOL_RUN arguments to nfssys(), which call svc_wait() and
     87  *   svc_do_run(), respectively. Once the pool has been initialized,
     88  *   the userland process must set up a 'creator' thread. This thread
     89  *   should park itself in the kernel by calling svc_wait(). If
     90  *   svc_wait() returns successfully, it should fork off a new worker
     91  *   thread, which then calls svc_do_run() in order to get work. When
     92  *   that thread is complete, svc_do_run() will return, and the user
     93  *   program should call thr_exit().
     94  *
     95  *   When we try to register a new pool and there is an old pool with
     96  *   the same id in the doubly linked pool list (this happens when we kill
     97  *   and restart nfsd or lockd), then we unlink the old pool from the list
     98  *   and mark its state as `closing'. After that the transports can still
     99  *   process requests but new transports won't be registered. When all the
    100  *   transports and service threads associated with the pool are gone the
    101  *   creator thread (see below) will clean up the pool structure and exit.
    102  *
    103  * svc_queuereq() and svc_run().
    104  *   The kernel RPC server is interrupt driven. The svc_queuereq() interrupt
    105  *   routine is called to deliver an RPC request. The service threads
    106  *   loop in svc_run(). The interrupt function queues a request on the
    107  *   transport's queue and it makes sure that the request is serviced.
    108  *   It may either wake up one of sleeping threads, or ask for a new thread
    109  *   to be created, or, if the previous request is just being picked up, do
    110  *   nothing. In the last case the service thread that is picking up the
    111  *   previous request will wake up or create the next thread. After a service
    112  *   thread processes a request and sends a reply it returns to svc_run()
    113  *   and svc_run() calls svc_poll() to find new input.
    114  *
    115  *   There is no longer an "inconsistent" but "safe" optimization in the
    116  *   svc_queuereq() code. This "inconsistent" state was leading to
    117  *   inconsistencies between the actual number of requests and the value
    118  *   of p_reqs (the total number of requests). Because of this, hangs were
    119  *   occurring in svc_poll() where p_reqs was greater than one and no
    120  *   requests were found on the request queues.
    121  *
    122  * svc_poll().
    123  *   In order to avoid unnecessary locking, which causes performance
    124  *   problems, we always look for a pending request on the current transport.
    125  *   If there is none we take a hint from the pool's `xprt-ready' queue.
    126  *   If the queue had an overflow we switch to the `drain' mode checking
    127  *   each transport  in the pool's transport list. Once we find a
    128  *   master transport handle with a pending request we latch the request
    129  *   lock on this transport and return to svc_run(). If the request
    130  *   belongs to a transport different than the one the service thread is
    131  *   linked to we need to unlink and link again.
    132  *
    133  *   A service thread goes to sleep when there are no pending
    134  *   requests on any of the transports registered with the pool.
    135  *   All the pool's threads sleep on the same condition variable.
    136  *   If a thread has been sleeping for too long a period of time
    137  *   (by default 5 seconds) it wakes up and exits.  Also, when a transport
    138  *   is closing, sleeping threads wake up to unlink from this transport.
    139  *
    140  * The `xprt-ready' queue.
    141  *   If a service thread finds no request on a transport it is currently linked
    142  *   to it will find another transport with a pending request. To make
    143  *   this search more efficient each pool has an `xprt-ready' queue.
    144  *   The queue is a FIFO. When the interrupt routine queues a request it also
    145  *   inserts a pointer to the transport into the `xprt-ready' queue. A
    146  *   thread looking for a transport with a pending request can pop up a
    147  *   transport and check for a request. The request can be already gone
    148  *   since it could be taken by a thread linked to that transport. In such a
    149  *   case we try the next hint. The `xprt-ready' queue has fixed size (by
    150  *   default 256 nodes). If it overflows svc_poll() has to switch to the
    151  *   less efficient but safe `drain' mode and walk through the pool's
    152  *   transport list.
    153  *
    154  *   Both the svc_poll() loop and the `xprt-ready' queue are optimized
    155  *   for the peak load case, that is, for the situation when the queue is
    156  *   not empty, there are always a few pending requests, and a service
    157  *   thread which has just processed a request does not go to sleep but
    158  *   immediately picks up the next request.
    159  *
    160  * Thread creator.
    161  *   Each pool has a thread creator associated with it. The creator thread
    162  *   sleeps on a condition variable and waits for a signal to create a
    163  *   service thread. The actual thread creation is done in userland by
    164  *   the method described in "Pools" above.
    165  *
    166  *   Signaling threads should turn on the `creator signaled' flag, and
    167  *   can avoid sending signals when the flag is on. The flag is cleared
    168  *   when the thread is created.
    169  *
    170  *   When the pool is in closing state (ie it has been already unregistered
    171  *   from the pool list) the last thread on the last transport in the pool
    172  *   should turn the p_creator_exit flag on. The creator thread will
    173  *   clean up the pool structure and exit.
    174  *
    175  * Thread reservation; Detaching service threads.
    176  *   A service thread can detach itself to block for an extended amount
    177  *   of time. However, to keep the service active we need to guarantee
    178  *   at least pool->p_redline non-detached threads that can process incoming
    179  * requests. Thus, the maximum number of detached and reserved threads is
    180  *   p->p_maxthreads - p->p_redline. A service thread should first acquire
    181  *   a reservation, and if the reservation was granted it can detach itself.
    182  *   If a reservation was granted but the thread does not detach itself
    183  *   it should cancel the reservation before it returns to svc_run().
    184  */
    185 
    186 #include <sys/param.h>
    187 #include <sys/types.h>
    188 #include <rpc/types.h>
    189 #include <sys/socket.h>
    190 #include <sys/time.h>
    191 #include <sys/tiuser.h>
    192 #include <sys/t_kuser.h>
    193 #include <netinet/in.h>
    194 #include <rpc/xdr.h>
    195 #include <rpc/auth.h>
    196 #include <rpc/clnt.h>
    197 #include <rpc/rpc_msg.h>
    198 #include <rpc/svc.h>
    199 #include <sys/proc.h>
    200 #include <sys/user.h>
    201 #include <sys/stream.h>
    202 #include <sys/strsubr.h>
    203 #include <sys/tihdr.h>
    204 #include <sys/debug.h>
    205 #include <sys/cmn_err.h>
    206 #include <sys/file.h>
    207 #include <sys/systm.h>
    208 #include <sys/callb.h>
    209 #include <sys/vtrace.h>
    210 #include <sys/zone.h>
    211 #include <nfs/nfs.h>
    212 #include <sys/tsol/label_macro.h>
    213 
    214 #define	RQCRED_SIZE	400	/* this size is excessive */
    215 
    216 /*
    217  * Defines for svc_poll()
    218  */
    219 #define	SVC_EXPRTGONE ((SVCMASTERXPRT *)1)	/* Transport is closing */
    220 #define	SVC_ETIMEDOUT ((SVCMASTERXPRT *)2)	/* Timeout */
    221 #define	SVC_EINTR ((SVCMASTERXPRT *)3)		/* Interrupted by signal */
    222 
    223 /*
    224  * Default stack size for service threads.
    225  */
    226 #define	DEFAULT_SVC_RUN_STKSIZE		(0)	/* default kernel stack */
    227 
    228 int    svc_default_stksize = DEFAULT_SVC_RUN_STKSIZE;
    229 
    230 /*
    231  * Default polling timeout for service threads.
    232  * Multiplied by hz when used.
    233  */
    234 #define	DEFAULT_SVC_POLL_TIMEOUT	(5)	/* seconds */
    235 
    236 clock_t svc_default_timeout = DEFAULT_SVC_POLL_TIMEOUT;
    237 
    238 /*
    239  * Size of the `xprt-ready' queue.
    240  */
    241 #define	DEFAULT_SVC_QSIZE		(256)	/* qnodes */
    242 
    243 size_t svc_default_qsize = DEFAULT_SVC_QSIZE;
    244 
    245 /*
    246  * Default limit for the number of service threads.
    247  */
    248 #define	DEFAULT_SVC_MAXTHREADS		(INT16_MAX)
    249 
    250 int    svc_default_maxthreads = DEFAULT_SVC_MAXTHREADS;
    251 
    252 /*
    253  * Maximum number of requests from the same transport (in `drain' mode).
    254  */
    255 #define	DEFAULT_SVC_MAX_SAME_XPRT	(8)
    256 
    257 int    svc_default_max_same_xprt = DEFAULT_SVC_MAX_SAME_XPRT;
    258 
    259 
    260 /*
    261  * Default `Redline' of non-detached threads.
    262  * Total number of detached and reserved threads in an RPC server
    263  * thread pool is limited to pool->p_maxthreads - svc_redline.
    264  */
    265 #define	DEFAULT_SVC_REDLINE		(1)
    266 
    267 int    svc_default_redline = DEFAULT_SVC_REDLINE;
    268 
    269 /*
    270  * A node for the `xprt-ready' queue.
    271  * See below.
    272  */
    273 struct __svcxprt_qnode {
    274 	__SVCXPRT_QNODE	*q_next;
    275 	SVCMASTERXPRT	*q_xprt;
    276 };
    277 
    278 /*
    279  * Global SVC variables (private).
    280  */
    281 struct svc_globals {
    282 	SVCPOOL		*svc_pools;
    283 	kmutex_t	svc_plock;
    284 };
    285 
    286 /*
    287  * Debug variable to check for rdma based
    288  * transport startup and cleanup. Contorlled
    289  * through /etc/system. Off by default.
    290  */
    291 int rdma_check = 0;
    292 
    293 /*
    294  * Authentication parameters list.
    295  */
    296 static caddr_t rqcred_head;
    297 static kmutex_t rqcred_lock;
    298 
    299 /*
    300  * Pointers to transport specific `rele' routines in rpcmod (set from rpcmod).
    301  */
    302 void	(*rpc_rele)(queue_t *, mblk_t *) = NULL;
    303 void	(*mir_rele)(queue_t *, mblk_t *) = NULL;
    304 
    305 /* ARGSUSED */
    306 void
    307 rpc_rdma_rele(queue_t *q, mblk_t *mp)
    308 {
    309 }
    310 void    (*rdma_rele)(queue_t *, mblk_t *) = rpc_rdma_rele;
    311 
    312 
    313 /*
    314  * This macro picks which `rele' routine to use, based on the transport type.
    315  */
    316 #define	RELE_PROC(xprt) \
    317 	((xprt)->xp_type == T_RDMA ? rdma_rele : \
    318 	(((xprt)->xp_type == T_CLTS) ? rpc_rele : mir_rele))
    319 
    320 /*
    321  * If true, then keep quiet about version mismatch.
    322  * This macro is for broadcast RPC only. We have no broadcast RPC in
    323  * kernel now but one may define a flag in the transport structure
    324  * and redefine this macro.
    325  */
    326 #define	version_keepquiet(xprt)	(FALSE)
    327 
    328 /*
    329  * ZSD key used to retrieve zone-specific svc globals
    330  */
    331 static zone_key_t svc_zone_key;
    332 
    333 static void svc_callout_free(SVCMASTERXPRT *);
    334 static void svc_xprt_qinit(SVCPOOL *, size_t);
    335 static void svc_xprt_qdestroy(SVCPOOL *);
    336 static void svc_thread_creator(SVCPOOL *);
    337 static void svc_creator_signal(SVCPOOL *);
    338 static void svc_creator_signalexit(SVCPOOL *);
    339 static void svc_pool_unregister(struct svc_globals *, SVCPOOL *);
    340 static int svc_run(SVCPOOL *);
    341 
    342 /* ARGSUSED */
    343 static void *
    344 svc_zoneinit(zoneid_t zoneid)
    345 {
    346 	struct svc_globals *svc;
    347 
    348 	svc = kmem_alloc(sizeof (*svc), KM_SLEEP);
    349 	mutex_init(&svc->svc_plock, NULL, MUTEX_DEFAULT, NULL);
    350 	svc->svc_pools = NULL;
    351 	return (svc);
    352 }
    353 
    354 /* ARGSUSED */
    355 static void
    356 svc_zoneshutdown(zoneid_t zoneid, void *arg)
    357 {
    358 	struct svc_globals *svc = arg;
    359 	SVCPOOL *pool;
    360 
    361 	mutex_enter(&svc->svc_plock);
    362 	while ((pool = svc->svc_pools) != NULL) {
    363 		svc_pool_unregister(svc, pool);
    364 	}
    365 	mutex_exit(&svc->svc_plock);
    366 }
    367 
    368 /* ARGSUSED */
    369 static void
    370 svc_zonefini(zoneid_t zoneid, void *arg)
    371 {
    372 	struct svc_globals *svc = arg;
    373 
    374 	ASSERT(svc->svc_pools == NULL);
    375 	mutex_destroy(&svc->svc_plock);
    376 	kmem_free(svc, sizeof (*svc));
    377 }
    378 
    379 /*
    380  * Global SVC init routine.
    381  * Initialize global generic and transport type specific structures
    382  * used by the kernel RPC server side. This routine is called only
    383  * once when the module is being loaded.
    384  */
    385 void
    386 svc_init()
    387 {
    388 	zone_key_create(&svc_zone_key, svc_zoneinit, svc_zoneshutdown,
    389 	    svc_zonefini);
    390 	svc_cots_init();
    391 	svc_clts_init();
    392 }
    393 
    394 /*
    395  * Destroy the SVCPOOL structure.
    396  */
    397 static void
    398 svc_pool_cleanup(SVCPOOL *pool)
    399 {
    400 	ASSERT(pool->p_threads + pool->p_detached_threads == 0);
    401 	ASSERT(pool->p_lcount == 0);
    402 	ASSERT(pool->p_closing);
    403 
    404 	/*
    405 	 * Call the user supplied shutdown function.  This is done
    406 	 * here so the user of the pool will be able to cleanup
    407 	 * service related resources.
    408 	 */
    409 	if (pool->p_shutdown != NULL)
    410 		(pool->p_shutdown)();
    411 
    412 	/* Destroy `xprt-ready' queue */
    413 	svc_xprt_qdestroy(pool);
    414 
    415 	/* Destroy transport list */
    416 	rw_destroy(&pool->p_lrwlock);
    417 
    418 	/* Destroy locks and condition variables */
    419 	mutex_destroy(&pool->p_thread_lock);
    420 	mutex_destroy(&pool->p_req_lock);
    421 	cv_destroy(&pool->p_req_cv);
    422 
    423 	/* Destroy creator's locks and condition variables */
    424 	mutex_destroy(&pool->p_creator_lock);
    425 	cv_destroy(&pool->p_creator_cv);
    426 	mutex_destroy(&pool->p_user_lock);
    427 	cv_destroy(&pool->p_user_cv);
    428 
    429 	/* Free pool structure */
    430 	kmem_free(pool, sizeof (SVCPOOL));
    431 }
    432 
    433 /*
    434  * If all the transports and service threads are already gone
    435  * signal the creator thread to clean up and exit.
    436  */
    437 static bool_t
    438 svc_pool_tryexit(SVCPOOL *pool)
    439 {
    440 	ASSERT(MUTEX_HELD(&pool->p_thread_lock));
    441 	ASSERT(pool->p_closing);
    442 
    443 	if (pool->p_threads + pool->p_detached_threads == 0) {
    444 		rw_enter(&pool->p_lrwlock, RW_READER);
    445 		if (pool->p_lcount == 0) {
    446 			/*
    447 			 * Release the locks before sending a signal.
    448 			 */
    449 			rw_exit(&pool->p_lrwlock);
    450 			mutex_exit(&pool->p_thread_lock);
    451 
    452 			/*
    453 			 * Notify the creator thread to clean up and exit
    454 			 *
    455 			 * NOTICE: No references to the pool beyond this point!
    456 			 *		   The pool is being destroyed.
    457 			 */
    458 			ASSERT(!MUTEX_HELD(&pool->p_thread_lock));
    459 			svc_creator_signalexit(pool);
    460 
    461 			return (TRUE);
    462 		}
    463 		rw_exit(&pool->p_lrwlock);
    464 	}
    465 
    466 	ASSERT(MUTEX_HELD(&pool->p_thread_lock));
    467 	return (FALSE);
    468 }
    469 
    470 /*
    471  * Find a pool with a given id.
    472  */
    473 static SVCPOOL *
    474 svc_pool_find(struct svc_globals *svc, int id)
    475 {
    476 	SVCPOOL *pool;
    477 
    478 	ASSERT(MUTEX_HELD(&svc->svc_plock));
    479 
    480 	/*
    481 	 * Search the list for a pool with a matching id
    482 	 * and register the transport handle with that pool.
    483 	 */
    484 	for (pool = svc->svc_pools; pool; pool = pool->p_next)
    485 		if (pool->p_id == id)
    486 			return (pool);
    487 
    488 	return (NULL);
    489 }
    490 
    491 /*
    492  * PSARC 2003/523 Contract Private Interface
    493  * svc_do_run
    494  * Changes must be reviewed by Solaris File Sharing
    495  * Changes must be communicated to contract-2003-523 (at) sun.com
    496  */
    497 int
    498 svc_do_run(int id)
    499 {
    500 	SVCPOOL *pool;
    501 	int err = 0;
    502 	struct svc_globals *svc;
    503 
    504 	svc = zone_getspecific(svc_zone_key, curproc->p_zone);
    505 	mutex_enter(&svc->svc_plock);
    506 
    507 	pool = svc_pool_find(svc, id);
    508 
    509 	mutex_exit(&svc->svc_plock);
    510 
    511 	if (pool == NULL)
    512 		return (ENOENT);
    513 
    514 	/*
    515 	 * Increment counter of pool threads now
    516 	 * that a thread has been created.
    517 	 */
    518 	mutex_enter(&pool->p_thread_lock);
    519 	pool->p_threads++;
    520 	mutex_exit(&pool->p_thread_lock);
    521 
    522 	/* Give work to the new thread. */
    523 	err = svc_run(pool);
    524 
    525 	return (err);
    526 }
    527 
    528 /*
    529  * Unregister a pool from the pool list.
    530  * Set the closing state. If all the transports and service threads
    531  * are already gone signal the creator thread to clean up and exit.
    532  */
    533 static void
    534 svc_pool_unregister(struct svc_globals *svc, SVCPOOL *pool)
    535 {
    536 	SVCPOOL *next = pool->p_next;
    537 	SVCPOOL *prev = pool->p_prev;
    538 
    539 	ASSERT(MUTEX_HELD(&svc->svc_plock));
    540 
    541 	/* Remove from the list */
    542 	if (pool == svc->svc_pools)
    543 		svc->svc_pools = next;
    544 	if (next)
    545 		next->p_prev = prev;
    546 	if (prev)
    547 		prev->p_next = next;
    548 	pool->p_next = pool->p_prev = NULL;
    549 
    550 	/*
    551 	 * Offline the pool. Mark the pool as closing.
    552 	 * If there are no transports in this pool notify
    553 	 * the creator thread to clean it up and exit.
    554 	 */
    555 	mutex_enter(&pool->p_thread_lock);
    556 	if (pool->p_offline != NULL)
    557 		(pool->p_offline)();
    558 	pool->p_closing = TRUE;
    559 	if (svc_pool_tryexit(pool))
    560 		return;
    561 	mutex_exit(&pool->p_thread_lock);
    562 }
    563 
    564 /*
    565  * Register a pool with a given id in the global doubly linked pool list.
    566  * - if there is a pool with the same id in the list then unregister it
    567  * - insert the new pool into the list.
    568  */
    569 static void
    570 svc_pool_register(struct svc_globals *svc, SVCPOOL *pool, int id)
    571 {
    572 	SVCPOOL *old_pool;
    573 
    574 	/*
    575 	 * If there is a pool with the same id then remove it from
    576 	 * the list and mark the pool as closing.
    577 	 */
    578 	mutex_enter(&svc->svc_plock);
    579 
    580 	if (old_pool = svc_pool_find(svc, id))
    581 		svc_pool_unregister(svc, old_pool);
    582 
    583 	/* Insert into the doubly linked list */
    584 	pool->p_id = id;
    585 	pool->p_next = svc->svc_pools;
    586 	pool->p_prev = NULL;
    587 	if (svc->svc_pools)
    588 		svc->svc_pools->p_prev = pool;
    589 	svc->svc_pools = pool;
    590 
    591 	mutex_exit(&svc->svc_plock);
    592 }
    593 
    594 /*
    595  * Initialize a newly created pool structure
    596  */
    597 static int
    598 svc_pool_init(SVCPOOL *pool, uint_t maxthreads, uint_t redline,
    599 	uint_t qsize, uint_t timeout, uint_t stksize, uint_t max_same_xprt)
    600 {
    601 	klwp_t *lwp = ttolwp(curthread);
    602 
    603 	ASSERT(pool);
    604 
    605 	if (maxthreads == 0)
    606 		maxthreads = svc_default_maxthreads;
    607 	if (redline == 0)
    608 		redline = svc_default_redline;
    609 	if (qsize == 0)
    610 		qsize = svc_default_qsize;
    611 	if (timeout == 0)
    612 		timeout = svc_default_timeout;
    613 	if (stksize == 0)
    614 		stksize = svc_default_stksize;
    615 	if (max_same_xprt == 0)
    616 		max_same_xprt = svc_default_max_same_xprt;
    617 
    618 	if (maxthreads < redline)
    619 		return (EINVAL);
    620 
    621 	/* Allocate and initialize the `xprt-ready' queue */
    622 	svc_xprt_qinit(pool, qsize);
    623 
    624 	/* Initialize doubly-linked xprt list */
    625 	rw_init(&pool->p_lrwlock, NULL, RW_DEFAULT, NULL);
    626 
    627 	/*
    628 	 * Setting lwp_childstksz on the current lwp so that
    629 	 * descendants of this lwp get the modified stacksize, if
    630 	 * it is defined. It is important that either this lwp or
    631 	 * one of its descendants do the actual servicepool thread
    632 	 * creation to maintain the stacksize inheritance.
    633 	 */
    634 	if (lwp != NULL)
    635 		lwp->lwp_childstksz = stksize;
    636 
    637 	/* Initialize thread limits, locks and condition variables */
    638 	pool->p_maxthreads = maxthreads;
    639 	pool->p_redline = redline;
    640 	pool->p_timeout = timeout * hz;
    641 	pool->p_stksize = stksize;
    642 	pool->p_max_same_xprt = max_same_xprt;
    643 	mutex_init(&pool->p_thread_lock, NULL, MUTEX_DEFAULT, NULL);
    644 	mutex_init(&pool->p_req_lock, NULL, MUTEX_DEFAULT, NULL);
    645 	cv_init(&pool->p_req_cv, NULL, CV_DEFAULT, NULL);
    646 
    647 	/* Initialize userland creator */
    648 	pool->p_user_exit = FALSE;
    649 	pool->p_signal_create_thread = FALSE;
    650 	pool->p_user_waiting = FALSE;
    651 	mutex_init(&pool->p_user_lock, NULL, MUTEX_DEFAULT, NULL);
    652 	cv_init(&pool->p_user_cv, NULL, CV_DEFAULT, NULL);
    653 
    654 	/* Initialize the creator and start the creator thread */
    655 	pool->p_creator_exit = FALSE;
    656 	mutex_init(&pool->p_creator_lock, NULL, MUTEX_DEFAULT, NULL);
    657 	cv_init(&pool->p_creator_cv, NULL, CV_DEFAULT, NULL);
    658 
    659 	(void) zthread_create(NULL, pool->p_stksize, svc_thread_creator,
    660 	    pool, 0, minclsyspri);
    661 
    662 	return (0);
    663 }
    664 
    665 /*
    666  * PSARC 2003/523 Contract Private Interface
    667  * svc_pool_create
    668  * Changes must be reviewed by Solaris File Sharing
    669  * Changes must be communicated to contract-2003-523 (at) sun.com
    670  *
    671  * Create an kernel RPC server-side thread/transport pool.
    672  *
    673  * This is public interface for creation of a server RPC thread pool
    674  * for a given service provider. Transports registered with the pool's id
    675  * will be served by a pool's threads. This function is called from the
    676  * nfssys() system call.
    677  */
    678 int
    679 svc_pool_create(struct svcpool_args *args)
    680 {
    681 	SVCPOOL *pool;
    682 	int error;
    683 	struct svc_globals *svc;
    684 
    685 	/*
    686 	 * Caller should check credentials in a way appropriate
    687 	 * in the context of the call.
    688 	 */
    689 
    690 	svc = zone_getspecific(svc_zone_key, curproc->p_zone);
    691 	/* Allocate a new pool */
    692 	pool = kmem_zalloc(sizeof (SVCPOOL), KM_SLEEP);
    693 
    694 	/*
    695 	 * Initialize the pool structure and create a creator thread.
    696 	 */
    697 	error = svc_pool_init(pool, args->maxthreads, args->redline,
    698 	    args->qsize, args->timeout, args->stksize, args->max_same_xprt);
    699 
    700 	if (error) {
    701 		kmem_free(pool, sizeof (SVCPOOL));
    702 		return (error);
    703 	}
    704 
    705 	/* Register the pool with the global pool list */
    706 	svc_pool_register(svc, pool, args->id);
    707 
    708 	return (0);
    709 }
    710 
    711 int
    712 svc_pool_control(int id, int cmd, void *arg)
    713 {
    714 	SVCPOOL *pool;
    715 	struct svc_globals *svc;
    716 
    717 	svc = zone_getspecific(svc_zone_key, curproc->p_zone);
    718 
    719 	switch (cmd) {
    720 	case SVCPSET_SHUTDOWN_PROC:
    721 		/*
    722 		 * Search the list for a pool with a matching id
    723 		 * and register the transport handle with that pool.
    724 		 */
    725 		mutex_enter(&svc->svc_plock);
    726 
    727 		if ((pool = svc_pool_find(svc, id)) == NULL) {
    728 			mutex_exit(&svc->svc_plock);
    729 			return (ENOENT);
    730 		}
    731 		/*
    732 		 * Grab the transport list lock before releasing the
    733 		 * pool list lock
    734 		 */
    735 		rw_enter(&pool->p_lrwlock, RW_WRITER);
    736 		mutex_exit(&svc->svc_plock);
    737 
    738 		pool->p_shutdown = *((void (*)())arg);
    739 
    740 		rw_exit(&pool->p_lrwlock);
    741 
    742 		return (0);
    743 	case SVCPSET_UNREGISTER_PROC:
    744 		/*
    745 		 * Search the list for a pool with a matching id
    746 		 * and register the unregister callback handle with that pool.
    747 		 */
    748 		mutex_enter(&svc->svc_plock);
    749 
    750 		if ((pool = svc_pool_find(svc, id)) == NULL) {
    751 			mutex_exit(&svc->svc_plock);
    752 			return (ENOENT);
    753 		}
    754 		/*
    755 		 * Grab the transport list lock before releasing the
    756 		 * pool list lock
    757 		 */
    758 		rw_enter(&pool->p_lrwlock, RW_WRITER);
    759 		mutex_exit(&svc->svc_plock);
    760 
    761 		pool->p_offline = *((void (*)())arg);
    762 
    763 		rw_exit(&pool->p_lrwlock);
    764 
    765 		return (0);
    766 	default:
    767 		return (EINVAL);
    768 	}
    769 }
    770 
    771 /*
    772  * Pool's transport list manipulation routines.
    773  * - svc_xprt_register()
    774  * - svc_xprt_unregister()
    775  *
    776  * svc_xprt_register() is called from svc_tli_kcreate() to
    777  * insert a new master transport handle into the doubly linked
    778  * list of server transport handles (one list per pool).
    779  *
    780  * The list is used by svc_poll(), when it operates in `drain'
    781  * mode, to search for a next transport with a pending request.
    782  */
    783 
    784 int
    785 svc_xprt_register(SVCMASTERXPRT *xprt, int id)
    786 {
    787 	SVCMASTERXPRT *prev, *next;
    788 	SVCPOOL *pool;
    789 	struct svc_globals *svc;
    790 
    791 	svc = zone_getspecific(svc_zone_key, curproc->p_zone);
    792 	/*
    793 	 * Search the list for a pool with a matching id
    794 	 * and register the transport handle with that pool.
    795 	 */
    796 	mutex_enter(&svc->svc_plock);
    797 
    798 	if ((pool = svc_pool_find(svc, id)) == NULL) {
    799 		mutex_exit(&svc->svc_plock);
    800 		return (ENOENT);
    801 	}
    802 
    803 	/* Grab the transport list lock before releasing the pool list lock */
    804 	rw_enter(&pool->p_lrwlock, RW_WRITER);
    805 	mutex_exit(&svc->svc_plock);
    806 
    807 	/* Don't register new transports when the pool is in closing state */
    808 	if (pool->p_closing) {
    809 		rw_exit(&pool->p_lrwlock);
    810 		return (EBUSY);
    811 	}
    812 
    813 	/*
    814 	 * Initialize xp_pool to point to the pool.
    815 	 * We don't want to go through the pool list every time.
    816 	 */
    817 	xprt->xp_pool = pool;
    818 
    819 	/*
    820 	 * Insert a transport handle into the list.
    821 	 * The list head points to the most recently inserted transport.
    822 	 */
    823 	if (pool->p_lhead == NULL)
    824 		pool->p_lhead = xprt->xp_prev = xprt->xp_next = xprt;
    825 	else {
    826 		next = pool->p_lhead;
    827 		prev = pool->p_lhead->xp_prev;
    828 
    829 		xprt->xp_next = next;
    830 		xprt->xp_prev = prev;
    831 
    832 		pool->p_lhead = prev->xp_next = next->xp_prev = xprt;
    833 	}
    834 
    835 	/* Increment the transports count */
    836 	pool->p_lcount++;
    837 
    838 	rw_exit(&pool->p_lrwlock);
    839 	return (0);
    840 }
    841 
    842 /*
    843  * Called from svc_xprt_cleanup() to remove a master transport handle
    844  * from the pool's list of server transports (when a transport is
    845  * being destroyed).
    846  */
    847 void
    848 svc_xprt_unregister(SVCMASTERXPRT *xprt)
    849 {
    850 	SVCPOOL *pool = xprt->xp_pool;
    851 
    852 	/*
    853 	 * Unlink xprt from the list.
    854 	 * If the list head points to this xprt then move it
    855 	 * to the next xprt or reset to NULL if this is the last
    856 	 * xprt in the list.
    857 	 */
    858 	rw_enter(&pool->p_lrwlock, RW_WRITER);
    859 
    860 	if (xprt == xprt->xp_next)
    861 		pool->p_lhead = NULL;
    862 	else {
    863 		SVCMASTERXPRT *next = xprt->xp_next;
    864 		SVCMASTERXPRT *prev = xprt->xp_prev;
    865 
    866 		next->xp_prev = prev;
    867 		prev->xp_next = next;
    868 
    869 		if (pool->p_lhead == xprt)
    870 			pool->p_lhead = next;
    871 	}
    872 
    873 	xprt->xp_next = xprt->xp_prev = NULL;
    874 
    875 	/* Decrement list count */
    876 	pool->p_lcount--;
    877 
    878 	rw_exit(&pool->p_lrwlock);
    879 }
    880 
    881 static void
    882 svc_xprt_qdestroy(SVCPOOL *pool)
    883 {
    884 	mutex_destroy(&pool->p_qend_lock);
    885 	kmem_free(pool->p_qbody, pool->p_qsize * sizeof (__SVCXPRT_QNODE));
    886 }
    887 
    888 /*
    889  * Initialize an `xprt-ready' queue for a given pool.
    890  */
    891 static void
    892 svc_xprt_qinit(SVCPOOL *pool, size_t qsize)
    893 {
    894 	int i;
    895 
    896 	pool->p_qsize = qsize;
    897 	pool->p_qbody = kmem_zalloc(pool->p_qsize * sizeof (__SVCXPRT_QNODE),
    898 	    KM_SLEEP);
    899 
    900 	for (i = 0; i < pool->p_qsize - 1; i++)
    901 		pool->p_qbody[i].q_next = &(pool->p_qbody[i+1]);
    902 
    903 	pool->p_qbody[pool->p_qsize-1].q_next = &(pool->p_qbody[0]);
    904 	pool->p_qtop = &(pool->p_qbody[0]);
    905 	pool->p_qend = &(pool->p_qbody[0]);
    906 
    907 	mutex_init(&pool->p_qend_lock, NULL, MUTEX_DEFAULT, NULL);
    908 }
    909 
    910 /*
    911  * Called from the svc_queuereq() interrupt routine to queue
    912  * a hint for svc_poll() which transport has a pending request.
    913  * - insert a pointer to xprt into the xprt-ready queue (FIFO)
    914  * - if the xprt-ready queue is full turn the overflow flag on.
    915  *
    916  * NOTICE: pool->p_qtop is protected by the the pool's request lock
    917  * and the caller (svc_queuereq()) must hold the lock.
    918  */
    919 static void
    920 svc_xprt_qput(SVCPOOL *pool, SVCMASTERXPRT *xprt)
    921 {
    922 	ASSERT(MUTEX_HELD(&pool->p_req_lock));
    923 
    924 	/* If the overflow flag is there is nothing we can do */
    925 	if (pool->p_qoverflow)
    926 		return;
    927 
    928 	/* If the queue is full turn the overflow flag on and exit */
    929 	if (pool->p_qtop->q_next == pool->p_qend) {
    930 		mutex_enter(&pool->p_qend_lock);
    931 		if (pool->p_qtop->q_next == pool->p_qend) {
    932 			pool->p_qoverflow = TRUE;
    933 			mutex_exit(&pool->p_qend_lock);
    934 			return;
    935 		}
    936 		mutex_exit(&pool->p_qend_lock);
    937 	}
    938 
    939 	/* Insert a hint and move pool->p_qtop */
    940 	pool->p_qtop->q_xprt = xprt;
    941 	pool->p_qtop = pool->p_qtop->q_next;
    942 }
    943 
    944 /*
    945  * Called from svc_poll() to get a hint which transport has a
    946  * pending request. Returns a pointer to a transport or NULL if the
    947  * `xprt-ready' queue is empty.
    948  *
    949  * Since we do not acquire the pool's request lock while checking if
    950  * the queue is empty we may miss a request that is just being delivered.
    951  * However this is ok since svc_poll() will retry again until the
    952  * count indicates that there are pending requests for this pool.
    953  */
    954 static SVCMASTERXPRT *
    955 svc_xprt_qget(SVCPOOL *pool)
    956 {
    957 	SVCMASTERXPRT *xprt;
    958 
    959 	mutex_enter(&pool->p_qend_lock);
    960 	do {
    961 		/*
    962 		 * If the queue is empty return NULL.
    963 		 * Since we do not acquire the pool's request lock which
    964 		 * protects pool->p_qtop this is not exact check. However,
    965 		 * this is safe - if we miss a request here svc_poll()
    966 		 * will retry again.
    967 		 */
    968 		if (pool->p_qend == pool->p_qtop) {
    969 			mutex_exit(&pool->p_qend_lock);
    970 			return (NULL);
    971 		}
    972 
    973 		/* Get a hint and move pool->p_qend */
    974 		xprt = pool->p_qend->q_xprt;
    975 		pool->p_qend = pool->p_qend->q_next;
    976 
    977 		/* Skip fields deleted by svc_xprt_qdelete()	 */
    978 	} while (xprt == NULL);
    979 	mutex_exit(&pool->p_qend_lock);
    980 
    981 	return (xprt);
    982 }
    983 
    984 /*
    985  * Delete all the references to a transport handle that
    986  * is being destroyed from the xprt-ready queue.
    987  * Deleted pointers are replaced with NULLs.
    988  */
    989 static void
    990 svc_xprt_qdelete(SVCPOOL *pool, SVCMASTERXPRT *xprt)
    991 {
    992 	__SVCXPRT_QNODE *q = pool->p_qend;
    993 	__SVCXPRT_QNODE *qtop = pool->p_qtop;
    994 
    995 	/*
    996 	 * Delete all the references to xprt between the current
    997 	 * position of pool->p_qend and current pool->p_qtop.
    998 	 */
    999 	for (;;) {
   1000 		if (q->q_xprt == xprt)
   1001 			q->q_xprt = NULL;
   1002 		if (q == qtop)
   1003 			return;
   1004 		q = q->q_next;
   1005 	}
   1006 }
   1007 
   1008 /*
   1009  * Destructor for a master server transport handle.
   1010  * - if there are no more non-detached threads linked to this transport
   1011  *   then, if requested, call xp_closeproc (we don't wait for detached
   1012  *   threads linked to this transport to complete).
   1013  * - if there are no more threads linked to this
   1014  *   transport then
   1015  *   a) remove references to this transport from the xprt-ready queue
   1016  *   b) remove a reference to this transport from the pool's transport list
   1017  *   c) call a transport specific `destroy' function
   1018  *   d) cancel remaining thread reservations.
   1019  *
   1020  * NOTICE: Caller must hold the transport's thread lock.
   1021  */
   1022 static void
   1023 svc_xprt_cleanup(SVCMASTERXPRT *xprt, bool_t detached)
   1024 {
   1025 	ASSERT(MUTEX_HELD(&xprt->xp_thread_lock));
   1026 	ASSERT(xprt->xp_wq == NULL);
   1027 
   1028 	/*
   1029 	 * If called from the last non-detached thread
   1030 	 * it should call the closeproc on this transport.
   1031 	 */
   1032 	if (!detached && xprt->xp_threads == 0 && xprt->xp_closeproc) {
   1033 		(*(xprt->xp_closeproc)) (xprt);
   1034 	}
   1035 
   1036 	if (xprt->xp_threads + xprt->xp_detached_threads > 0)
   1037 		mutex_exit(&xprt->xp_thread_lock);
   1038 	else {
   1039 		/* Remove references to xprt from the `xprt-ready' queue */
   1040 		svc_xprt_qdelete(xprt->xp_pool, xprt);
   1041 
   1042 		/* Unregister xprt from the pool's transport list */
   1043 		svc_xprt_unregister(xprt);
   1044 		svc_callout_free(xprt);
   1045 		SVC_DESTROY(xprt);
   1046 	}
   1047 }
   1048 
   1049 /*
   1050  * Find a dispatch routine for a given prog/vers pair.
   1051  * This function is called from svc_getreq() to search the callout
   1052  * table for an entry with a matching RPC program number `prog'
   1053  * and a version range that covers `vers'.
   1054  * - if it finds a matching entry it returns pointer to the dispatch routine
   1055  * - otherwise it returns NULL and, if `minp' or `maxp' are not NULL,
   1056  *   fills them with, respectively, lowest version and highest version
   1057  *   supported for the program `prog'
   1058  */
   1059 static SVC_DISPATCH *
   1060 svc_callout_find(SVCXPRT *xprt, rpcprog_t prog, rpcvers_t vers,
   1061     rpcvers_t *vers_min, rpcvers_t *vers_max)
   1062 {
   1063 	SVC_CALLOUT_TABLE *sct = xprt->xp_sct;
   1064 	int i;
   1065 
   1066 	*vers_min = ~(rpcvers_t)0;
   1067 	*vers_max = 0;
   1068 
   1069 	for (i = 0; i < sct->sct_size; i++) {
   1070 		SVC_CALLOUT *sc = &sct->sct_sc[i];
   1071 
   1072 		if (prog == sc->sc_prog) {
   1073 			if (vers >= sc->sc_versmin && vers <= sc->sc_versmax)
   1074 				return (sc->sc_dispatch);
   1075 
   1076 			if (*vers_max < sc->sc_versmax)
   1077 				*vers_max = sc->sc_versmax;
   1078 			if (*vers_min > sc->sc_versmin)
   1079 				*vers_min = sc->sc_versmin;
   1080 		}
   1081 	}
   1082 
   1083 	return (NULL);
   1084 }
   1085 
   1086 /*
   1087  * Optionally free callout table allocated for this transport by
   1088  * the service provider.
   1089  */
   1090 static void
   1091 svc_callout_free(SVCMASTERXPRT *xprt)
   1092 {
   1093 	SVC_CALLOUT_TABLE *sct = xprt->xp_sct;
   1094 
   1095 	if (sct->sct_free) {
   1096 		kmem_free(sct->sct_sc, sct->sct_size * sizeof (SVC_CALLOUT));
   1097 		kmem_free(sct, sizeof (SVC_CALLOUT_TABLE));
   1098 	}
   1099 }
   1100 
   1101 /*
   1102  * Send a reply to an RPC request
   1103  *
   1104  * PSARC 2003/523 Contract Private Interface
   1105  * svc_sendreply
   1106  * Changes must be reviewed by Solaris File Sharing
   1107  * Changes must be communicated to contract-2003-523 (at) sun.com
   1108  */
   1109 bool_t
   1110 svc_sendreply(const SVCXPRT *clone_xprt, const xdrproc_t xdr_results,
   1111     const caddr_t xdr_location)
   1112 {
   1113 	struct rpc_msg rply;
   1114 
   1115 	rply.rm_direction = REPLY;
   1116 	rply.rm_reply.rp_stat = MSG_ACCEPTED;
   1117 	rply.acpted_rply.ar_verf = clone_xprt->xp_verf;
   1118 	rply.acpted_rply.ar_stat = SUCCESS;
   1119 	rply.acpted_rply.ar_results.where = xdr_location;
   1120 	rply.acpted_rply.ar_results.proc = xdr_results;
   1121 
   1122 	return (SVC_REPLY((SVCXPRT *)clone_xprt, &rply));
   1123 }
   1124 
   1125 /*
   1126  * No procedure error reply
   1127  *
   1128  * PSARC 2003/523 Contract Private Interface
   1129  * svcerr_noproc
   1130  * Changes must be reviewed by Solaris File Sharing
   1131  * Changes must be communicated to contract-2003-523 (at) sun.com
   1132  */
   1133 void
   1134 svcerr_noproc(const SVCXPRT *clone_xprt)
   1135 {
   1136 	struct rpc_msg rply;
   1137 
   1138 	rply.rm_direction = REPLY;
   1139 	rply.rm_reply.rp_stat = MSG_ACCEPTED;
   1140 	rply.acpted_rply.ar_verf = clone_xprt->xp_verf;
   1141 	rply.acpted_rply.ar_stat = PROC_UNAVAIL;
   1142 	SVC_FREERES((SVCXPRT *)clone_xprt);
   1143 	SVC_REPLY((SVCXPRT *)clone_xprt, &rply);
   1144 }
   1145 
   1146 /*
   1147  * Can't decode arguments error reply
   1148  *
   1149  * PSARC 2003/523 Contract Private Interface
   1150  * svcerr_decode
   1151  * Changes must be reviewed by Solaris File Sharing
   1152  * Changes must be communicated to contract-2003-523 (at) sun.com
   1153  */
   1154 void
   1155 svcerr_decode(const SVCXPRT *clone_xprt)
   1156 {
   1157 	struct rpc_msg rply;
   1158 
   1159 	rply.rm_direction = REPLY;
   1160 	rply.rm_reply.rp_stat = MSG_ACCEPTED;
   1161 	rply.acpted_rply.ar_verf = clone_xprt->xp_verf;
   1162 	rply.acpted_rply.ar_stat = GARBAGE_ARGS;
   1163 	SVC_FREERES((SVCXPRT *)clone_xprt);
   1164 	SVC_REPLY((SVCXPRT *)clone_xprt, &rply);
   1165 }
   1166 
   1167 /*
   1168  * Some system error
   1169  */
   1170 void
   1171 svcerr_systemerr(const SVCXPRT *clone_xprt)
   1172 {
   1173 	struct rpc_msg rply;
   1174 
   1175 	rply.rm_direction = REPLY;
   1176 	rply.rm_reply.rp_stat = MSG_ACCEPTED;
   1177 	rply.acpted_rply.ar_verf = clone_xprt->xp_verf;
   1178 	rply.acpted_rply.ar_stat = SYSTEM_ERR;
   1179 	SVC_FREERES((SVCXPRT *)clone_xprt);
   1180 	SVC_REPLY((