Home | History | Annotate | Download | only in rpc
      1 /*
      2  * CDDL HEADER START
      3  *
      4  * The contents of this file are subject to the terms of the
      5  * Common Development and Distribution License (the "License").
      6  * You may not use this file except in compliance with the License.
      7  *
      8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
      9  * or http://www.opensolaris.org/os/licensing.
     10  * See the License for the specific language governing permissions
     11  * and limitations under the License.
     12  *
     13  * When distributing Covered Code, include this CDDL HEADER in each
     14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
     15  * If applicable, add the following below this CDDL HEADER, with the
     16  * fields enclosed by brackets "[]" replaced with your own identifying
     17  * information: Portions Copyright [yyyy] [name of copyright owner]
     18  *
     19  * CDDL HEADER END
     20  */
     21 /*
     22  * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
     23  * Use is subject to license terms.
     24  */
     25 
     26 /*
     27  * Copyright (c) 1983, 1984, 1985, 1986, 1987, 1988, 1989 AT&T
     28  *		All Rights Reserved
     29  */
     30 
     31 /*
     32  * Portions of this source code were derived from Berkeley 4.3 BSD
     33  * under license from the Regents of the University of California.
     34  */
     35 
     36 #pragma ident	"@(#)clnt_cots.c	1.125	07/06/04 SMI"
     37 
     38 /*
     39  * Implements a kernel based, client side RPC over Connection Oriented
     40  * Transports (COTS).
     41  */
     42 
     43 /*
     44  * Much of this file has been re-written to let NFS work better over slow
     45  * transports. A description follows.
     46  *
     47  * One of the annoying things about kRPC/COTS is that it will temporarily
     48  * create more than one connection between a client and server. This
     49  * happens because when a connection is made, the end-points entry in the
     50  * linked list of connections (headed by cm_hd), is removed so that other
     51  * threads don't mess with it. Went ahead and bit the bullet by keeping
     52  * the endpoint on the connection list and introducing state bits,
     53  * condition variables etc. to the connection entry data structure (struct
     54  * cm_xprt).
     55  *
     56  * Here is a summary of the changes to cm-xprt:
     57  *
     58  *	x_ctime is the timestamp of when the endpoint was last
     59  *	connected or disconnected. If an end-point is ever disconnected
     60  *	or re-connected, then any outstanding RPC request is presumed
     61  *	lost, telling clnt_cots_kcallit that it needs to re-send the
     62  *	request, not just wait for the original request's reply to
     63  *	arrive.
     64  *
     65  *	x_thread flag which tells us if a thread is doing a connection attempt.
     66  *
     67  *	x_waitdis flag which tells us we are waiting a disconnect ACK.
     68  *
     69  *	x_needdis flag which tells us we need to send a T_DISCONN_REQ
     70  *	to kill the connection.
     71  *
     72  *	x_needrel flag which tells us we need to send a T_ORDREL_REQ to
     73  *	gracefully close the connection.
     74  *
 *	#defined bitmasks for all the b_* bits so that more
 *	efficient (and at times less clumsy) masks can be used to
 *	manipulate state in cases where multiple bits have to be
 *	set/cleared/checked in the same critical section.
     79  *
 *	x_conn_cv and x_dis_cv are new condition variables to let
 *	threads know when the connection attempt is done, and to let
 *	the connecting thread know when the disconnect handshake is
 *	done.
     84  *
     85  * Added the CONN_HOLD() macro so that all reference holds have the same
     86  * look and feel.
     87  *
     88  * In the private (cku_private) portion of the client handle,
     89  *
 *	cku_flags replaces the cku_sent boolean. cku_flags keeps
 *	track of whether a request has been sent, and whether the
 *	client handle's call record is on the dispatch list (so that
 *	the reply can be matched by XID to the right client handle).
     94  *	The idea of CKU_ONQUEUE is that we can exit clnt_cots_kcallit()
     95  *	and still have the response find the right client handle so
     96  *	that the retry of CLNT_CALL() gets the result. Testing, found
     97  *	situations where if the timeout was increased, performance
     98  *	degraded. This was due to us hitting a window where the thread
     99  *	was back in rfscall() (probably printing server not responding)
    100  *	while the response came back but no place to put it.
    101  *
 *	cku_ctime is just a cache of x_ctime. If they match,
 *	clnt_cots_kcallit() won't send a retry (unless the maximum
 *	receive count limit has been reached). If they don't match, then
 *	we assume the request has been lost, and a retry of the request
 *	is needed.
    107  *
    108  *	cku_recv_attempts counts the number of receive count attempts
    109  *	after one try is sent on the wire.
    110  *
    111  * Added the clnt_delay() routine so that interruptible and
    112  * noninterruptible delays are possible.
    113  *
 * CLNT_MIN_TIMEOUT has been bumped to 10 seconds from 3. This is used to
 * control how long the client delays before returning after getting
 * ECONNREFUSED. At 3 seconds, 8 client threads per mount really does bash
 * a server that may be booting and not yet started nfsd.
    118  *
    119  * CLNT_MAXRECV_WITHOUT_RETRY is a new macro (value of 3) (with a tunable)
    120  * Why don't we just wait forever (receive an infinite # of times)?
    121  * Because the server may have rebooted. More insidious is that some
    122  * servers (ours) will drop NFS/TCP requests in some cases. This is bad,
    123  * but it is a reality.
    124  *
 * The case of a server doing orderly release really messes up the
 * client's recovery, especially if the server's TCP implementation is
 * buggy.  It was found that the kRPC/COTS client was breaking some
 * TPI rules, such as not waiting for the acknowledgement of a
 * T_DISCON_REQ (hence the added case statements T_ERROR_ACK, T_OK_ACK and
 * T_DISCON_REQ in clnt_dispatch_notifyall()).
    131  *
 * One of the things that we've seen is that a kRPC TCP endpoint goes into
 * TIMEWAIT and thus a reconnect takes a long time to satisfy because
 * the TIMEWAIT state takes a while to finish.  If a server sends a
    135  * T_ORDREL_IND, there is little point in an RPC client doing a
    136  * T_ORDREL_REQ, because the RPC request isn't going to make it (the
    137  * server is saying that it won't accept any more data). So kRPC was
    138  * changed to send a T_DISCON_REQ when we get a T_ORDREL_IND. So now the
    139  * connection skips the TIMEWAIT state and goes straight to a bound state
    140  * that kRPC can quickly switch to connected.
    141  *
    142  * Code that issues TPI request must use waitforack() to wait for the
    143  * corresponding ack (assuming there is one) in any future modifications.
    144  * This works around problems that may be introduced by breaking TPI rules
    145  * (by submitting new calls before earlier requests have been acked) in the
    146  * case of a signal or other early return.  waitforack() depends on
    147  * clnt_dispatch_notifyconn() to issue the wakeup when the ack
    148  * arrives, so adding new TPI calls may require corresponding changes
    149  * to clnt_dispatch_notifyconn(). Presently, the timeout period is based on
    150  * CLNT_MIN_TIMEOUT which is 10 seconds. If you modify this value, be sure
    151  * not to set it too low or TPI ACKS will be lost.
    152  */
    153 
    154 #include <sys/param.h>
    155 #include <sys/types.h>
    156 #include <sys/user.h>
    157 #include <sys/systm.h>
    158 #include <sys/sysmacros.h>
    159 #include <sys/proc.h>
    160 #include <sys/socket.h>
    161 #include <sys/file.h>
    162 #include <sys/stream.h>
    163 #include <sys/strsubr.h>
    164 #include <sys/stropts.h>
    165 #include <sys/strsun.h>
    166 #include <sys/timod.h>
    167 #include <sys/tiuser.h>
    168 #include <sys/tihdr.h>
    169 #include <sys/t_kuser.h>
    170 #include <sys/fcntl.h>
    171 #include <sys/errno.h>
    172 #include <sys/kmem.h>
    173 #include <sys/debug.h>
    174 #include <sys/systm.h>
    175 #include <sys/kstat.h>
    176 #include <sys/t_lock.h>
    177 #include <sys/ddi.h>
    178 #include <sys/cmn_err.h>
    179 #include <sys/time.h>
    180 #include <sys/isa_defs.h>
    181 #include <sys/callb.h>
    182 #include <sys/sunddi.h>
    183 #include <sys/atomic.h>
    184 
    185 #include <netinet/in.h>
    186 #include <netinet/tcp.h>
    187 
    188 #include <rpc/types.h>
    189 #include <rpc/xdr.h>
    190 #include <rpc/auth.h>
    191 #include <rpc/clnt.h>
    192 #include <rpc/rpc_msg.h>
    193 
/* Initial value of cku_outbuflen, the default output mblk length. */
#define	COTS_DEFAULT_ALLOCSIZE	2048

#define	WIRE_HDR_SIZE	20	/* serialized call header, sans proc number */
#define	MSG_OFFSET	128	/* offset of call into the mblk */

/* Declared here; the definition is not part of this file. */
const char *kinet_ntop6(uchar_t *, char *, size_t);

/*
 * Forward declarations of the client operations; all but
 * clnt_cots_ksettimers are defined below, and all are installed in the
 * tcp_ops vector.
 */
static int	clnt_cots_ksettimers(CLIENT *, struct rpc_timers *,
    struct rpc_timers *, int, void(*)(int, int, caddr_t), caddr_t, uint32_t);
static enum clnt_stat	clnt_cots_kcallit(CLIENT *, rpcproc_t, xdrproc_t,
    caddr_t, xdrproc_t, caddr_t, struct timeval);
static void	clnt_cots_kabort(CLIENT *);
static void	clnt_cots_kerror(CLIENT *, struct rpc_err *);
static bool_t	clnt_cots_kfreeres(CLIENT *, xdrproc_t, caddr_t);
static void	clnt_cots_kdestroy(CLIENT *);
static bool_t	clnt_cots_kcontrol(CLIENT *, int, char *);
    210 
    211 
    212 /* List of transports managed by the connection manager. */
    213 struct cm_xprt {
    214 	TIUSER		*x_tiptr;	/* transport handle */
    215 	queue_t		*x_wq;		/* send queue */
    216 	clock_t		x_time;		/* last time we handed this xprt out */
    217 	clock_t		x_ctime;	/* time we went to CONNECTED */
    218 	int		x_tidu_size;    /* TIDU size of this transport */
    219 	union {
    220 	    struct {
    221 		unsigned int
    222 #ifdef	_BIT_FIELDS_HTOL
    223 		b_closing:	1,	/* we've sent a ord rel on this conn */
    224 		b_dead:		1,	/* transport is closed or disconn */
    225 		b_doomed:	1,	/* too many conns, let this go idle */
    226 		b_connected:	1,	/* this connection is connected */
    227 
    228 		b_ordrel:	1,	/* do an orderly release? */
    229 		b_thread:	1,	/* thread doing connect */
    230 		b_waitdis:	1,	/* waiting for disconnect ACK */
    231 		b_needdis:	1,	/* need T_DISCON_REQ */
    232 
    233 		b_needrel:	1,	/* need T_ORDREL_REQ */
    234 		b_early_disc:	1,	/* got a T_ORDREL_IND or T_DISCON_IND */
    235 					/* disconnect during connect */
    236 
    237 		b_pad:		22;
    238 
    239 #endif
    240 
    241 #ifdef	_BIT_FIELDS_LTOH
    242 		b_pad:		22,
    243 
    244 		b_early_disc:	1,	/* got a T_ORDREL_IND or T_DISCON_IND */
    245 					/* disconnect during connect */
    246 		b_needrel:	1,	/* need T_ORDREL_REQ */
    247 
    248 		b_needdis:	1,	/* need T_DISCON_REQ */
    249 		b_waitdis:	1,	/* waiting for disconnect ACK */
    250 		b_thread:	1,	/* thread doing connect */
    251 		b_ordrel:	1,	/* do an orderly release? */
    252 
    253 		b_connected:	1,	/* this connection is connected */
    254 		b_doomed:	1,	/* too many conns, let this go idle */
    255 		b_dead:		1,	/* transport is closed or disconn */
    256 		b_closing:	1;	/* we've sent a ord rel on this conn */
    257 #endif
    258 	    } bit;	    unsigned int word;
    259 
    260 #define	x_closing	x_state.bit.b_closing
    261 #define	x_dead		x_state.bit.b_dead
    262 #define	x_doomed	x_state.bit.b_doomed
    263 #define	x_connected	x_state.bit.b_connected
    264 
    265 #define	x_ordrel	x_state.bit.b_ordrel
    266 #define	x_thread	x_state.bit.b_thread
    267 #define	x_waitdis	x_state.bit.b_waitdis
    268 #define	x_needdis	x_state.bit.b_needdis
    269 
    270 #define	x_needrel	x_state.bit.b_needrel
    271 #define	x_early_disc    x_state.bit.b_early_disc
    272 
    273 #define	x_state_flags	x_state.word
    274 
    275 #define	X_CLOSING	0x80000000
    276 #define	X_DEAD		0x40000000
    277 #define	X_DOOMED	0x20000000
    278 #define	X_CONNECTED	0x10000000
    279 
    280 #define	X_ORDREL	0x08000000
    281 #define	X_THREAD	0x04000000
    282 #define	X_WAITDIS	0x02000000
    283 #define	X_NEEDDIS	0x01000000
    284 
    285 #define	X_NEEDREL	0x00800000
    286 #define	X_EARLYDISC	0x00400000
    287 
    288 #define	X_BADSTATES	(X_CLOSING | X_DEAD | X_DOOMED)
    289 
    290 	}		x_state;
    291 	int		x_ref;		/* number of users of this xprt */
    292 	int		x_family;	/* address family of transport */
    293 	dev_t		x_rdev;		/* device number of transport */
    294 	struct cm_xprt	*x_next;
    295 
    296 	struct netbuf	x_server;	/* destination address */
    297 	struct netbuf	x_src;		/* src address (for retries) */
    298 	kmutex_t	x_lock;		/* lock on this entry */
    299 	kcondvar_t	x_cv;		/* to signal when can be closed */
    300 	kcondvar_t	x_conn_cv;	/* to signal when connection attempt */
    301 					/* is complete */
    302 	kstat_t		*x_ksp;
    303 
    304 	kcondvar_t	x_dis_cv;	/* to signal when disconnect attempt */
    305 					/* is complete */
    306 	zoneid_t	x_zoneid;	/* zone this xprt belongs to */
    307 };
    308 
/*
 * Named kstat layout for a cm_xprt: one kstat_named_t per exported
 * value.
 */
typedef struct cm_kstat_xprt {
	kstat_named_t	x_wq;
	kstat_named_t	x_server;
	kstat_named_t	x_family;
	kstat_named_t	x_rdev;
	kstat_named_t	x_time;
	kstat_named_t	x_state;
	kstat_named_t	x_ref;
	kstat_named_t	x_port;
} cm_kstat_xprt_t;

/*
 * Template giving the kstat name and data type of each member of
 * cm_kstat_xprt_t, in declaration order.  "server" is the only
 * string-valued entry.
 */
static cm_kstat_xprt_t cm_kstat_template = {
	{ "write_queue", KSTAT_DATA_UINT32 },
	{ "server",	KSTAT_DATA_STRING },
	{ "addr_family", KSTAT_DATA_UINT32 },
	{ "device",	KSTAT_DATA_UINT32 },
	{ "time_stamp",	KSTAT_DATA_UINT32 },
	{ "status",	KSTAT_DATA_UINT32 },
	{ "ref_count",	KSTAT_DATA_INT32 },
	{ "port",	KSTAT_DATA_UINT32 },
};
    330 
    331 /*
    332  * The inverse of this is connmgr_release().
    333  */
    334 #define	CONN_HOLD(Cm_entry)	{\
    335 	mutex_enter(&(Cm_entry)->x_lock);	\
    336 	(Cm_entry)->x_ref++;	\
    337 	mutex_exit(&(Cm_entry)->x_lock);	\
    338 }
    339 
    340 
/*
 * Private data per rpc handle.  This structure is allocated by
 * clnt_cots_kcreate, and freed by clnt_cots_kdestroy.
 *
 * The CLIENT handle is embedded as cku_client; ptoh() and htop() convert
 * between the handle and this structure.
 */
typedef struct cku_private_s {
	CLIENT			cku_client;	/* client handle */
	calllist_t		cku_call;	/* for dispatching calls */
	struct rpc_err		cku_err;	/* error status */

	struct netbuf		cku_srcaddr;	/* source address for retries */
	int			cku_addrfmly;  /* for binding port */
	struct netbuf		cku_addr;	/* remote address */
	dev_t			cku_device;	/* device to use */
	uint_t			cku_flags;	/* CKU_* bits below */
#define	CKU_ONQUEUE		0x1	/* call record is on dispatch list */
#define	CKU_SENT		0x2	/* a request has been sent */

	bool_t			cku_progress;	/* for CLSET_PROGRESS */
	uint32_t		cku_xid;	/* current XID */
	clock_t			cku_ctime;	/* time stamp of when */
						/* connection was created */
	uint_t			cku_recv_attempts;
	XDR			cku_outxdr;	/* xdr routine for output */
	XDR			cku_inxdr;	/* xdr routine for input */
	char			cku_rpchdr[WIRE_HDR_SIZE + 4];
						/* pre-serialized rpc header */

	uint_t			cku_outbuflen;	/* default output mblk length */
	struct cred		*cku_cred;	/* credentials */
	bool_t			cku_nodelayonerr;
						/* for CLSET_NODELAYONERR */
	int			cku_useresvport; /* Use reserved port */
	struct rpc_cots_client	*cku_stats;	/* stats for zone */
} cku_private_t;
    375 
/*
 * Forward declarations for the connection manager (connmgr_*) and the
 * other file-local helpers.
 */
static struct cm_xprt *connmgr_wrapconnect(struct cm_xprt *,
	const struct timeval *, struct netbuf *, int, struct netbuf *,
	struct rpc_err *, bool_t, bool_t);

static bool_t	connmgr_connect(struct cm_xprt *, queue_t *, struct netbuf *,
				int, calllist_t *, int *, bool_t reconnect,
				const struct timeval *, bool_t);

static bool_t	connmgr_setopt(queue_t *, int, int, calllist_t *);
static void	connmgr_sndrel(struct cm_xprt *);
static void	connmgr_snddis(struct cm_xprt *);
static void	connmgr_close(struct cm_xprt *);
static void	connmgr_release(struct cm_xprt *);
static struct cm_xprt *connmgr_wrapget(struct netbuf *, const struct timeval *,
	cku_private_t *);

static struct cm_xprt *connmgr_get(struct netbuf *, const struct timeval *,
	struct netbuf *, int, struct netbuf *, struct rpc_err *, dev_t,
	bool_t, int);

static void connmgr_cancelconn(struct cm_xprt *);
static enum clnt_stat connmgr_cwait(struct cm_xprt *, const struct timeval *,
	bool_t);
static void connmgr_dis_and_wait(struct cm_xprt *);

static void	clnt_dispatch_send(queue_t *, mblk_t *, calllist_t *, uint_t,
					uint_t);

static int clnt_delay(clock_t, bool_t);

static int waitforack(calllist_t *, t_scalar_t, const struct timeval *, bool_t);
    407 
/*
 * Operations vector for TCP/IP based RPC.  clnt_cots_kcreate() installs
 * this as cl_ops on every handle it creates.
 */
static struct clnt_ops tcp_ops = {
	clnt_cots_kcallit,	/* do rpc call */
	clnt_cots_kabort,	/* abort call */
	clnt_cots_kerror,	/* return error status */
	clnt_cots_kfreeres,	/* free results */
	clnt_cots_kdestroy,	/* destroy rpc handle */
	clnt_cots_kcontrol,	/* the ioctl() of rpc */
	clnt_cots_ksettimers,	/* set retry timers */
};
    420 
static int rpc_kstat_instance = 0;  /* keeps the current instance */
				/* number for the next kstat_create */

/* Head of the connection manager's list of transports (via x_next). */
static struct cm_xprt *cm_hd = NULL;
static kmutex_t connmgr_lock;	/* for connection mngr's list of transports */

/* Held while raising *clnt_max_msg_sizep; defined outside this file. */
extern kmutex_t clnt_max_msg_lock;

static calllist_t *clnt_pending = NULL;
extern kmutex_t clnt_pending_lock;

static int clnt_cots_hash_size = DEFAULT_HASH_SIZE;

static call_table_t *cots_call_ht;
    435 
/*
 * Client-side COTS RPC statistics.  cots_rcstat_tmpl supplies the kstat
 * name and data type (all 64-bit unsigned counters) for each member.
 */
static const struct rpc_cots_client {
	kstat_named_t	rccalls;
	kstat_named_t	rcbadcalls;
	kstat_named_t	rcbadxids;
	kstat_named_t	rctimeouts;
	kstat_named_t	rcnewcreds;
	kstat_named_t	rcbadverfs;
	kstat_named_t	rctimers;
	kstat_named_t	rccantconn;
	kstat_named_t	rcnomem;
	kstat_named_t	rcintrs;
} cots_rcstat_tmpl = {
	{ "calls",	KSTAT_DATA_UINT64 },
	{ "badcalls",	KSTAT_DATA_UINT64 },
	{ "badxids",	KSTAT_DATA_UINT64 },
	{ "timeouts",	KSTAT_DATA_UINT64 },
	{ "newcreds",	KSTAT_DATA_UINT64 },
	{ "badverfs",	KSTAT_DATA_UINT64 },
	{ "timers",	KSTAT_DATA_UINT64 },
	{ "cantconn",	KSTAT_DATA_UINT64 },
	{ "nomem",	KSTAT_DATA_UINT64 },
	{ "interrupts", KSTAT_DATA_UINT64 }
};

/*
 * Atomically add one to counter x of the rpc_cots_client statistics
 * pointed to by p.
 */
#define	COTSRCSTAT_INCR(p, x)	\
	atomic_add_64(&(p)->x.value.ui64, 1)
    462 
#define	CLNT_MAX_CONNS	1	/* concurrent connections between clnt/srvr */
static int clnt_max_conns = CLNT_MAX_CONNS;

#define	CLNT_MIN_TIMEOUT	10	/* seconds to wait after we get a */
					/* connection reset */
#define	CLNT_MIN_CONNTIMEOUT	5	/* seconds to wait for a connection */


/* Variable copies of the two minimum timeouts above. */
static int clnt_cots_min_tout = CLNT_MIN_TIMEOUT;
static int clnt_cots_min_conntout = CLNT_MIN_CONNTIMEOUT;

/*
 * Limit the number of times we will attempt to receive a reply without
 * re-sending the request.
 */
#define	CLNT_MAXRECV_WITHOUT_RETRY	3
static uint_t clnt_cots_maxrecv	= CLNT_MAXRECV_WITHOUT_RETRY;

/*
 * When non-NULL, points at the message size sanity limit that
 * clnt_cots_kcreate() raises if a handle needs a larger maximum.
 */
uint_t *clnt_max_msg_sizep;
void (*clnt_stop_idle)(queue_t *wq);

/* Convert between the private data (p) and its embedded handle (h). */
#define	ptoh(p)		(&((p)->cku_client))
#define	htop(h)		((cku_private_t *)((h)->cl_private))

/*
 * Times to retry
 */
#define	REFRESHES	2	/* authentication refreshes */

/*
 * The following is used to determine the global default behavior for
 * COTS when binding to a local port.
 *
 * If the value is set to 1 the default will be to select a reserved
 * (aka privileged) port, if the value is zero the default will be to
 * use non-reserved ports.  Users of kRPC may override this by using
 * CLNT_CONTROL() and CLSET_BINDRESVPORT.
 */
static int clnt_cots_do_bindresvport = 1;

static zone_key_t zone_cots_key;
    504 
    505 /*
    506  * We need to do this after all kernel threads in the zone have exited.
    507  */
    508 /* ARGSUSED */
    509 static void
    510 clnt_zone_destroy(zoneid_t zoneid, void *unused)
    511 {
    512 	struct cm_xprt **cmp;
    513 	struct cm_xprt *cm_entry;
    514 	struct cm_xprt *freelist = NULL;
    515 
    516 	mutex_enter(&connmgr_lock);
    517 	cmp = &cm_hd;
    518 	while ((cm_entry = *cmp) != NULL) {
    519 		if (cm_entry->x_zoneid == zoneid) {
    520 			*cmp = cm_entry->x_next;
    521 			cm_entry->x_next = freelist;
    522 			freelist = cm_entry;
    523 		} else {
    524 			cmp = &cm_entry->x_next;
    525 		}
    526 	}
    527 	mutex_exit(&connmgr_lock);
    528 	while ((cm_entry = freelist) != NULL) {
    529 		freelist = cm_entry->x_next;
    530 		connmgr_close(cm_entry);
    531 	}
    532 }
    533 
    534 int
    535 clnt_cots_kcreate(dev_t dev, struct netbuf *addr, int family, rpcprog_t prog,
    536 	rpcvers_t vers, uint_t max_msgsize, cred_t *cred, CLIENT **ncl)
    537 {
    538 	CLIENT *h;
    539 	cku_private_t *p;
    540 	struct rpc_msg call_msg;
    541 	struct rpcstat *rpcstat;
    542 
    543 	RPCLOG(8, "clnt_cots_kcreate: prog %u\n", prog);
    544 
    545 	rpcstat = zone_getspecific(rpcstat_zone_key, rpc_zone());
    546 	ASSERT(rpcstat != NULL);
    547 
    548 	/* Allocate and intialize the client handle. */
    549 	p = kmem_zalloc(sizeof (*p), KM_SLEEP);
    550 
    551 	h = ptoh(p);
    552 
    553 	h->cl_private = (caddr_t)p;
    554 	h->cl_auth = authkern_create();
    555 	h->cl_ops = &tcp_ops;
    556 
    557 	cv_init(&p->cku_call.call_cv, NULL, CV_DEFAULT, NULL);
    558 	mutex_init(&p->cku_call.call_lock, NULL, MUTEX_DEFAULT, NULL);
    559 
    560 	/*
    561 	 * If the current sanity check size in rpcmod is smaller
    562 	 * than the size needed, then increase the sanity check.
    563 	 */
    564 	if (max_msgsize != 0 && clnt_max_msg_sizep != NULL &&
    565 	    max_msgsize > *clnt_max_msg_sizep) {
    566 		mutex_enter(&clnt_max_msg_lock);
    567 		if (max_msgsize > *clnt_max_msg_sizep)
    568 			*clnt_max_msg_sizep = max_msgsize;
    569 		mutex_exit(&clnt_max_msg_lock);
    570 	}
    571 
    572 	p->cku_outbuflen = COTS_DEFAULT_ALLOCSIZE;
    573 
    574 	/* Preserialize the call message header */
    575 
    576 	call_msg.rm_xid = 0;
    577 	call_msg.rm_direction = CALL;
    578 	call_msg.rm_call.cb_rpcvers = RPC_MSG_VERSION;
    579 	call_msg.rm_call.cb_prog = prog;
    580 	call_msg.rm_call.cb_vers = vers;
    581 
    582 	xdrmem_create(&p->cku_outxdr, p->cku_rpchdr, WIRE_HDR_SIZE, XDR_ENCODE);
    583 
    584 	if (!xdr_callhdr(&p->cku_outxdr, &call_msg)) {
    585 		RPCLOG0(1, "clnt_cots_kcreate - Fatal header serialization "
    586 		    "error\n");
    587 		auth_destroy(h->cl_auth);
    588 		kmem_free(p, sizeof (cku_private_t));
    589 		RPCLOG0(1, "clnt_cots_kcreate: create failed error EINVAL\n");
    590 		return (EINVAL);		/* XXX */
    591 	}
    592 
    593 	/*
    594 	 * The zalloc initialized the fields below.
    595 	 * p->cku_xid = 0;
    596 	 * p->cku_flags = 0;
    597 	 * p->cku_srcaddr.len = 0;
    598 	 * p->cku_srcaddr.maxlen = 0;
    599 	 */
    600 
    601 	p->cku_cred = cred;
    602 	p->cku_device = dev;
    603 	p->cku_addrfmly = family;
    604 	p->cku_addr.buf = kmem_zalloc(addr->maxlen, KM_SLEEP);
    605 	p->cku_addr.maxlen = addr->maxlen;
    606 	p->cku_addr.len = addr->len;
    607 	bcopy(addr->buf, p->cku_addr.buf, addr->len);
    608 	p->cku_stats = rpcstat->rpc_cots_client;
    609 	p->cku_useresvport = -1; /* value is has not been set */
    610 
    611 	*ncl = h;
    612 	return (0);
    613 }
    614 
/*
 * Abort entry of the operations vector.  The body is empty, so calling
 * it has no effect on the handle.
 */
/*ARGSUSED*/
static void
clnt_cots_kabort(CLIENT *h)
{
}
    620 
    621 /*
    622  * Return error info on this handle.
    623  */
    624 static void
    625 clnt_cots_kerror(CLIENT *h, struct rpc_err *err)
    626 {
    627 	/* LINTED pointer alignment */
    628 	cku_private_t *p = htop(h);
    629 
    630 	*err = p->cku_err;
    631 }
    632 
    633 static bool_t
    634 clnt_cots_kfreeres(CLIENT *h, xdrproc_t xdr_res, caddr_t res_ptr)
    635 {
    636 	/* LINTED pointer alignment */
    637 	cku_private_t *p = htop(h);
    638 	XDR *xdrs;
    639 
    640 	xdrs = &(p->cku_outxdr);
    641 	xdrs->x_op = XDR_FREE;
    642 	return ((*xdr_res)(xdrs, res_ptr));
    643 }
    644 
    645 static bool_t
    646 clnt_cots_kcontrol(CLIENT *h, int cmd, char *arg)
    647 {
    648 	cku_private_t *p = htop(h);
    649 
    650 	switch (cmd) {
    651 	case CLSET_PROGRESS:
    652 		p->cku_progress = TRUE;
    653 		return (TRUE);
    654 
    655 	case CLSET_XID:
    656 		if (arg == NULL)
    657 			return (FALSE);
    658 
    659 		p->cku_xid = *((uint32_t *)arg);
    660 		return (TRUE);
    661 
    662 	case CLGET_XID:
    663 		if (arg == NULL)
    664 			return (FALSE);
    665 
    666 		*((uint32_t *)arg) = p->cku_xid;
    667 		return (TRUE);
    668 
    669 	case CLSET_NODELAYONERR:
    670 		if (arg == NULL)
    671 			return (FALSE);
    672 
    673 		if (*((bool_t *)arg) == TRUE) {
    674 			p->cku_nodelayonerr = TRUE;
    675 			return (TRUE);
    676 		}
    677 		if (*((bool_t *)arg) == FALSE) {
    678 			p->cku_nodelayonerr = FALSE;
    679 			return (TRUE);
    680 		}
    681 		return (FALSE);
    682 
    683 	case CLGET_NODELAYONERR:
    684 		if (arg == NULL)
    685 			return (FALSE);
    686 
    687 		*((bool_t *)arg) = p->cku_nodelayonerr;
    688 		return (TRUE);
    689 
    690 	case CLSET_BINDRESVPORT:
    691 		if (arg == NULL)
    692 			return (FALSE);
    693 
    694 		if (*(int *)arg != 1 && *(int *)arg != 0)
    695 			return (FALSE);
    696 
    697 		p->cku_useresvport = *(int *)arg;
    698 
    699 		return (TRUE);
    700 
    701 	case CLGET_BINDRESVPORT:
    702 		if (arg == NULL)
    703 			return (FALSE);
    704 
    705 		*(int *)arg = p->cku_useresvport;
    706 
    707 		return (TRUE);
    708 
    709 	default:
    710 		return (FALSE);
    711 	}
    712 }
    713 
    714 /*
    715  * Destroy rpc handle.  Frees the space used for output buffer,
    716  * private data, and handle structure.
    717  */
    718 static void
    719 clnt_cots_kdestroy(CLIENT *h)
    720 {
    721 	/* LINTED pointer alignment */
    722 	cku_private_t *p = htop(h);
    723 	calllist_t *call = &p->cku_call;
    724 
    725 	RPCLOG(8, "clnt_cots_kdestroy h: %p\n", (void *)h);
    726 	RPCLOG(8, "clnt_cots_kdestroy h: xid=0x%x\n", p->cku_xid);
    727 
    728 	if (p->cku_flags & CKU_ONQUEUE) {
    729 		RPCLOG(64, "clnt_cots_kdestroy h: removing call for xid 0x%x "
    730 		    "from dispatch list\n", p->cku_xid);
    731 		call_table_remove(call);
    732 	}
    733 
    734 	if (call->call_reply)
    735 		freemsg(call->call_reply);
    736 	cv_destroy(&call->call_cv);
    737 	mutex_destroy(&call->call_lock);
    738 
    739 	kmem_free(p->cku_srcaddr.buf, p->cku_srcaddr.maxlen);
    740 	kmem_free(p->cku_addr.buf, p->cku_addr.maxlen);
    741 	kmem_free(p, sizeof (*p));
    742 }
    743 
/* NOTE(review): no use of this counter is visible in this excerpt. */
static int clnt_cots_pulls;
#define	RM_HDR_SIZE	4	/* record mark header size */
    746 
    747 /*
    748  * Call remote procedure.
    749  */
    750 static enum clnt_stat
    751 clnt_cots_kcallit(CLIENT *h, rpcproc_t procnum, xdrproc_t xdr_args,
    752     caddr_t argsp, xdrproc_t xdr_results, caddr_t resultsp, struct timeval wait)
    753 {
    754 	/* LINTED pointer alignment */
    755 	cku_private_t *p = htop(h);
    756 	calllist_t *call = &p->cku_call;
    757 	XDR *xdrs;
    758 	struct rpc_msg reply_msg;
    759 	mblk_t *mp;
    760 #ifdef	RPCDEBUG
    761 	clock_t time_sent;
    762 #endif
    763 	struct netbuf *retryaddr;
    764 	struct cm_xprt *cm_entry = NULL;
    765 	queue_t *wq;
    766 	int len;
    767 	int mpsize;
    768 	int refreshes = REFRESHES;
    769 	int interrupted;
    770 	int tidu_size;
    771 	enum clnt_stat status;
    772 	struct timeval cwait;
    773 	bool_t delay_first = FALSE;
    774 	clock_t ticks;
    775 
    776 	RPCLOG(2, "clnt_cots_kcallit, procnum %u\n", procnum);
    777 	COTSRCSTAT_INCR(p->cku_stats, rccalls);
    778 
    779 	RPCLOG(2, "clnt_cots_kcallit: wait.tv_sec: %ld\n", wait.tv_sec);
    780 	RPCLOG(2, "clnt_cots_kcallit: wait.tv_usec: %ld\n", wait.tv_usec);
    781 
    782 	/*
    783 	 * Bug ID 1240234:
    784 	 * Look out for zero length timeouts. We don't want to
    785 	 * wait zero seconds for a connection to be established.
    786 	 */
    787 	if (wait.tv_sec < clnt_cots_min_conntout) {
    788 		cwait.tv_sec = clnt_cots_min_conntout;
    789 		cwait.tv_usec = 0;
    790 		RPCLOG(8, "clnt_cots_kcallit: wait.tv_sec (%ld) too low,",
    791 		    wait.tv_sec);
    792 		RPCLOG(8, " setting to: %d\n", clnt_cots_min_conntout);
    793 	} else {
    794 		cwait = wait;
    795 	}
    796 
    797 call_again:
    798 	if (cm_entry) {
    799 		connmgr_release(cm_entry);
    800 		cm_entry = NULL;
    801 	}
    802 
    803 	mp = NULL;
    804 
    805 	/*
    806 	 * If the call is not a retry, allocate a new xid and cache it
    807 	 * for future retries.
    808 	 * Bug ID 1246045:
    809 	 * Treat call as a retry for purposes of binding the source
    810 	 * port only if we actually attempted to send anything on
    811 	 * the previous call.
    812 	 */
    813 	if (p->cku_xid == 0) {
    814 		p->cku_xid = alloc_xid();
    815 		/*
    816 		 * We need to ASSERT here that our xid != 0 because this
    817 		 * determines whether or not our call record gets placed on
    818 		 * the hash table or the linked list.  By design, we mandate
    819 		 * that RPC calls over cots must have xid's != 0, so we can
    820 		 * ensure proper management of the hash table.
    821 		 */
    822 		ASSERT(p->cku_xid != 0);
    823 
    824 		retryaddr = NULL;
    825 		p->cku_flags &= ~CKU_SENT;
    826 
    827 		if (p->cku_flags & CKU_ONQUEUE) {
    828 			RPCLOG(8, "clnt_cots_kcallit: new call, dequeuing old"
    829 			    " one (%p)\n", (void *)call);
    830 			call_table_remove(call);
    831 			p->cku_flags &= ~CKU_ONQUEUE;
    832 			RPCLOG(64, "clnt_cots_kcallit: removing call from "
    833 			    "dispatch list because xid was zero (now 0x%x)\n",
    834 			    p->cku_xid);
    835 		}
    836 
    837 		if (call->call_reply != NULL) {
    838 			freemsg(call->call_reply);
    839 			call->call_reply = NULL;
    840 		}
    841 	} else if (p->cku_srcaddr.buf == NULL || p->cku_srcaddr.len == 0) {
    842 		retryaddr = NULL;
    843 
    844 	} else if (p->cku_flags & CKU_SENT) {
    845 		retryaddr = &p->cku_srcaddr;
    846 
    847 	} else {
    848 		/*
    849 		 * Bug ID 1246045: Nothing was sent, so set retryaddr to
    850 		 * NULL and let connmgr_get() bind to any source port it
    851 		 * can get.
    852 		 */
    853 		retryaddr = NULL;
    854 	}
    855 
    856 	RPCLOG(64, "clnt_cots_kcallit: xid = 0x%x", p->cku_xid);
    857 	RPCLOG(64, " flags = 0x%x\n", p->cku_flags);
    858 
    859 	p->cku_err.re_status = RPC_TIMEDOUT;
    860 	p->cku_err.re_errno = p->cku_err.re_terrno = 0;
    861 
    862 	cm_entry = connmgr_wrapget(retryaddr, &cwait, p);
    863 
    864 	if (cm_entry == NULL) {
    865 		RPCLOG(1, "clnt_cots_kcallit: can't connect status %s\n",
    866 		    clnt_sperrno(p->cku_err.re_status));
    867 
    868 		/*
    869 		 * The reasons why we fail to create a connection are
    870 		 * varied. In most cases we don't want the caller to
    871 		 * immediately retry. This could have one or more
    872 		 * bad effects. This includes flooding the net with
    873 		 * connect requests to ports with no listener; a hard
    874 		 * kernel loop due to all the "reserved" TCP ports being
    875 		 * in use.
    876 		 */
    877 		delay_first = TRUE;
    878 
    879 		/*
    880 		 * Even if we end up returning EINTR, we still count
    881 		 * a "can't connect", because the connection manager
    882 		 * might have been committed to waiting for or timing out on
    883 		 * a connection.
    884 		 */
    885 		COTSRCSTAT_INCR(p->cku_stats, rccantconn);
    886 		switch (p->cku_err.re_status) {
    887 		case RPC_INTR:
    888 			p->cku_err.re_errno = EINTR;
    889 
    890 			/*
    891 			 * No need to delay because a UNIX signal(2)
    892 			 * interrupted us. The caller likely won't
    893 			 * retry the CLNT_CALL() and even if it does,
    894 			 * we assume the caller knows what it is doing.
    895 			 */
    896 			delay_first = FALSE;
    897 			break;
    898 
    899 		case RPC_TIMEDOUT:
    900 			p->cku_err.re_errno = ETIMEDOUT;
    901 
    902 			/*
    903 			 * No need to delay because timed out already
    904 			 * on the connection request and assume that the
    905 			 * transport time out is longer than our minimum
    906 			 * timeout, or least not too much smaller.
    907 			 */
    908 			delay_first = FALSE;
    909 			break;
    910 
    911 		case RPC_SYSTEMERROR:
    912 		case RPC_TLIERROR:
    913 			/*
    914 			 * We want to delay here because a transient
    915 			 * system error has a better chance of going away
    916 			 * if we delay a bit. If it's not transient, then
    917 			 * we don't want end up in a hard kernel loop
    918 			 * due to retries.
    919 			 */
    920 			ASSERT(p->cku_err.re_errno != 0);
    921 			break;
    922 
    923 
    924 		case RPC_CANTCONNECT:
    925 			/*
    926 			 * RPC_CANTCONNECT is set on T_ERROR_ACK which
    927 			 * implies some error down in the TCP layer or
    928 			 * below. If cku_nodelayonerror is set then we
    929 			 * assume the caller knows not to try too hard.
    930 			 */
    931 			RPCLOG0(8, "clnt_cots_kcallit: connection failed,");
    932 			RPCLOG0(8, " re_status=RPC_CANTCONNECT,");
    933 			RPCLOG(8, " re_errno=%d,", p->cku_err.re_errno);
    934 			RPCLOG(8, " cku_nodelayonerr=%d", p->cku_nodelayonerr);
    935 			if (p->cku_nodelayonerr == TRUE)
    936 				delay_first = FALSE;
    937 
    938 			p->cku_err.re_errno = EIO;
    939 
    940 			break;
    941 
    942 		case RPC_XPRTFAILED:
    943 			/*
    944 			 * We want to delay here because we likely
    945 			 * got a refused connection.
    946 			 */
    947 			if (p->cku_err.re_errno == 0)
    948 				p->cku_err.re_errno = EIO;
    949 
    950 			RPCLOG(1, "clnt_cots_kcallit: transport failed: %d\n",
    951 			    p->cku_err.re_errno);
    952 
    953 			break;
    954 
    955 		default:
    956 			/*
    957 			 * We delay here because it is better to err
    958 			 * on the side of caution. If we got here then
    959 			 * status could have been RPC_SUCCESS, but we
    960 			 * know that we did not get a connection, so
    961 			 * force the rpc status to RPC_CANTCONNECT.
    962 			 */
    963 			p->cku_err.re_status = RPC_CANTCONNECT;
    964 			p->cku_err.re_errno = EIO;
    965 			break;
    966 		}
    967 		if (delay_first == TRUE)
    968 			ticks = clnt_cots_min_tout * drv_usectohz(1000000);
    969 		goto cots_done;
    970 	}
    971 
    972 	/*
    973 	 * If we've never sent any request on this connection (send count
    974 	 * is zero, or the connection has been reset), cache the
    975 	 * connection's create time and send a request (possibly a retry).
    976 	 */
    977 	if ((p->cku_flags & CKU_SENT) == 0 ||
    978 	    p->cku_ctime != cm_entry->x_ctime) {
    979 		p->cku_ctime = cm_entry->x_ctime;
    980 
    981 	} else if ((p->cku_flags & CKU_SENT) && (p->cku_flags & CKU_ONQUEUE) &&
    982 	    (call->call_reply != NULL ||
    983 	    p->cku_recv_attempts < clnt_cots_maxrecv)) {
    984 
    985 		/*
    986 		 * If we've sent a request and our call is on the dispatch
    987 		 * queue and we haven't made too many receive attempts, then
    988 		 * don't re-send, just receive.
    989 		 */
    990 		p->cku_recv_attempts++;
    991 		goto read_again;
    992 	}
    993 
    994 	/*
    995 	 * Now we create the RPC request in a STREAMS message.  We have to do
    996 	 * this after the call to connmgr_get so that we have the correct
    997 	 * TIDU size for the transport.
    998 	 */
    999 	tidu_size = cm_entry->x_tidu_size;
   1000 	len = MSG_OFFSET + MAX(tidu_size, RM_HDR_SIZE + WIRE_HDR_SIZE);
   1001 
   1002 	while ((mp = allocb(len, BPRI_MED)) == NULL) {
   1003 		if (strwaitbuf(len, BPRI_MED)) {
   1004 			p->cku_err.re_status = RPC_SYSTEMERROR;
   1005 			p->cku_err.re_errno = ENOSR;
   1006 			COTSRCSTAT_INCR(p->cku_stats, rcnomem);
   1007 			goto cots_done;
   1008 		}
   1009 	}
   1010 	xdrs = &p->cku_outxdr;
   1011 	xdrmblk_init(xdrs, mp, XDR_ENCODE, tidu_size);
   1012 	mpsize = MBLKSIZE(mp);
   1013 	ASSERT(mpsize >= len);
   1014 	ASSERT(mp->b_rptr == mp->b_datap->db_base);
   1015 
   1016 	/*
   1017 	 * If the size of mblk is not appreciably larger than what we
   1018 	 * asked, then resize the mblk to exactly len bytes. The reason for
   1019 	 * this: suppose len is 1600 bytes, the tidu is 1460 bytes
   1020 	 * (from TCP over ethernet), and the arguments to the RPC require
   1021 	 * 2800 bytes. Ideally we want the protocol to render two
   1022 	 * ~1400 byte segments over the wire. However if allocb() gives us a 2k
   1023 	 * mblk, and we allocate a second mblk for the remainder, the protocol
   1024 	 * module may generate 3 segments over the wire:
   1025 	 * 1460 bytes for the first, 448 (2048 - 1600) for the second, and
   1026 	 * 892 for the third. If we "waste" 448 bytes in the first mblk,
   1027 	 * the XDR encoding will generate two ~1400 byte mblks, and the
   1028 	 * protocol module is more likely to produce properly sized segments.
   1029 	 */
   1030 	if ((mpsize >> 1) <= len)
   1031 		mp->b_rptr += (mpsize - len);
   1032 
   1033 	/*
   1034 	 * Adjust b_rptr to reserve space for the non-data protocol headers
   1035 	 * any downstream modules might like to add, and for the
   1036 	 * record marking header.
   1037 	 */
   1038 	mp->b_rptr += (MSG_OFFSET + RM_HDR_SIZE);
   1039 
   1040 	if (h->cl_auth->ah_cred.oa_flavor != RPCSEC_GSS) {
   1041 		/* Copy in the preserialized RPC header information. */
   1042 		bcopy(p->cku_rpchdr, mp->b_rptr, WIRE_HDR_SIZE);
   1043 
   1044 		/* Use XDR_SETPOS() to set the b_wptr to past the RPC header. */
   1045 		XDR_SETPOS(xdrs, (uint_t)(mp->b_rptr - mp->b_datap->db_base +
   1046 		    WIRE_HDR_SIZE));
   1047 
   1048 		ASSERT((mp->b_wptr - mp->b_rptr) == WIRE_HDR_SIZE);
   1049 
   1050 		/* Serialize the procedure number and the arguments. */
   1051 		if ((!XDR_PUTINT32(xdrs, (int32_t *)&procnum)) ||
   1052 		    (!AUTH_MARSHALL(h->cl_auth, xdrs, p->cku_cred)) ||
   1053 		    (!(*xdr_args)(xdrs, argsp))) {
   1054 			p->cku_err.re_status = RPC_CANTENCODEARGS;
   1055 			p->cku_err.re_errno = EIO;
   1056 			goto cots_done;
   1057 		}
   1058 
   1059 		(*(uint32_t *)(mp->b_rptr)) = p->cku_xid;
   1060 	} else {
   1061 		uint32_t *uproc = (uint32_t *)&p->cku_rpchdr[WIRE_HDR_SIZE];
   1062 		IXDR_PUT_U_INT32(uproc, procnum);
   1063 
   1064 		(*(uint32_t *)(&p->cku_rpchdr[0])) = p->cku_xid;
   1065 
   1066 		/* Use XDR_SETPOS() to set the b_wptr. */
   1067 		XDR_SETPOS(xdrs, (uint_t)(mp->b_rptr - mp->b_datap->db_base));
   1068 
   1069 		/* Serialize the procedure number and the arguments. */
   1070 		if (!AUTH_WRAP(h->cl_auth, p->cku_rpchdr, WIRE_HDR_SIZE+4,
   1071 		    xdrs, xdr_args, argsp)) {
   1072 			p->cku_err.re_status = RPC_CANTENCODEARGS;
   1073 			p->cku_err.re_errno = EIO;
   1074 			goto cots_done;
   1075 		}
   1076 	}
   1077 
   1078 	RPCLOG(2, "clnt_cots_kcallit: connected, sending call, tidu_size %d\n",
   1079 	    tidu_size);
   1080 
   1081 	wq = cm_entry->x_wq;
   1082 	clnt_dispatch_send(wq, mp, call, p->cku_xid,
   1083 				(p->cku_flags & CKU_ONQUEUE));
   1084 
   1085 	RPCLOG(64, "clnt_cots_kcallit: sent call for xid 0x%x\n",
   1086 		(uint_t)p->cku_xid);
   1087 	p->cku_flags = (CKU_ONQUEUE|CKU_SENT);
   1088 	p->cku_recv_attempts = 1;
   1089 
   1090 #ifdef	RPCDEBUG
   1091 	time_sent = lbolt;
   1092 #endif
   1093 
   1094 	/*
   1095 	 * Wait for a reply or a timeout.  If there is no error or timeout,
   1096 	 * (both indicated by call_status), call->call_reply will contain
   1097 	 * the RPC reply message.
   1098 	 */
   1099 read_again:
   1100 	mutex_enter(&call->call_lock);
   1101 	interrupted = 0;
   1102 	if (call->call_status == RPC_TIMEDOUT) {
   1103 		/*
   1104 		 * Indicate that the lwp is not to be stopped while waiting
   1105 		 * for this network traffic.  This is to avoid deadlock while
   1106 		 * debugging a process via /proc and also to avoid recursive
   1107 		 * mutex_enter()s due to NFS page faults while stopping
   1108 		 * (NFS holds locks when it calls here).
   1109 		 */
   1110 		clock_