/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
 * Use is subject to license terms.
 */

/*
 * Copyright (c) 1983, 1984, 1985, 1986, 1987, 1988, 1989 AT&T
 * All Rights Reserved
 */

/*
 * Portions of this source code were derived from Berkeley 4.3 BSD
 * under license from the Regents of the University of California.
 */

#pragma ident "@(#)clnt_cots.c 1.125 07/06/04 SMI"

/*
 * Implements a kernel based, client side RPC over Connection Oriented
 * Transports (COTS).
 */

/*
 * Much of this file has been re-written to let NFS work better over slow
 * transports. A description follows.
 *
 * One of the annoying things about kRPC/COTS is that it will temporarily
 * create more than one connection between a client and server. This
 * happens because when a connection is made, the end-point's entry in the
 * linked list of connections (headed by cm_hd), is removed so that other
 * threads don't mess with it. Went ahead and bit the bullet by keeping
 * the endpoint on the connection list and introducing state bits,
 * condition variables etc. to the connection entry data structure (struct
 * cm_xprt).
 *
 * Here is a summary of the changes to cm_xprt:
 *
 *	x_ctime is the timestamp of when the endpoint was last
 *	connected or disconnected. If an end-point is ever disconnected
 *	or re-connected, then any outstanding RPC request is presumed
 *	lost, telling clnt_cots_kcallit that it needs to re-send the
 *	request, not just wait for the original request's reply to
 *	arrive.
 *
 *	x_thread flag which tells us if a thread is doing a connection attempt.
 *
 *	x_waitdis flag which tells us we are waiting for a disconnect ACK.
 *
 *	x_needdis flag which tells us we need to send a T_DISCON_REQ
 *	to kill the connection.
 *
 *	x_needrel flag which tells us we need to send a T_ORDREL_REQ to
 *	gracefully close the connection.
 *
 *	#defined bitmasks for all the b_* bits so that more
 *	efficient (and at times less clumsy) masks can be used to
 *	manipulate state in cases where multiple bits have to be
 *	set/cleared/checked in the same critical section.
 *
 *	x_conn_cv and x_dis_cv are new condition variables to let
 *	threads know when the connection attempt is done, and to let
 *	the connecting thread know when the disconnect handshake is
 *	done.
 *
 * Added the CONN_HOLD() macro so that all reference holds have the same
 * look and feel.
 *
 * In the private (cku_private) portion of the client handle,
 *
 *	cku_flags replaces the cku_sent boolean. cku_flags keeps
 *	track of whether a request has been sent, and whether the
 *	client handle's call record is on the dispatch list (so that
 *	the reply can be matched by XID to the right client handle).
 *	The idea of CKU_ONQUEUE is that we can exit clnt_cots_kcallit()
 *	and still have the response find the right client handle so
 *	that the retry of CLNT_CALL() gets the result. Testing found
 *	situations where if the timeout was increased, performance
 *	degraded. This was due to us hitting a window where the thread
 *	was back in rfscall() (probably printing server not responding)
 *	while the response came back but there was no place to put it.
 *
 *	cku_ctime is just a cache of x_ctime. If they match,
 *	clnt_cots_kcallit() won't send a retry (unless the maximum
 *	receive count limit has been reached). If they don't match, then
 *	we assume the request has been lost, and a retry of the request
 *	is needed.
 *
 *	cku_recv_attempts counts the number of receive attempts
 *	after one try is sent on the wire.
 *
 * Added the clnt_delay() routine so that interruptible and
 * noninterruptible delays are possible.
 *
 * CLNT_MIN_TIMEOUT has been bumped to 10 seconds from 3. This is used to
 * control how long the client delays before returning after getting
 * ECONNREFUSED. At 3 seconds, 8 client threads per mount really does bash
 * a server that may be booting and not yet started nfsd.
 *
 * CLNT_MAXRECV_WITHOUT_RETRY is a new macro (value of 3) (with a tunable)
 * Why don't we just wait forever (receive an infinite # of times)?
 * Because the server may have rebooted. More insidious is that some
 * servers (ours) will drop NFS/TCP requests in some cases. This is bad,
 * but it is a reality.
 *
 * The case of a server doing orderly release really messes up the
 * client's recovery, especially if the server's TCP implementation is
 * buggy. It was found that the kRPC/COTS client was breaking some
 * TPI rules, such as not waiting for the acknowledgement of a
 * T_DISCON_REQ (hence the added case statements T_ERROR_ACK, T_OK_ACK and
 * T_DISCON_REQ in clnt_dispatch_notifyall()).
 *
 * One of the things that we've seen is that a kRPC TCP endpoint goes into
 * TIMEWAIT and thus a reconnect takes a long time to satisfy because
 * the TIMEWAIT state takes a while to finish. If a server sends a
 * T_ORDREL_IND, there is little point in an RPC client doing a
 * T_ORDREL_REQ, because the RPC request isn't going to make it (the
 * server is saying that it won't accept any more data). So kRPC was
 * changed to send a T_DISCON_REQ when we get a T_ORDREL_IND. So now the
 * connection skips the TIMEWAIT state and goes straight to a bound state
 * that kRPC can quickly switch to connected.
 *
 * Code that issues TPI requests must use waitforack() to wait for the
 * corresponding ack (assuming there is one) in any future modifications.
 * This works around problems that may be introduced by breaking TPI rules
 * (by submitting new calls before earlier requests have been acked) in the
 * case of a signal or other early return. waitforack() depends on
 * clnt_dispatch_notifyconn() to issue the wakeup when the ack
 * arrives, so adding new TPI calls may require corresponding changes
 * to clnt_dispatch_notifyconn(). Presently, the timeout period is based on
 * CLNT_MIN_TIMEOUT which is 10 seconds. If you modify this value, be sure
 * not to set it too low or TPI ACKS will be lost.
152 */ 153 154 #include <sys/param.h> 155 #include <sys/types.h> 156 #include <sys/user.h> 157 #include <sys/systm.h> 158 #include <sys/sysmacros.h> 159 #include <sys/proc.h> 160 #include <sys/socket.h> 161 #include <sys/file.h> 162 #include <sys/stream.h> 163 #include <sys/strsubr.h> 164 #include <sys/stropts.h> 165 #include <sys/strsun.h> 166 #include <sys/timod.h> 167 #include <sys/tiuser.h> 168 #include <sys/tihdr.h> 169 #include <sys/t_kuser.h> 170 #include <sys/fcntl.h> 171 #include <sys/errno.h> 172 #include <sys/kmem.h> 173 #include <sys/debug.h> 174 #include <sys/systm.h> 175 #include <sys/kstat.h> 176 #include <sys/t_lock.h> 177 #include <sys/ddi.h> 178 #include <sys/cmn_err.h> 179 #include <sys/time.h> 180 #include <sys/isa_defs.h> 181 #include <sys/callb.h> 182 #include <sys/sunddi.h> 183 #include <sys/atomic.h> 184 185 #include <netinet/in.h> 186 #include <netinet/tcp.h> 187 188 #include <rpc/types.h> 189 #include <rpc/xdr.h> 190 #include <rpc/auth.h> 191 #include <rpc/clnt.h> 192 #include <rpc/rpc_msg.h> 193 194 #define COTS_DEFAULT_ALLOCSIZE 2048 195 196 #define WIRE_HDR_SIZE 20 /* serialized call header, sans proc number */ 197 #define MSG_OFFSET 128 /* offset of call into the mblk */ 198 199 const char *kinet_ntop6(uchar_t *, char *, size_t); 200 201 static int clnt_cots_ksettimers(CLIENT *, struct rpc_timers *, 202 struct rpc_timers *, int, void(*)(int, int, caddr_t), caddr_t, uint32_t); 203 static enum clnt_stat clnt_cots_kcallit(CLIENT *, rpcproc_t, xdrproc_t, 204 caddr_t, xdrproc_t, caddr_t, struct timeval); 205 static void clnt_cots_kabort(CLIENT *); 206 static void clnt_cots_kerror(CLIENT *, struct rpc_err *); 207 static bool_t clnt_cots_kfreeres(CLIENT *, xdrproc_t, caddr_t); 208 static void clnt_cots_kdestroy(CLIENT *); 209 static bool_t clnt_cots_kcontrol(CLIENT *, int, char *); 210 211 212 /* List of transports managed by the connection manager. 
*/ 213 struct cm_xprt { 214 TIUSER *x_tiptr; /* transport handle */ 215 queue_t *x_wq; /* send queue */ 216 clock_t x_time; /* last time we handed this xprt out */ 217 clock_t x_ctime; /* time we went to CONNECTED */ 218 int x_tidu_size; /* TIDU size of this transport */ 219 union { 220 struct { 221 unsigned int 222 #ifdef _BIT_FIELDS_HTOL 223 b_closing: 1, /* we've sent a ord rel on this conn */ 224 b_dead: 1, /* transport is closed or disconn */ 225 b_doomed: 1, /* too many conns, let this go idle */ 226 b_connected: 1, /* this connection is connected */ 227 228 b_ordrel: 1, /* do an orderly release? */ 229 b_thread: 1, /* thread doing connect */ 230 b_waitdis: 1, /* waiting for disconnect ACK */ 231 b_needdis: 1, /* need T_DISCON_REQ */ 232 233 b_needrel: 1, /* need T_ORDREL_REQ */ 234 b_early_disc: 1, /* got a T_ORDREL_IND or T_DISCON_IND */ 235 /* disconnect during connect */ 236 237 b_pad: 22; 238 239 #endif 240 241 #ifdef _BIT_FIELDS_LTOH 242 b_pad: 22, 243 244 b_early_disc: 1, /* got a T_ORDREL_IND or T_DISCON_IND */ 245 /* disconnect during connect */ 246 b_needrel: 1, /* need T_ORDREL_REQ */ 247 248 b_needdis: 1, /* need T_DISCON_REQ */ 249 b_waitdis: 1, /* waiting for disconnect ACK */ 250 b_thread: 1, /* thread doing connect */ 251 b_ordrel: 1, /* do an orderly release? 
*/ 252 253 b_connected: 1, /* this connection is connected */ 254 b_doomed: 1, /* too many conns, let this go idle */ 255 b_dead: 1, /* transport is closed or disconn */ 256 b_closing: 1; /* we've sent a ord rel on this conn */ 257 #endif 258 } bit; unsigned int word; 259 260 #define x_closing x_state.bit.b_closing 261 #define x_dead x_state.bit.b_dead 262 #define x_doomed x_state.bit.b_doomed 263 #define x_connected x_state.bit.b_connected 264 265 #define x_ordrel x_state.bit.b_ordrel 266 #define x_thread x_state.bit.b_thread 267 #define x_waitdis x_state.bit.b_waitdis 268 #define x_needdis x_state.bit.b_needdis 269 270 #define x_needrel x_state.bit.b_needrel 271 #define x_early_disc x_state.bit.b_early_disc 272 273 #define x_state_flags x_state.word 274 275 #define X_CLOSING 0x80000000 276 #define X_DEAD 0x40000000 277 #define X_DOOMED 0x20000000 278 #define X_CONNECTED 0x10000000 279 280 #define X_ORDREL 0x08000000 281 #define X_THREAD 0x04000000 282 #define X_WAITDIS 0x02000000 283 #define X_NEEDDIS 0x01000000 284 285 #define X_NEEDREL 0x00800000 286 #define X_EARLYDISC 0x00400000 287 288 #define X_BADSTATES (X_CLOSING | X_DEAD | X_DOOMED) 289 290 } x_state; 291 int x_ref; /* number of users of this xprt */ 292 int x_family; /* address family of transport */ 293 dev_t x_rdev; /* device number of transport */ 294 struct cm_xprt *x_next; 295 296 struct netbuf x_server; /* destination address */ 297 struct netbuf x_src; /* src address (for retries) */ 298 kmutex_t x_lock; /* lock on this entry */ 299 kcondvar_t x_cv; /* to signal when can be closed */ 300 kcondvar_t x_conn_cv; /* to signal when connection attempt */ 301 /* is complete */ 302 kstat_t *x_ksp; 303 304 kcondvar_t x_dis_cv; /* to signal when disconnect attempt */ 305 /* is complete */ 306 zoneid_t x_zoneid; /* zone this xprt belongs to */ 307 }; 308 309 typedef struct cm_kstat_xprt { 310 kstat_named_t x_wq; 311 kstat_named_t x_server; 312 kstat_named_t x_family; 313 kstat_named_t x_rdev; 314 
kstat_named_t x_time; 315 kstat_named_t x_state; 316 kstat_named_t x_ref; 317 kstat_named_t x_port; 318 } cm_kstat_xprt_t; 319 320 static cm_kstat_xprt_t cm_kstat_template = { 321 { "write_queue", KSTAT_DATA_UINT32 }, 322 { "server", KSTAT_DATA_STRING }, 323 { "addr_family", KSTAT_DATA_UINT32 }, 324 { "device", KSTAT_DATA_UINT32 }, 325 { "time_stamp", KSTAT_DATA_UINT32 }, 326 { "status", KSTAT_DATA_UINT32 }, 327 { "ref_count", KSTAT_DATA_INT32 }, 328 { "port", KSTAT_DATA_UINT32 }, 329 }; 330 331 /* 332 * The inverse of this is connmgr_release(). 333 */ 334 #define CONN_HOLD(Cm_entry) {\ 335 mutex_enter(&(Cm_entry)->x_lock); \ 336 (Cm_entry)->x_ref++; \ 337 mutex_exit(&(Cm_entry)->x_lock); \ 338 } 339 340 341 /* 342 * Private data per rpc handle. This structure is allocated by 343 * clnt_cots_kcreate, and freed by clnt_cots_kdestroy. 344 */ 345 typedef struct cku_private_s { 346 CLIENT cku_client; /* client handle */ 347 calllist_t cku_call; /* for dispatching calls */ 348 struct rpc_err cku_err; /* error status */ 349 350 struct netbuf cku_srcaddr; /* source address for retries */ 351 int cku_addrfmly; /* for binding port */ 352 struct netbuf cku_addr; /* remote address */ 353 dev_t cku_device; /* device to use */ 354 uint_t cku_flags; 355 #define CKU_ONQUEUE 0x1 356 #define CKU_SENT 0x2 357 358 bool_t cku_progress; /* for CLSET_PROGRESS */ 359 uint32_t cku_xid; /* current XID */ 360 clock_t cku_ctime; /* time stamp of when */ 361 /* connection was created */ 362 uint_t cku_recv_attempts; 363 XDR cku_outxdr; /* xdr routine for output */ 364 XDR cku_inxdr; /* xdr routine for input */ 365 char cku_rpchdr[WIRE_HDR_SIZE + 4]; 366 /* pre-serialized rpc header */ 367 368 uint_t cku_outbuflen; /* default output mblk length */ 369 struct cred *cku_cred; /* credentials */ 370 bool_t cku_nodelayonerr; 371 /* for CLSET_NODELAYONERR */ 372 int cku_useresvport; /* Use reserved port */ 373 struct rpc_cots_client *cku_stats; /* stats for zone */ 374 } cku_private_t; 375 376 
static struct cm_xprt *connmgr_wrapconnect(struct cm_xprt *, 377 const struct timeval *, struct netbuf *, int, struct netbuf *, 378 struct rpc_err *, bool_t, bool_t); 379 380 static bool_t connmgr_connect(struct cm_xprt *, queue_t *, struct netbuf *, 381 int, calllist_t *, int *, bool_t reconnect, 382 const struct timeval *, bool_t); 383 384 static bool_t connmgr_setopt(queue_t *, int, int, calllist_t *); 385 static void connmgr_sndrel(struct cm_xprt *); 386 static void connmgr_snddis(struct cm_xprt *); 387 static void connmgr_close(struct cm_xprt *); 388 static void connmgr_release(struct cm_xprt *); 389 static struct cm_xprt *connmgr_wrapget(struct netbuf *, const struct timeval *, 390 cku_private_t *); 391 392 static struct cm_xprt *connmgr_get(struct netbuf *, const struct timeval *, 393 struct netbuf *, int, struct netbuf *, struct rpc_err *, dev_t, 394 bool_t, int); 395 396 static void connmgr_cancelconn(struct cm_xprt *); 397 static enum clnt_stat connmgr_cwait(struct cm_xprt *, const struct timeval *, 398 bool_t); 399 static void connmgr_dis_and_wait(struct cm_xprt *); 400 401 static void clnt_dispatch_send(queue_t *, mblk_t *, calllist_t *, uint_t, 402 uint_t); 403 404 static int clnt_delay(clock_t, bool_t); 405 406 static int waitforack(calllist_t *, t_scalar_t, const struct timeval *, bool_t); 407 408 /* 409 * Operations vector for TCP/IP based RPC 410 */ 411 static struct clnt_ops tcp_ops = { 412 clnt_cots_kcallit, /* do rpc call */ 413 clnt_cots_kabort, /* abort call */ 414 clnt_cots_kerror, /* return error status */ 415 clnt_cots_kfreeres, /* free results */ 416 clnt_cots_kdestroy, /* destroy rpc handle */ 417 clnt_cots_kcontrol, /* the ioctl() of rpc */ 418 clnt_cots_ksettimers, /* set retry timers */ 419 }; 420 421 static int rpc_kstat_instance = 0; /* keeps the current instance */ 422 /* number for the next kstat_create */ 423 424 static struct cm_xprt *cm_hd = NULL; 425 static kmutex_t connmgr_lock; /* for connection mngr's list of transports */ 
426 427 extern kmutex_t clnt_max_msg_lock; 428 429 static calllist_t *clnt_pending = NULL; 430 extern kmutex_t clnt_pending_lock; 431 432 static int clnt_cots_hash_size = DEFAULT_HASH_SIZE; 433 434 static call_table_t *cots_call_ht; 435 436 static const struct rpc_cots_client { 437 kstat_named_t rccalls; 438 kstat_named_t rcbadcalls; 439 kstat_named_t rcbadxids; 440 kstat_named_t rctimeouts; 441 kstat_named_t rcnewcreds; 442 kstat_named_t rcbadverfs; 443 kstat_named_t rctimers; 444 kstat_named_t rccantconn; 445 kstat_named_t rcnomem; 446 kstat_named_t rcintrs; 447 } cots_rcstat_tmpl = { 448 { "calls", KSTAT_DATA_UINT64 }, 449 { "badcalls", KSTAT_DATA_UINT64 }, 450 { "badxids", KSTAT_DATA_UINT64 }, 451 { "timeouts", KSTAT_DATA_UINT64 }, 452 { "newcreds", KSTAT_DATA_UINT64 }, 453 { "badverfs", KSTAT_DATA_UINT64 }, 454 { "timers", KSTAT_DATA_UINT64 }, 455 { "cantconn", KSTAT_DATA_UINT64 }, 456 { "nomem", KSTAT_DATA_UINT64 }, 457 { "interrupts", KSTAT_DATA_UINT64 } 458 }; 459 460 #define COTSRCSTAT_INCR(p, x) \ 461 atomic_add_64(&(p)->x.value.ui64, 1) 462 463 #define CLNT_MAX_CONNS 1 /* concurrent connections between clnt/srvr */ 464 static int clnt_max_conns = CLNT_MAX_CONNS; 465 466 #define CLNT_MIN_TIMEOUT 10 /* seconds to wait after we get a */ 467 /* connection reset */ 468 #define CLNT_MIN_CONNTIMEOUT 5 /* seconds to wait for a connection */ 469 470 471 static int clnt_cots_min_tout = CLNT_MIN_TIMEOUT; 472 static int clnt_cots_min_conntout = CLNT_MIN_CONNTIMEOUT; 473 474 /* 475 * Limit the number of times we will attempt to receive a reply without 476 * re-sending a response. 
477 */ 478 #define CLNT_MAXRECV_WITHOUT_RETRY 3 479 static uint_t clnt_cots_maxrecv = CLNT_MAXRECV_WITHOUT_RETRY; 480 481 uint_t *clnt_max_msg_sizep; 482 void (*clnt_stop_idle)(queue_t *wq); 483 484 #define ptoh(p) (&((p)->cku_client)) 485 #define htop(h) ((cku_private_t *)((h)->cl_private)) 486 487 /* 488 * Times to retry 489 */ 490 #define REFRESHES 2 /* authentication refreshes */ 491 492 /* 493 * The following is used to determine the global default behavior for 494 * COTS when binding to a local port. 495 * 496 * If the value is set to 1 the default will be to select a reserved 497 * (aka privileged) port, if the value is zero the default will be to 498 * use non-reserved ports. Users of kRPC may override this by using 499 * CLNT_CONTROL() and CLSET_BINDRESVPORT. 500 */ 501 static int clnt_cots_do_bindresvport = 1; 502 503 static zone_key_t zone_cots_key; 504 505 /* 506 * We need to do this after all kernel threads in the zone have exited. 507 */ 508 /* ARGSUSED */ 509 static void 510 clnt_zone_destroy(zoneid_t zoneid, void *unused) 511 { 512 struct cm_xprt **cmp; 513 struct cm_xprt *cm_entry; 514 struct cm_xprt *freelist = NULL; 515 516 mutex_enter(&connmgr_lock); 517 cmp = &cm_hd; 518 while ((cm_entry = *cmp) != NULL) { 519 if (cm_entry->x_zoneid == zoneid) { 520 *cmp = cm_entry->x_next; 521 cm_entry->x_next = freelist; 522 freelist = cm_entry; 523 } else { 524 cmp = &cm_entry->x_next; 525 } 526 } 527 mutex_exit(&connmgr_lock); 528 while ((cm_entry = freelist) != NULL) { 529 freelist = cm_entry->x_next; 530 connmgr_close(cm_entry); 531 } 532 } 533 534 int 535 clnt_cots_kcreate(dev_t dev, struct netbuf *addr, int family, rpcprog_t prog, 536 rpcvers_t vers, uint_t max_msgsize, cred_t *cred, CLIENT **ncl) 537 { 538 CLIENT *h; 539 cku_private_t *p; 540 struct rpc_msg call_msg; 541 struct rpcstat *rpcstat; 542 543 RPCLOG(8, "clnt_cots_kcreate: prog %u\n", prog); 544 545 rpcstat = zone_getspecific(rpcstat_zone_key, rpc_zone()); 546 ASSERT(rpcstat != NULL); 547 548 
/* Allocate and intialize the client handle. */ 549 p = kmem_zalloc(sizeof (*p), KM_SLEEP); 550 551 h = ptoh(p); 552 553 h->cl_private = (caddr_t)p; 554 h->cl_auth = authkern_create(); 555 h->cl_ops = &tcp_ops; 556 557 cv_init(&p->cku_call.call_cv, NULL, CV_DEFAULT, NULL); 558 mutex_init(&p->cku_call.call_lock, NULL, MUTEX_DEFAULT, NULL); 559 560 /* 561 * If the current sanity check size in rpcmod is smaller 562 * than the size needed, then increase the sanity check. 563 */ 564 if (max_msgsize != 0 && clnt_max_msg_sizep != NULL && 565 max_msgsize > *clnt_max_msg_sizep) { 566 mutex_enter(&clnt_max_msg_lock); 567 if (max_msgsize > *clnt_max_msg_sizep) 568 *clnt_max_msg_sizep = max_msgsize; 569 mutex_exit(&clnt_max_msg_lock); 570 } 571 572 p->cku_outbuflen = COTS_DEFAULT_ALLOCSIZE; 573 574 /* Preserialize the call message header */ 575 576 call_msg.rm_xid = 0; 577 call_msg.rm_direction = CALL; 578 call_msg.rm_call.cb_rpcvers = RPC_MSG_VERSION; 579 call_msg.rm_call.cb_prog = prog; 580 call_msg.rm_call.cb_vers = vers; 581 582 xdrmem_create(&p->cku_outxdr, p->cku_rpchdr, WIRE_HDR_SIZE, XDR_ENCODE); 583 584 if (!xdr_callhdr(&p->cku_outxdr, &call_msg)) { 585 RPCLOG0(1, "clnt_cots_kcreate - Fatal header serialization " 586 "error\n"); 587 auth_destroy(h->cl_auth); 588 kmem_free(p, sizeof (cku_private_t)); 589 RPCLOG0(1, "clnt_cots_kcreate: create failed error EINVAL\n"); 590 return (EINVAL); /* XXX */ 591 } 592 593 /* 594 * The zalloc initialized the fields below. 
595 * p->cku_xid = 0; 596 * p->cku_flags = 0; 597 * p->cku_srcaddr.len = 0; 598 * p->cku_srcaddr.maxlen = 0; 599 */ 600 601 p->cku_cred = cred; 602 p->cku_device = dev; 603 p->cku_addrfmly = family; 604 p->cku_addr.buf = kmem_zalloc(addr->maxlen, KM_SLEEP); 605 p->cku_addr.maxlen = addr->maxlen; 606 p->cku_addr.len = addr->len; 607 bcopy(addr->buf, p->cku_addr.buf, addr->len); 608 p->cku_stats = rpcstat->rpc_cots_client; 609 p->cku_useresvport = -1; /* value is has not been set */ 610 611 *ncl = h; 612 return (0); 613 } 614 615 /*ARGSUSED*/ 616 static void 617 clnt_cots_kabort(CLIENT *h) 618 { 619 } 620 621 /* 622 * Return error info on this handle. 623 */ 624 static void 625 clnt_cots_kerror(CLIENT *h, struct rpc_err *err) 626 { 627 /* LINTED pointer alignment */ 628 cku_private_t *p = htop(h); 629 630 *err = p->cku_err; 631 } 632 633 static bool_t 634 clnt_cots_kfreeres(CLIENT *h, xdrproc_t xdr_res, caddr_t res_ptr) 635 { 636 /* LINTED pointer alignment */ 637 cku_private_t *p = htop(h); 638 XDR *xdrs; 639 640 xdrs = &(p->cku_outxdr); 641 xdrs->x_op = XDR_FREE; 642 return ((*xdr_res)(xdrs, res_ptr)); 643 } 644 645 static bool_t 646 clnt_cots_kcontrol(CLIENT *h, int cmd, char *arg) 647 { 648 cku_private_t *p = htop(h); 649 650 switch (cmd) { 651 case CLSET_PROGRESS: 652 p->cku_progress = TRUE; 653 return (TRUE); 654 655 case CLSET_XID: 656 if (arg == NULL) 657 return (FALSE); 658 659 p->cku_xid = *((uint32_t *)arg); 660 return (TRUE); 661 662 case CLGET_XID: 663 if (arg == NULL) 664 return (FALSE); 665 666 *((uint32_t *)arg) = p->cku_xid; 667 return (TRUE); 668 669 case CLSET_NODELAYONERR: 670 if (arg == NULL) 671 return (FALSE); 672 673 if (*((bool_t *)arg) == TRUE) { 674 p->cku_nodelayonerr = TRUE; 675 return (TRUE); 676 } 677 if (*((bool_t *)arg) == FALSE) { 678 p->cku_nodelayonerr = FALSE; 679 return (TRUE); 680 } 681 return (FALSE); 682 683 case CLGET_NODELAYONERR: 684 if (arg == NULL) 685 return (FALSE); 686 687 *((bool_t *)arg) = p->cku_nodelayonerr; 688 
return (TRUE); 689 690 case CLSET_BINDRESVPORT: 691 if (arg == NULL) 692 return (FALSE); 693 694 if (*(int *)arg != 1 && *(int *)arg != 0) 695 return (FALSE); 696 697 p->cku_useresvport = *(int *)arg; 698 699 return (TRUE); 700 701 case CLGET_BINDRESVPORT: 702 if (arg == NULL) 703 return (FALSE); 704 705 *(int *)arg = p->cku_useresvport; 706 707 return (TRUE); 708 709 default: 710 return (FALSE); 711 } 712 } 713 714 /* 715 * Destroy rpc handle. Frees the space used for output buffer, 716 * private data, and handle structure. 717 */ 718 static void 719 clnt_cots_kdestroy(CLIENT *h) 720 { 721 /* LINTED pointer alignment */ 722 cku_private_t *p = htop(h); 723 calllist_t *call = &p->cku_call; 724 725 RPCLOG(8, "clnt_cots_kdestroy h: %p\n", (void *)h); 726 RPCLOG(8, "clnt_cots_kdestroy h: xid=0x%x\n", p->cku_xid); 727 728 if (p->cku_flags & CKU_ONQUEUE) { 729 RPCLOG(64, "clnt_cots_kdestroy h: removing call for xid 0x%x " 730 "from dispatch list\n", p->cku_xid); 731 call_table_remove(call); 732 } 733 734 if (call->call_reply) 735 freemsg(call->call_reply); 736 cv_destroy(&call->call_cv); 737 mutex_destroy(&call->call_lock); 738 739 kmem_free(p->cku_srcaddr.buf, p->cku_srcaddr.maxlen); 740 kmem_free(p->cku_addr.buf, p->cku_addr.maxlen); 741 kmem_free(p, sizeof (*p)); 742 } 743 744 static int clnt_cots_pulls; 745 #define RM_HDR_SIZE 4 /* record mark header size */ 746 747 /* 748 * Call remote procedure. 
749 */ 750 static enum clnt_stat 751 clnt_cots_kcallit(CLIENT *h, rpcproc_t procnum, xdrproc_t xdr_args, 752 caddr_t argsp, xdrproc_t xdr_results, caddr_t resultsp, struct timeval wait) 753 { 754 /* LINTED pointer alignment */ 755 cku_private_t *p = htop(h); 756 calllist_t *call = &p->cku_call; 757 XDR *xdrs; 758 struct rpc_msg reply_msg; 759 mblk_t *mp; 760 #ifdef RPCDEBUG 761 clock_t time_sent; 762 #endif 763 struct netbuf *retryaddr; 764 struct cm_xprt *cm_entry = NULL; 765 queue_t *wq; 766 int len; 767 int mpsize; 768 int refreshes = REFRESHES; 769 int interrupted; 770 int tidu_size; 771 enum clnt_stat status; 772 struct timeval cwait; 773 bool_t delay_first = FALSE; 774 clock_t ticks; 775 776 RPCLOG(2, "clnt_cots_kcallit, procnum %u\n", procnum); 777 COTSRCSTAT_INCR(p->cku_stats, rccalls); 778 779 RPCLOG(2, "clnt_cots_kcallit: wait.tv_sec: %ld\n", wait.tv_sec); 780 RPCLOG(2, "clnt_cots_kcallit: wait.tv_usec: %ld\n", wait.tv_usec); 781 782 /* 783 * Bug ID 1240234: 784 * Look out for zero length timeouts. We don't want to 785 * wait zero seconds for a connection to be established. 786 */ 787 if (wait.tv_sec < clnt_cots_min_conntout) { 788 cwait.tv_sec = clnt_cots_min_conntout; 789 cwait.tv_usec = 0; 790 RPCLOG(8, "clnt_cots_kcallit: wait.tv_sec (%ld) too low,", 791 wait.tv_sec); 792 RPCLOG(8, " setting to: %d\n", clnt_cots_min_conntout); 793 } else { 794 cwait = wait; 795 } 796 797 call_again: 798 if (cm_entry) { 799 connmgr_release(cm_entry); 800 cm_entry = NULL; 801 } 802 803 mp = NULL; 804 805 /* 806 * If the call is not a retry, allocate a new xid and cache it 807 * for future retries. 808 * Bug ID 1246045: 809 * Treat call as a retry for purposes of binding the source 810 * port only if we actually attempted to send anything on 811 * the previous call. 
812 */ 813 if (p->cku_xid == 0) { 814 p->cku_xid = alloc_xid(); 815 /* 816 * We need to ASSERT here that our xid != 0 because this 817 * determines whether or not our call record gets placed on 818 * the hash table or the linked list. By design, we mandate 819 * that RPC calls over cots must have xid's != 0, so we can 820 * ensure proper management of the hash table. 821 */ 822 ASSERT(p->cku_xid != 0); 823 824 retryaddr = NULL; 825 p->cku_flags &= ~CKU_SENT; 826 827 if (p->cku_flags & CKU_ONQUEUE) { 828 RPCLOG(8, "clnt_cots_kcallit: new call, dequeuing old" 829 " one (%p)\n", (void *)call); 830 call_table_remove(call); 831 p->cku_flags &= ~CKU_ONQUEUE; 832 RPCLOG(64, "clnt_cots_kcallit: removing call from " 833 "dispatch list because xid was zero (now 0x%x)\n", 834 p->cku_xid); 835 } 836 837 if (call->call_reply != NULL) { 838 freemsg(call->call_reply); 839 call->call_reply = NULL; 840 } 841 } else if (p->cku_srcaddr.buf == NULL || p->cku_srcaddr.len == 0) { 842 retryaddr = NULL; 843 844 } else if (p->cku_flags & CKU_SENT) { 845 retryaddr = &p->cku_srcaddr; 846 847 } else { 848 /* 849 * Bug ID 1246045: Nothing was sent, so set retryaddr to 850 * NULL and let connmgr_get() bind to any source port it 851 * can get. 852 */ 853 retryaddr = NULL; 854 } 855 856 RPCLOG(64, "clnt_cots_kcallit: xid = 0x%x", p->cku_xid); 857 RPCLOG(64, " flags = 0x%x\n", p->cku_flags); 858 859 p->cku_err.re_status = RPC_TIMEDOUT; 860 p->cku_err.re_errno = p->cku_err.re_terrno = 0; 861 862 cm_entry = connmgr_wrapget(retryaddr, &cwait, p); 863 864 if (cm_entry == NULL) { 865 RPCLOG(1, "clnt_cots_kcallit: can't connect status %s\n", 866 clnt_sperrno(p->cku_err.re_status)); 867 868 /* 869 * The reasons why we fail to create a connection are 870 * varied. In most cases we don't want the caller to 871 * immediately retry. This could have one or more 872 * bad effects. 
This includes flooding the net with 873 * connect requests to ports with no listener; a hard 874 * kernel loop due to all the "reserved" TCP ports being 875 * in use. 876 */ 877 delay_first = TRUE; 878 879 /* 880 * Even if we end up returning EINTR, we still count a 881 * a "can't connect", because the connection manager 882 * might have been committed to waiting for or timing out on 883 * a connection. 884 */ 885 COTSRCSTAT_INCR(p->cku_stats, rccantconn); 886 switch (p->cku_err.re_status) { 887 case RPC_INTR: 888 p->cku_err.re_errno = EINTR; 889 890 /* 891 * No need to delay because a UNIX signal(2) 892 * interrupted us. The caller likely won't 893 * retry the CLNT_CALL() and even if it does, 894 * we assume the caller knows what it is doing. 895 */ 896 delay_first = FALSE; 897 break; 898 899 case RPC_TIMEDOUT: 900 p->cku_err.re_errno = ETIMEDOUT; 901 902 /* 903 * No need to delay because timed out already 904 * on the connection request and assume that the 905 * transport time out is longer than our minimum 906 * timeout, or least not too much smaller. 907 */ 908 delay_first = FALSE; 909 break; 910 911 case RPC_SYSTEMERROR: 912 case RPC_TLIERROR: 913 /* 914 * We want to delay here because a transient 915 * system error has a better chance of going away 916 * if we delay a bit. If it's not transient, then 917 * we don't want end up in a hard kernel loop 918 * due to retries. 919 */ 920 ASSERT(p->cku_err.re_errno != 0); 921 break; 922 923 924 case RPC_CANTCONNECT: 925 /* 926 * RPC_CANTCONNECT is set on T_ERROR_ACK which 927 * implies some error down in the TCP layer or 928 * below. If cku_nodelayonerror is set then we 929 * assume the caller knows not to try too hard. 
930 */ 931 RPCLOG0(8, "clnt_cots_kcallit: connection failed,"); 932 RPCLOG0(8, " re_status=RPC_CANTCONNECT,"); 933 RPCLOG(8, " re_errno=%d,", p->cku_err.re_errno); 934 RPCLOG(8, " cku_nodelayonerr=%d", p->cku_nodelayonerr); 935 if (p->cku_nodelayonerr == TRUE) 936 delay_first = FALSE; 937 938 p->cku_err.re_errno = EIO; 939 940 break; 941 942 case RPC_XPRTFAILED: 943 /* 944 * We want to delay here because we likely 945 * got a refused connection. 946 */ 947 if (p->cku_err.re_errno == 0) 948 p->cku_err.re_errno = EIO; 949 950 RPCLOG(1, "clnt_cots_kcallit: transport failed: %d\n", 951 p->cku_err.re_errno); 952 953 break; 954 955 default: 956 /* 957 * We delay here because it is better to err 958 * on the side of caution. If we got here then 959 * status could have been RPC_SUCCESS, but we 960 * know that we did not get a connection, so 961 * force the rpc status to RPC_CANTCONNECT. 962 */ 963 p->cku_err.re_status = RPC_CANTCONNECT; 964 p->cku_err.re_errno = EIO; 965 break; 966 } 967 if (delay_first == TRUE) 968 ticks = clnt_cots_min_tout * drv_usectohz(1000000); 969 goto cots_done; 970 } 971 972 /* 973 * If we've never sent any request on this connection (send count 974 * is zero, or the connection has been reset), cache the 975 * the connection's create time and send a request (possibly a retry) 976 */ 977 if ((p->cku_flags & CKU_SENT) == 0 || 978 p->cku_ctime != cm_entry->x_ctime) { 979 p->cku_ctime = cm_entry->x_ctime; 980 981 } else if ((p->cku_flags & CKU_SENT) && (p->cku_flags & CKU_ONQUEUE) && 982 (call->call_reply != NULL || 983 p->cku_recv_attempts < clnt_cots_maxrecv)) { 984 985 /* 986 * If we've sent a request and our call is on the dispatch 987 * queue and we haven't made too many receive attempts, then 988 * don't re-send, just receive. 989 */ 990 p->cku_recv_attempts++; 991 goto read_again; 992 } 993 994 /* 995 * Now we create the RPC request in a STREAMS message. 
We have to do 996 * this after the call to connmgr_get so that we have the correct 997 * TIDU size for the transport. 998 */ 999 tidu_size = cm_entry->x_tidu_size; 1000 len = MSG_OFFSET + MAX(tidu_size, RM_HDR_SIZE + WIRE_HDR_SIZE); 1001 1002 while ((mp = allocb(len, BPRI_MED)) == NULL) { 1003 if (strwaitbuf(len, BPRI_MED)) { 1004 p->cku_err.re_status = RPC_SYSTEMERROR; 1005 p->cku_err.re_errno = ENOSR; 1006 COTSRCSTAT_INCR(p->cku_stats, rcnomem); 1007 goto cots_done; 1008 } 1009 } 1010 xdrs = &p->cku_outxdr; 1011 xdrmblk_init(xdrs, mp, XDR_ENCODE, tidu_size); 1012 mpsize = MBLKSIZE(mp); 1013 ASSERT(mpsize >= len); 1014 ASSERT(mp->b_rptr == mp->b_datap->db_base); 1015 1016 /* 1017 * If the size of mblk is not appreciably larger than what we 1018 * asked, then resize the mblk to exactly len bytes. The reason for 1019 * this: suppose len is 1600 bytes, the tidu is 1460 bytes 1020 * (from TCP over ethernet), and the arguments to the RPC require 1021 * 2800 bytes. Ideally we want the protocol to render two 1022 * ~1400 byte segments over the wire. However if allocb() gives us a 2k 1023 * mblk, and we allocate a second mblk for the remainder, the protocol 1024 * module may generate 3 segments over the wire: 1025 * 1460 bytes for the first, 448 (2048 - 1600) for the second, and 1026 * 892 for the third. If we "waste" 448 bytes in the first mblk, 1027 * the XDR encoding will generate two ~1400 byte mblks, and the 1028 * protocol module is more likely to produce properly sized segments. 1029 */ 1030 if ((mpsize >> 1) <= len) 1031 mp->b_rptr += (mpsize - len); 1032 1033 /* 1034 * Adjust b_rptr to reserve space for the non-data protocol headers 1035 * any downstream modules might like to add, and for the 1036 * record marking header. 1037 */ 1038 mp->b_rptr += (MSG_OFFSET + RM_HDR_SIZE); 1039 1040 if (h->cl_auth->ah_cred.oa_flavor != RPCSEC_GSS) { 1041 /* Copy in the preserialized RPC header information. 
*/ 1042 bcopy(p->cku_rpchdr, mp->b_rptr, WIRE_HDR_SIZE); 1043 1044 /* Use XDR_SETPOS() to set the b_wptr to past the RPC header. */ 1045 XDR_SETPOS(xdrs, (uint_t)(mp->b_rptr - mp->b_datap->db_base + 1046 WIRE_HDR_SIZE)); 1047 1048 ASSERT((mp->b_wptr - mp->b_rptr) == WIRE_HDR_SIZE); 1049 1050 /* Serialize the procedure number and the arguments. */ 1051 if ((!XDR_PUTINT32(xdrs, (int32_t *)&procnum)) || 1052 (!AUTH_MARSHALL(h->cl_auth, xdrs, p->cku_cred)) || 1053 (!(*xdr_args)(xdrs, argsp))) { 1054 p->cku_err.re_status = RPC_CANTENCODEARGS; 1055 p->cku_err.re_errno = EIO; 1056 goto cots_done; 1057 } 1058 1059 (*(uint32_t *)(mp->b_rptr)) = p->cku_xid; 1060 } else { 1061 uint32_t *uproc = (uint32_t *)&p->cku_rpchdr[WIRE_HDR_SIZE]; 1062 IXDR_PUT_U_INT32(uproc, procnum); 1063 1064 (*(uint32_t *)(&p->cku_rpchdr[0])) = p->cku_xid; 1065 1066 /* Use XDR_SETPOS() to set the b_wptr. */ 1067 XDR_SETPOS(xdrs, (uint_t)(mp->b_rptr - mp->b_datap->db_base)); 1068 1069 /* Serialize the procedure number and the arguments. */ 1070 if (!AUTH_WRAP(h->cl_auth, p->cku_rpchdr, WIRE_HDR_SIZE+4, 1071 xdrs, xdr_args, argsp)) { 1072 p->cku_err.re_status = RPC_CANTENCODEARGS; 1073 p->cku_err.re_errno = EIO; 1074 goto cots_done; 1075 } 1076 } 1077 1078 RPCLOG(2, "clnt_cots_kcallit: connected, sending call, tidu_size %d\n", 1079 tidu_size); 1080 1081 wq = cm_entry->x_wq; 1082 clnt_dispatch_send(wq, mp, call, p->cku_xid, 1083 (p->cku_flags & CKU_ONQUEUE)); 1084 1085 RPCLOG(64, "clnt_cots_kcallit: sent call for xid 0x%x\n", 1086 (uint_t)p->cku_xid); 1087 p->cku_flags = (CKU_ONQUEUE|CKU_SENT); 1088 p->cku_recv_attempts = 1; 1089 1090 #ifdef RPCDEBUG 1091 time_sent = lbolt; 1092 #endif 1093 1094 /* 1095 * Wait for a reply or a timeout. If there is no error or timeout, 1096 * (both indicated by call_status), call->call_reply will contain 1097 * the RPC reply message. 
1098 */ 1099 read_again: 1100 mutex_enter(&call->call_lock); 1101 interrupted = 0; 1102 if (call->call_status == RPC_TIMEDOUT) { 1103 /* 1104 * Indicate that the lwp is not to be stopped while waiting 1105 * for this network traffic. This is to avoid deadlock while 1106 * debugging a process via /proc and also to avoid recursive 1107 * mutex_enter()s due to NFS page faults while stopping 1108 * (NFS holds locks when it calls here). 1109 */ 1110 clock_