Home | History | Annotate | Download | only in ip
      1 /*
      2  * CDDL HEADER START
      3  *
      4  * The contents of this file are subject to the terms of the
      5  * Common Development and Distribution License (the "License").
      6  * You may not use this file except in compliance with the License.
      7  *
      8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
      9  * or http://www.opensolaris.org/os/licensing.
     10  * See the License for the specific language governing permissions
     11  * and limitations under the License.
     12  *
     13  * When distributing Covered Code, include this CDDL HEADER in each
     14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
     15  * If applicable, add the following below this CDDL HEADER, with the
     16  * fields enclosed by brackets "[]" replaced with your own identifying
     17  * information: Portions Copyright [yyyy] [name of copyright owner]
     18  *
     19  * CDDL HEADER END
     20  */
     21 /*
     22  * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
     23  * Use is subject to license terms.
     24  */
     25 
     26 #include <sys/types.h>
     27 #include <sys/stream.h>
     28 #include <sys/stropts.h>
     29 #include <sys/errno.h>
     30 #include <sys/strlog.h>
     31 #include <sys/tihdr.h>
     32 #include <sys/socket.h>
     33 #include <sys/ddi.h>
     34 #include <sys/sunddi.h>
     35 #include <sys/kmem.h>
     36 #include <sys/zone.h>
     37 #include <sys/sysmacros.h>
     38 #include <sys/cmn_err.h>
     39 #include <sys/vtrace.h>
     40 #include <sys/debug.h>
     41 #include <sys/atomic.h>
     42 #include <sys/strsun.h>
     43 #include <sys/random.h>
     44 #include <netinet/in.h>
     45 #include <net/if.h>
     46 #include <netinet/ip6.h>
     47 #include <net/pfkeyv2.h>
     48 
     49 #include <inet/common.h>
     50 #include <inet/mi.h>
     51 #include <inet/nd.h>
     52 #include <inet/ip.h>
     53 #include <inet/ip_impl.h>
     54 #include <inet/ip6.h>
     55 #include <inet/sadb.h>
     56 #include <inet/ipsec_info.h>
     57 #include <inet/ipsec_impl.h>
     58 #include <inet/ipsecesp.h>
     59 #include <inet/ipdrop.h>
     60 #include <inet/tcp.h>
     61 #include <sys/kstat.h>
     62 #include <sys/policy.h>
     63 #include <sys/strsun.h>
     64 #include <inet/udp_impl.h>
     65 #include <sys/taskq.h>
     66 #include <sys/note.h>
     67 
     68 #include <sys/iphada.h>
     69 
/*
 * Table of ND variables supported by ipsecesp.  This array is a template:
 * ipsecesp_stack_init() copies it into each stack's ipsecesp_params and
 * registers every entry with that stack's ipsecesp_g_nd by way of
 * ipsecesp_param_register().
 * All of these are alterable, within the min/max values given, at run time.
 *
 * The accessor macros that follow the table index ipsecesp_params by
 * position, so the order of the entries below must stay in step with the
 * indices used in those macros.
 */
static	ipsecespparam_t	lcl_param_arr[] = {
	/* min	max			value	name */
	{ 0,	3,			0,	"ipsecesp_debug"},
	{ 125,	32000, SADB_AGE_INTERVAL_DEFAULT, "ipsecesp_age_interval"},
	{ 1,	10,			1,	"ipsecesp_reap_delay"},
	{ 1,	SADB_MAX_REPLAY,	64,	"ipsecesp_replay_size"},
	{ 1,	300,			15,	"ipsecesp_acquire_timeout"},
	{ 1,	1800,			90,	"ipsecesp_larval_timeout"},
	/* Default lifetime values for ACQUIRE messages. */
	{ 0,	0xffffffffU,	0,	"ipsecesp_default_soft_bytes"},
	{ 0,	0xffffffffU,	0,	"ipsecesp_default_hard_bytes"},
	{ 0,	0xffffffffU,	24000,	"ipsecesp_default_soft_addtime"},
	{ 0,	0xffffffffU,	28800,	"ipsecesp_default_hard_addtime"},
	{ 0,	0xffffffffU,	0,	"ipsecesp_default_soft_usetime"},
	{ 0,	0xffffffffU,	0,	"ipsecesp_default_hard_usetime"},
	{ 0,	1,		0,	"ipsecesp_log_unknown_spi"},
	{ 0,	2,		1,	"ipsecesp_padding_check"},
	{ 0,	600,		20,	"ipsecesp_nat_keepalive_interval"},
};
/*
 * Positional accessors.  Each one is meant to be applied to a stack
 * pointer, e.g. espstack->ipsecesp_debug, and expands to a field of that
 * stack's private copy of the table above.
 */
#define	ipsecesp_debug	ipsecesp_params[0].ipsecesp_param_value
#define	ipsecesp_age_interval ipsecesp_params[1].ipsecesp_param_value
/* Upper bound (the "max" column) of ipsecesp_age_interval. */
#define	ipsecesp_age_int_max	ipsecesp_params[1].ipsecesp_param_max
#define	ipsecesp_reap_delay	ipsecesp_params[2].ipsecesp_param_value
#define	ipsecesp_replay_size	ipsecesp_params[3].ipsecesp_param_value
#define	ipsecesp_acquire_timeout	\
	ipsecesp_params[4].ipsecesp_param_value
#define	ipsecesp_larval_timeout	\
	ipsecesp_params[5].ipsecesp_param_value
#define	ipsecesp_default_soft_bytes	\
	ipsecesp_params[6].ipsecesp_param_value
#define	ipsecesp_default_hard_bytes	\
	ipsecesp_params[7].ipsecesp_param_value
#define	ipsecesp_default_soft_addtime	\
	ipsecesp_params[8].ipsecesp_param_value
#define	ipsecesp_default_hard_addtime	\
	ipsecesp_params[9].ipsecesp_param_value
#define	ipsecesp_default_soft_usetime	\
	ipsecesp_params[10].ipsecesp_param_value
#define	ipsecesp_default_hard_usetime	\
	ipsecesp_params[11].ipsecesp_param_value
#define	ipsecesp_log_unknown_spi	\
	ipsecesp_params[12].ipsecesp_param_value
#define	ipsecesp_padding_check	\
	ipsecesp_params[13].ipsecesp_param_value
/* For ipsecesp_nat_keepalive_interval, see ipsecesp.h. */
    120 
    121 #define	esp0dbg(a)	printf a
    122 /* NOTE:  != 0 instead of > 0 so lint doesn't complain. */
    123 #define	esp1dbg(espstack, a)	if (espstack->ipsecesp_debug != 0) printf a
    124 #define	esp2dbg(espstack, a)	if (espstack->ipsecesp_debug > 1) printf a
    125 #define	esp3dbg(espstack, a)	if (espstack->ipsecesp_debug > 2) printf a
    126 
/* STREAMS entry points and per-netstack constructor/destructor. */
static int ipsecesp_open(queue_t *, dev_t *, int, int, cred_t *);
static int ipsecesp_close(queue_t *);
static void ipsecesp_rput(queue_t *, mblk_t *);
static void ipsecesp_wput(queue_t *, mblk_t *);
static void	*ipsecesp_stack_init(netstackid_t stackid, netstack_t *ns);
static void	ipsecesp_stack_fini(netstackid_t stackid, void *arg);
/* Installed as esp_sadb.s_acqfn by ipsecesp_stack_init(). */
static void esp_send_acquire(ipsacq_t *, mblk_t *, netstack_t *);

static void esp_prepare_udp(netstack_t *, mblk_t *, ipha_t *);
static ipsec_status_t esp_outbound_accelerated(mblk_t *, uint_t);
static ipsec_status_t esp_inbound_accelerated(mblk_t *, mblk_t *,
    boolean_t, ipsa_t *);

static boolean_t esp_register_out(uint32_t, uint32_t, uint_t,
    ipsecesp_stack_t *);
static boolean_t esp_strip_header(mblk_t *, boolean_t, uint32_t,
    kstat_named_t **, ipsecesp_stack_t *);
static ipsec_status_t esp_submit_req_inbound(mblk_t *, ipsa_t *, uint_t);
static ipsec_status_t esp_submit_req_outbound(mblk_t *, ipsa_t *, uchar_t *,
    uint_t);
/*
 * Settable in /etc/system.  Passed to sadbp_init() as the hash size when
 * each stack's ESP SADB is created.
 */
uint32_t esp_hash_size = IPSEC_DEFAULT_HASH_SIZE;
    149 
/*
 * STREAMS module identification, shared by the read and write sides.
 * info.mi_idnum is also used in this file as the module id passed to
 * sadb_retimeout() and ipsec_rl_strlog().
 * NOTE(review): the positional initializers follow module_info(9S)
 * (id number, name, min/max packet size, high/low water marks) -- confirm
 * against the header if the values are changed.
 */
static struct module_info info = {
	5137, "ipsecesp", 0, INFPSZ, 65536, 1024
};

/* Read side: ipsecesp_rput handles messages; same open/close as below. */
static struct qinit rinit = {
	(pfi_t)ipsecesp_rput, NULL, ipsecesp_open, ipsecesp_close, NULL, &info,
	NULL
};

/* Write side: ipsecesp_wput handles messages; same open/close as above. */
static struct qinit winit = {
	(pfi_t)ipsecesp_wput, NULL, ipsecesp_open, ipsecesp_close, NULL, &info,
	NULL
};

/* The module's exported streamtab, tying the two qinit tables together. */
struct streamtab ipsecespinfo = {
	&rinit, &winit, NULL, NULL
};

/*
 * Module-wide task queue; created in ipsecesp_ddi_init() and destroyed in
 * ipsecesp_ddi_destroy().
 */
static taskq_t *esp_taskq;
    169 
    170 /*
    171  * OTOH, this one is set at open/close, and I'm D_MTQPAIR for now.
    172  *
    173  * Question:	Do I need this, given that all instance's esps->esps_wq point
    174  *		to IP?
    175  *
    176  * Answer:	Yes, because I need to know which queue is BOUND to
    177  *		IPPROTO_ESP
    178  */
    179 
/*
 * Stats.  This may eventually become a full-blown SNMP MIB once that spec
 * stabilizes.
 *
 * One instance lives in each stack's named kstat (see esp_kstat_init()).
 * The structure must consist solely of kstat_named_t members, because
 * esp_kstat_init() derives the number of named entries from
 * sizeof (esp_kstats_t) / sizeof (kstat_named_t).  Every member also needs
 * a matching initialization line in esp_kstat_init().
 */

typedef struct esp_kstats_s {
	/* Refreshed on read by esp_kstat_update(). */
	kstat_named_t esp_stat_num_aalgs;
	kstat_named_t esp_stat_good_auth;
	kstat_named_t esp_stat_bad_auth;
	kstat_named_t esp_stat_bad_padding;
	kstat_named_t esp_stat_replay_failures;
	kstat_named_t esp_stat_replay_early_failures;
	kstat_named_t esp_stat_keysock_in;
	kstat_named_t esp_stat_out_requests;
	kstat_named_t esp_stat_acquire_requests;
	kstat_named_t esp_stat_bytes_expired;
	kstat_named_t esp_stat_out_discards;
	kstat_named_t esp_stat_in_accelerated;
	kstat_named_t esp_stat_out_accelerated;
	kstat_named_t esp_stat_noaccel;
	kstat_named_t esp_stat_crypto_sync;
	kstat_named_t esp_stat_crypto_async;
	kstat_named_t esp_stat_crypto_failures;
	/* Refreshed on read by esp_kstat_update(). */
	kstat_named_t esp_stat_num_ealgs;
	kstat_named_t esp_stat_bad_decrypt;
	kstat_named_t esp_stat_sa_port_renumbers;
} esp_kstats_t;
    207 
    208 /*
    209  * espstack->esp_kstats is equal to espstack->esp_ksp->ks_data if
    210  * kstat_create_netstack for espstack->esp_ksp succeeds, but when it
    211  * fails, it will be NULL. Note this is done for all stack instances,
    212  * so it *could* fail. hence a non-NULL checking is done for
    213  * ESP_BUMP_STAT and ESP_DEBUMP_STAT
    214  */
    215 #define	ESP_BUMP_STAT(espstack, x)					\
    216 do {									\
    217 	if (espstack->esp_kstats != NULL)				\
    218 		(espstack->esp_kstats->esp_stat_ ## x).value.ui64++;	\
    219 _NOTE(CONSTCOND)							\
    220 } while (0)
    221 
    222 #define	ESP_DEBUMP_STAT(espstack, x)					\
    223 do {									\
    224 	if (espstack->esp_kstats != NULL)				\
    225 		(espstack->esp_kstats->esp_stat_ ## x).value.ui64--;	\
    226 _NOTE(CONSTCOND)							\
    227 } while (0)
    228 
    229 static int	esp_kstat_update(kstat_t *, int);
    230 
    231 static boolean_t
    232 esp_kstat_init(ipsecesp_stack_t *espstack, netstackid_t stackid)
    233 {
    234 	espstack->esp_ksp = kstat_create_netstack("ipsecesp", 0, "esp_stat",
    235 	    "net", KSTAT_TYPE_NAMED,
    236 	    sizeof (esp_kstats_t) / sizeof (kstat_named_t),
    237 	    KSTAT_FLAG_PERSISTENT, stackid);
    238 
    239 	if (espstack->esp_ksp == NULL || espstack->esp_ksp->ks_data == NULL)
    240 		return (B_FALSE);
    241 
    242 	espstack->esp_kstats = espstack->esp_ksp->ks_data;
    243 
    244 	espstack->esp_ksp->ks_update = esp_kstat_update;
    245 	espstack->esp_ksp->ks_private = (void *)(uintptr_t)stackid;
    246 
    247 #define	K64 KSTAT_DATA_UINT64
    248 #define	KI(x) kstat_named_init(&(espstack->esp_kstats->esp_stat_##x), #x, K64)
    249 
    250 	KI(num_aalgs);
    251 	KI(num_ealgs);
    252 	KI(good_auth);
    253 	KI(bad_auth);
    254 	KI(bad_padding);
    255 	KI(replay_failures);
    256 	KI(replay_early_failures);
    257 	KI(keysock_in);
    258 	KI(out_requests);
    259 	KI(acquire_requests);
    260 	KI(bytes_expired);
    261 	KI(out_discards);
    262 	KI(in_accelerated);
    263 	KI(out_accelerated);
    264 	KI(noaccel);
    265 	KI(crypto_sync);
    266 	KI(crypto_async);
    267 	KI(crypto_failures);
    268 	KI(bad_decrypt);
    269 	KI(sa_port_renumbers);
    270 
    271 #undef KI
    272 #undef K64
    273 
    274 	kstat_install(espstack->esp_ksp);
    275 
    276 	return (B_TRUE);
    277 }
    278 
    279 static int
    280 esp_kstat_update(kstat_t *kp, int rw)
    281 {
    282 	esp_kstats_t *ekp;
    283 	netstackid_t	stackid = (zoneid_t)(uintptr_t)kp->ks_private;
    284 	netstack_t	*ns;
    285 	ipsec_stack_t	*ipss;
    286 
    287 	if ((kp == NULL) || (kp->ks_data == NULL))
    288 		return (EIO);
    289 
    290 	if (rw == KSTAT_WRITE)
    291 		return (EACCES);
    292 
    293 	ns = netstack_find_by_stackid(stackid);
    294 	if (ns == NULL)
    295 		return (-1);
    296 	ipss = ns->netstack_ipsec;
    297 	if (ipss == NULL) {
    298 		netstack_rele(ns);
    299 		return (-1);
    300 	}
    301 	ekp = (esp_kstats_t *)kp->ks_data;
    302 
    303 	mutex_enter(&ipss->ipsec_alg_lock);
    304 	ekp->esp_stat_num_aalgs.value.ui64 =
    305 	    ipss->ipsec_nalgs[IPSEC_ALG_AUTH];
    306 	ekp->esp_stat_num_ealgs.value.ui64 =
    307 	    ipss->ipsec_nalgs[IPSEC_ALG_ENCR];
    308 	mutex_exit(&ipss->ipsec_alg_lock);
    309 
    310 	netstack_rele(ns);
    311 	return (0);
    312 }
    313 
    314 #ifdef DEBUG
    315 /*
    316  * Debug routine, useful to see pre-encryption data.
    317  */
    318 static char *
    319 dump_msg(mblk_t *mp)
    320 {
    321 	char tmp_str[3], tmp_line[256];
    322 
    323 	while (mp != NULL) {
    324 		unsigned char *ptr;
    325 
    326 		printf("mblk address 0x%p, length %ld, db_ref %d "
    327 		    "type %d, base 0x%p, lim 0x%p\n",
    328 		    (void *) mp, (long)(mp->b_wptr - mp->b_rptr),
    329 		    mp->b_datap->db_ref, mp->b_datap->db_type,
    330 		    (void *)mp->b_datap->db_base, (void *)mp->b_datap->db_lim);
    331 		ptr = mp->b_rptr;
    332 
    333 		tmp_line[0] = '\0';
    334 		while (ptr < mp->b_wptr) {
    335 			uint_t diff;
    336 
    337 			diff = (ptr - mp->b_rptr);
    338 			if (!(diff & 0x1f)) {
    339 				if (strlen(tmp_line) > 0) {
    340 					printf("bytes: %s\n", tmp_line);
    341 					tmp_line[0] = '\0';
    342 				}
    343 			}
    344 			if (!(diff & 0x3))
    345 				(void) strcat(tmp_line, " ");
    346 			(void) sprintf(tmp_str, "%02x", *ptr);
    347 			(void) strcat(tmp_line, tmp_str);
    348 			ptr++;
    349 		}
    350 		if (strlen(tmp_line) > 0)
    351 			printf("bytes: %s\n", tmp_line);
    352 
    353 		mp = mp->b_cont;
    354 	}
    355 
    356 	return ("\n");
    357 }
    358 
    359 #else /* DEBUG */
    360 static char *
    361 dump_msg(mblk_t *mp)
    362 {
    363 	printf("Find value of mp %p.\n", mp);
    364 	return ("\n");
    365 }
    366 #endif /* DEBUG */
    367 
    368 /*
    369  * Don't have to lock age_interval, as only one thread will access it at
    370  * a time, because I control the one function that does with timeout().
    371  */
    372 static void
    373 esp_ager(void *arg)
    374 {
    375 	ipsecesp_stack_t *espstack = (ipsecesp_stack_t *)arg;
    376 	netstack_t	*ns = espstack->ipsecesp_netstack;
    377 	hrtime_t begin = gethrtime();
    378 
    379 	sadb_ager(&espstack->esp_sadb.s_v4, espstack->esp_pfkey_q,
    380 	    espstack->esp_sadb.s_ip_q, espstack->ipsecesp_reap_delay, ns);
    381 	sadb_ager(&espstack->esp_sadb.s_v6, espstack->esp_pfkey_q,
    382 	    espstack->esp_sadb.s_ip_q, espstack->ipsecesp_reap_delay, ns);
    383 
    384 	espstack->esp_event = sadb_retimeout(begin, espstack->esp_pfkey_q,
    385 	    esp_ager, espstack,
    386 	    &espstack->ipsecesp_age_interval, espstack->ipsecesp_age_int_max,
    387 	    info.mi_idnum);
    388 }
    389 
    390 /*
    391  * Get an ESP NDD parameter.
    392  */
    393 /* ARGSUSED */
    394 static int
    395 ipsecesp_param_get(q, mp, cp, cr)
    396 	queue_t	*q;
    397 	mblk_t	*mp;
    398 	caddr_t	cp;
    399 	cred_t *cr;
    400 {
    401 	ipsecespparam_t	*ipsecesppa = (ipsecespparam_t *)cp;
    402 	uint_t value;
    403 	ipsecesp_stack_t	*espstack = (ipsecesp_stack_t *)q->q_ptr;
    404 
    405 	mutex_enter(&espstack->ipsecesp_param_lock);
    406 	value = ipsecesppa->ipsecesp_param_value;
    407 	mutex_exit(&espstack->ipsecesp_param_lock);
    408 
    409 	(void) mi_mpprintf(mp, "%u", value);
    410 	return (0);
    411 }
    412 
    413 /*
    414  * This routine sets an NDD variable in a ipsecespparam_t structure.
    415  */
    416 /* ARGSUSED */
    417 static int
    418 ipsecesp_param_set(q, mp, value, cp, cr)
    419 	queue_t	*q;
    420 	mblk_t	*mp;
    421 	char	*value;
    422 	caddr_t	cp;
    423 	cred_t *cr;
    424 {
    425 	ulong_t	new_value;
    426 	ipsecespparam_t	*ipsecesppa = (ipsecespparam_t *)cp;
    427 	ipsecesp_stack_t	*espstack = (ipsecesp_stack_t *)q->q_ptr;
    428 
    429 	/*
    430 	 * Fail the request if the new value does not lie within the
    431 	 * required bounds.
    432 	 */
    433 	if (ddi_strtoul(value, NULL, 10, &new_value) != 0 ||
    434 	    new_value < ipsecesppa->ipsecesp_param_min ||
    435 	    new_value > ipsecesppa->ipsecesp_param_max) {
    436 		return (EINVAL);
    437 	}
    438 
    439 	/* Set the new value */
    440 	mutex_enter(&espstack->ipsecesp_param_lock);
    441 	ipsecesppa->ipsecesp_param_value = new_value;
    442 	mutex_exit(&espstack->ipsecesp_param_lock);
    443 	return (0);
    444 }
    445 
    446 /*
    447  * Using lifetime NDD variables, fill in an extended combination's
    448  * lifetime information.
    449  */
    450 void
    451 ipsecesp_fill_defs(sadb_x_ecomb_t *ecomb, netstack_t *ns)
    452 {
    453 	ipsecesp_stack_t	*espstack = ns->netstack_ipsecesp;
    454 
    455 	ecomb->sadb_x_ecomb_soft_bytes = espstack->ipsecesp_default_soft_bytes;
    456 	ecomb->sadb_x_ecomb_hard_bytes = espstack->ipsecesp_default_hard_bytes;
    457 	ecomb->sadb_x_ecomb_soft_addtime =
    458 	    espstack->ipsecesp_default_soft_addtime;
    459 	ecomb->sadb_x_ecomb_hard_addtime =
    460 	    espstack->ipsecesp_default_hard_addtime;
    461 	ecomb->sadb_x_ecomb_soft_usetime =
    462 	    espstack->ipsecesp_default_soft_usetime;
    463 	ecomb->sadb_x_ecomb_hard_usetime =
    464 	    espstack->ipsecesp_default_hard_usetime;
    465 }
    466 
    467 /*
    468  * Initialize things for ESP at module load time.
    469  */
    470 boolean_t
    471 ipsecesp_ddi_init(void)
    472 {
    473 	esp_taskq = taskq_create("esp_taskq", 1, minclsyspri,
    474 	    IPSEC_TASKQ_MIN, IPSEC_TASKQ_MAX, 0);
    475 
    476 	/*
    477 	 * We want to be informed each time a stack is created or
    478 	 * destroyed in the kernel, so we can maintain the
    479 	 * set of ipsecesp_stack_t's.
    480 	 */
    481 	netstack_register(NS_IPSECESP, ipsecesp_stack_init, NULL,
    482 	    ipsecesp_stack_fini);
    483 
    484 	return (B_TRUE);
    485 }
    486 
    487 /*
    488  * Walk through the param array specified registering each element with the
    489  * named dispatch handler.
    490  */
    491 static boolean_t
    492 ipsecesp_param_register(IDP *ndp, ipsecespparam_t *espp, int cnt)
    493 {
    494 	for (; cnt-- > 0; espp++) {
    495 		if (espp->ipsecesp_param_name != NULL &&
    496 		    espp->ipsecesp_param_name[0]) {
    497 			if (!nd_load(ndp,
    498 			    espp->ipsecesp_param_name,
    499 			    ipsecesp_param_get, ipsecesp_param_set,
    500 			    (caddr_t)espp)) {
    501 				nd_free(ndp);
    502 				return (B_FALSE);
    503 			}
    504 		}
    505 	}
    506 	return (B_TRUE);
    507 }
    508 /*
    509  * Initialize things for ESP for each stack instance
    510  */
    511 static void *
    512 ipsecesp_stack_init(netstackid_t stackid, netstack_t *ns)
    513 {
    514 	ipsecesp_stack_t	*espstack;
    515 	ipsecespparam_t		*espp;
    516 
    517 	espstack = (ipsecesp_stack_t *)kmem_zalloc(sizeof (*espstack),
    518 	    KM_SLEEP);
    519 	espstack->ipsecesp_netstack = ns;
    520 
    521 	espp = (ipsecespparam_t *)kmem_alloc(sizeof (lcl_param_arr), KM_SLEEP);
    522 	espstack->ipsecesp_params = espp;
    523 	bcopy(lcl_param_arr, espp, sizeof (lcl_param_arr));
    524 
    525 	(void) ipsecesp_param_register(&espstack->ipsecesp_g_nd, espp,
    526 	    A_CNT(lcl_param_arr));
    527 
    528 	(void) esp_kstat_init(espstack, stackid);
    529 
    530 	espstack->esp_sadb.s_acquire_timeout =
    531 	    &espstack->ipsecesp_acquire_timeout;
    532 	espstack->esp_sadb.s_acqfn = esp_send_acquire;
    533 	sadbp_init("ESP", &espstack->esp_sadb, SADB_SATYPE_ESP, esp_hash_size,
    534 	    espstack->ipsecesp_netstack);
    535 
    536 	mutex_init(&espstack->ipsecesp_param_lock, NULL, MUTEX_DEFAULT, 0);
    537 
    538 	ip_drop_register(&espstack->esp_dropper, "IPsec ESP");
    539 	return (espstack);
    540 }
    541 
    542 /*
    543  * Destroy things for ESP at module unload time.
    544  */
    545 void
    546 ipsecesp_ddi_destroy(void)
    547 {
    548 	netstack_unregister(NS_IPSECESP);
    549 	taskq_destroy(esp_taskq);
    550 }
    551 
/*
 * Destroy things for ESP for one stack instance
 *
 * Per-stack destructor, registered with the netstack framework by
 * ipsecesp_ddi_init().  arg is the ipsecesp_stack_t returned by
 * ipsecesp_stack_init(); everything that function set up is released here
 * and the structure itself is freed last.
 */
static void
ipsecesp_stack_fini(netstackid_t stackid, void *arg)
{
	ipsecesp_stack_t *espstack = (ipsecesp_stack_t *)arg;

	/* Cancel the pending ager timeout, if a keysock queue is attached. */
	if (espstack->esp_pfkey_q != NULL) {
		(void) quntimeout(espstack->esp_pfkey_q, espstack->esp_event);
	}
	/* Unhook the ACQUIRE callbacks before tearing the SADB down. */
	espstack->esp_sadb.s_acqfn = NULL;
	espstack->esp_sadb.s_acquire_timeout = NULL;
	sadbp_destroy(&espstack->esp_sadb, espstack->ipsecesp_netstack);
	ip_drop_unregister(&espstack->esp_dropper);
	mutex_destroy(&espstack->ipsecesp_param_lock);
	/* Drop the NDD table before freeing the parameters it points at. */
	nd_free(&espstack->ipsecesp_g_nd);

	/* Same size as the allocation made in ipsecesp_stack_init(). */
	kmem_free(espstack->ipsecesp_params, sizeof (lcl_param_arr));
	espstack->ipsecesp_params = NULL;
	/* esp_ksp may be NULL here if esp_kstat_init() failed. */
	kstat_delete_netstack(espstack->esp_ksp, stackid);
	espstack->esp_ksp = NULL;
	espstack->esp_kstats = NULL;
	kmem_free(espstack, sizeof (*espstack));
}
    577 
    578 /*
    579  * ESP module open routine.
    580  */
    581 /* ARGSUSED */
    582 static int
    583 ipsecesp_open(queue_t *q, dev_t *devp, int flag, int sflag, cred_t *credp)
    584 {
    585 	netstack_t		*ns;
    586 	ipsecesp_stack_t	*espstack;
    587 
    588 	if (secpolicy_ip_config(credp, B_FALSE) != 0)
    589 		return (EPERM);
    590 
    591 	if (q->q_ptr != NULL)
    592 		return (0);  /* Re-open of an already open instance. */
    593 
    594 	if (sflag != MODOPEN)
    595 		return (EINVAL);
    596 
    597 	ns = netstack_find_by_cred(credp);
    598 	ASSERT(ns != NULL);
    599 	espstack = ns->netstack_ipsecesp;
    600 	ASSERT(espstack != NULL);
    601 
    602 	/*
    603 	 * ASSUMPTIONS (because I'm MT_OCEXCL):
    604 	 *
    605 	 *	* I'm being pushed on top of IP for all my opens (incl. #1).
    606 	 *	* Only ipsecesp_open() can write into esp_sadb.s_ip_q.
    607 	 *	* Because of this, I can check lazily for esp_sadb.s_ip_q.
    608 	 *
    609 	 *  If these assumptions are wrong, I'm in BIG trouble...
    610 	 */
    611 
    612 	q->q_ptr = espstack;
    613 	WR(q)->q_ptr = q->q_ptr;
    614 
    615 	if (espstack->esp_sadb.s_ip_q == NULL) {
    616 		struct T_unbind_req *tur;
    617 
    618 		espstack->esp_sadb.s_ip_q = WR(q);
    619 		/* Allocate an unbind... */
    620 		espstack->esp_ip_unbind = allocb(sizeof (struct T_unbind_req),
    621 		    BPRI_HI);
    622 
    623 		/*
    624 		 * Send down T_BIND_REQ to bind IPPROTO_ESP.
    625 		 * Handle the ACK here in ESP.
    626 		 */
    627 		qprocson(q);
    628 		if (espstack->esp_ip_unbind == NULL ||
    629 		    !sadb_t_bind_req(espstack->esp_sadb.s_ip_q, IPPROTO_ESP)) {
    630 			if (espstack->esp_ip_unbind != NULL) {
    631 				freeb(espstack->esp_ip_unbind);
    632 				espstack->esp_ip_unbind = NULL;
    633 			}
    634 			q->q_ptr = NULL;
    635 			netstack_rele(espstack->ipsecesp_netstack);
    636 			return (ENOMEM);
    637 		}
    638 
    639 		espstack->esp_ip_unbind->b_datap->db_type = M_PROTO;
    640 		tur = (struct T_unbind_req *)espstack->esp_ip_unbind->b_rptr;
    641 		tur->PRIM_type = T_UNBIND_REQ;
    642 	} else {
    643 		qprocson(q);
    644 	}
    645 
    646 	/*
    647 	 * For now, there's not much I can do.  I'll be getting a message
    648 	 * passed down to me from keysock (in my wput), and a T_BIND_ACK
    649 	 * up from IP (in my rput).
    650 	 */
    651 
    652 	return (0);
    653 }
    654 
/*
 * ESP module close routine.
 *
 * Undoes ipsecesp_open() for this instance: sends the preallocated
 * T_UNBIND_REQ if this instance is the stack's IP queue, turns off the
 * put procedures, detaches the keysock queue and its timeout if this is
 * that queue, tries to hand the IP-queue role to another instance, and
 * finally drops the netstack hold taken at open time.  Always returns 0.
 */
static int
ipsecesp_close(queue_t *q)
{
	ipsecesp_stack_t	*espstack = (ipsecesp_stack_t *)q->q_ptr;

	/*
	 * If esp_sadb.s_ip_q is attached to this instance, send a
	 * T_UNBIND_REQ to IP for the instance before doing
	 * a qprocsoff().
	 */
	if (WR(q) == espstack->esp_sadb.s_ip_q &&
	    espstack->esp_ip_unbind != NULL) {
		putnext(WR(q), espstack->esp_ip_unbind);
		/* The message now belongs to the module below. */
		espstack->esp_ip_unbind = NULL;
	}

	/*
	 * Turn off this instance's put procedures before touching the
	 * state they depend on.
	 */
	qprocsoff(q);

	/* Keysock queue check is safe, because of OCEXCL perimeter. */

	if (q == espstack->esp_pfkey_q) {
		esp1dbg(espstack,
		    ("ipsecesp_close:  Ummm... keysock is closing ESP.\n"));
		espstack->esp_pfkey_q = NULL;
		/* Detach qtimeouts. */
		(void) quntimeout(q, espstack->esp_event);
	}

	if (WR(q) == espstack->esp_sadb.s_ip_q) {
		/*
		 * If the esp_sadb.s_ip_q is attached to this instance, find
		 * another.  The OCEXCL outer perimeter helps us here.
		 */
		espstack->esp_sadb.s_ip_q = NULL;

		/*
		 * Find a replacement queue for esp_sadb.s_ip_q.
		 */
		if (espstack->esp_pfkey_q != NULL &&
		    espstack->esp_pfkey_q != RD(q)) {
			/*
			 * See if we can use the pfkey_q.
			 */
			espstack->esp_sadb.s_ip_q = WR(espstack->esp_pfkey_q);
		}

		/*
		 * Either no replacement was found, or the replacement could
		 * not send the bind request for IPPROTO_ESP: leave the
		 * stack without an IP queue.
		 */
		if (espstack->esp_sadb.s_ip_q == NULL ||
		    !sadb_t_bind_req(espstack->esp_sadb.s_ip_q, IPPROTO_ESP)) {
			esp1dbg(espstack, ("ipsecesp: Can't reassign ip_q.\n"));
			espstack->esp_sadb.s_ip_q = NULL;
		} else {
			/* Preallocate the unbind for the new IP queue. */
			espstack->esp_ip_unbind =
			    allocb(sizeof (struct T_unbind_req), BPRI_HI);

			if (espstack->esp_ip_unbind != NULL) {
				struct T_unbind_req *tur;

				espstack->esp_ip_unbind->b_datap->db_type =
				    M_PROTO;
				tur = (struct T_unbind_req *)
				    espstack->esp_ip_unbind->b_rptr;
				tur->PRIM_type = T_UNBIND_REQ;
			}
			/* If it's NULL, I can't do much here. */
		}
	}

	/* Release the hold taken by netstack_find_by_cred() at open. */
	netstack_rele(espstack->ipsecesp_netstack);
	return (0);
}
    731 
/*
 * Add a number of bytes to what the SA has protected so far.  Return
 * B_TRUE if the SA can still protect that many bytes.
 *
 * Caller must REFRELE the passed-in assoc.  This function must REFRELE
 * any obtained peer SA.
 *
 * "inbound" says which direction the passed-in assoc is used in.  When the
 * assoc has a peer, the peer is looked up in the opposite-direction table
 * and both SAs are aged by the same byte count; the result is B_TRUE only
 * if both can still protect that many bytes.  If the peer cannot be found,
 * only the passed-in assoc is aged.
 */
static boolean_t
esp_age_bytes(ipsa_t *assoc, uint64_t bytes, boolean_t inbound)
{
	ipsa_t *inassoc, *outassoc;
	isaf_t *bucket;
	boolean_t inrc, outrc, isv6;
	sadb_t *sp;
	int outhash;
	netstack_t		*ns = assoc->ipsa_netstack;
	ipsecesp_stack_t	*espstack = ns->netstack_ipsecesp;

	/* No peer?  No problem! */
	if (!assoc->ipsa_haspeer) {
		return (sadb_age_bytes(espstack->esp_pfkey_q, assoc, bytes,
		    B_TRUE));
	}

	/*
	 * Otherwise, we want to grab both the original assoc and its peer.
	 * There might be a race for this, but if it's a real race, two
	 * expire messages may occur.  We limit this by only sending the
	 * expire message on one of the peers, we'll pick the inbound
	 * arbitrarily.
	 *
	 * If we need tight synchronization on the peer SA, then we need to
	 * reconsider.
	 */

	/* Use the SA's address family to select the IPv6 or IPv4 table. */
	isv6 = (assoc->ipsa_addrfam == AF_INET6);
	sp = isv6 ? &espstack->esp_sadb.s_v6 : &espstack->esp_sadb.s_v4;

	if (inbound) {
		inassoc = assoc;
		/* The outbound table is hashed on the destination address. */
		if (isv6) {
			outhash = OUTBOUND_HASH_V6(sp, *((in6_addr_t *)
			    &inassoc->ipsa_dstaddr));
		} else {
			outhash = OUTBOUND_HASH_V4(sp, *((ipaddr_t *)
			    &inassoc->ipsa_dstaddr));
		}
		bucket = &sp->sdb_of[outhash];
		/* The bucket lock is held only for the lookup itself. */
		mutex_enter(&bucket->isaf_lock);
		outassoc = ipsec_getassocbyspi(bucket, inassoc->ipsa_spi,
		    inassoc->ipsa_srcaddr, inassoc->ipsa_dstaddr,
		    inassoc->ipsa_addrfam);
		mutex_exit(&bucket->isaf_lock);
		if (outassoc == NULL) {
			/* Q: Do we wish to set haspeer == B_FALSE? */
			esp0dbg(("esp_age_bytes: "
			    "can't find peer for inbound.\n"));
			return (sadb_age_bytes(espstack->esp_pfkey_q, inassoc,
			    bytes, B_TRUE));
		}
	} else {
		outassoc = assoc;
		/* The inbound table is bucketed by SPI. */
		bucket = INBOUND_BUCKET(sp, outassoc->ipsa_spi);
		mutex_enter(&bucket->isaf_lock);
		inassoc = ipsec_getassocbyspi(bucket, outassoc->ipsa_spi,
		    outassoc->ipsa_srcaddr, outassoc->ipsa_dstaddr,
		    outassoc->ipsa_addrfam);
		mutex_exit(&bucket->isaf_lock);
		if (inassoc == NULL) {
			/* Q: Do we wish to set haspeer == B_FALSE? */
			esp0dbg(("esp_age_bytes: "
			    "can't find peer for outbound.\n"));
			return (sadb_age_bytes(espstack->esp_pfkey_q, outassoc,
			    bytes, B_TRUE));
		}
	}

	/*
	 * Age both SAs.  NOTE(review): the last argument presumably selects
	 * whether an expire message may be sent (B_TRUE for the inbound SA
	 * only, per the comment above) -- confirm against sadb_age_bytes().
	 */
	inrc = sadb_age_bytes(espstack->esp_pfkey_q, inassoc, bytes, B_TRUE);
	outrc = sadb_age_bytes(espstack->esp_pfkey_q, outassoc, bytes, B_FALSE);

	/*
	 * REFRELE any peer SA.
	 *
	 * Because of the multi-line macro nature of IPSA_REFRELE, keep
	 * them in { }.
	 */
	if (inbound) {
		IPSA_REFRELE(outassoc);
	} else {
		IPSA_REFRELE(inassoc);
	}

	return (inrc && outrc);
}
    827 
    828 /*
    829  * Do incoming NAT-T manipulations for packet.
    830  */
    831 static ipsec_status_t
    832 esp_fix_natt_checksums(mblk_t *data_mp, ipsa_t *assoc)
    833 {
    834 	ipha_t *ipha = (ipha_t *)data_mp->b_rptr;
    835 	tcpha_t *tcph;
    836 	udpha_t *udpha;
    837 	/* Initialize to our inbound cksum adjustment... */
    838 	uint32_t sum = assoc->ipsa_inbound_cksum;
    839 
    840 	switch (ipha->ipha_protocol) {
    841 	case IPPROTO_TCP:
    842 		tcph = (tcpha_t *)(data_mp->b_rptr +
    843 		    IPH_HDR_LENGTH(ipha));
    844 
    845 #define	DOWN_SUM(x) (x) = ((x) & 0xFFFF) +	 ((x) >> 16)
    846 		sum += ~ntohs(tcph->tha_sum) & 0xFFFF;
    847 		DOWN_SUM(sum);
    848 		DOWN_SUM(sum);
    849 		tcph->tha_sum = ~htons(sum);
    850 		break;
    851 	case IPPROTO_UDP:
    852 		udpha = (udpha_t *)(data_mp->b_rptr + IPH_HDR_LENGTH(ipha));
    853 
    854 		if (udpha->uha_checksum != 0) {
    855 			/* Adujst if the inbound one was not zero. */
    856 			sum += ~ntohs(udpha->uha_checksum) & 0xFFFF;
    857 			DOWN_SUM(sum);
    858 			DOWN_SUM(sum);
    859 			udpha->uha_checksum = ~htons(sum);
    860 			if (udpha->uha_checksum == 0)
    861 				udpha->uha_checksum = 0xFFFF;
    862 		}
    863 #undef DOWN_SUM
    864 		break;
    865 	case IPPROTO_IP:
    866 		/*
    867 		 * This case is only an issue for self-encapsulated
    868 		 * packets.  So for now, fall through.
    869 		 */
    870 		break;
    871 	}
    872 	return (IPSEC_STATUS_SUCCESS);
    873 }
    874 
    875 
    876 /*
    877  * Strip ESP header, check padding, and fix IP header.
    878  * Returns B_TRUE on success, B_FALSE if an error occured.
    879  */
    880 static boolean_t
    881 esp_strip_header(mblk_t *data_mp, boolean_t isv4, uint32_t ivlen,
    882     kstat_named_t **counter, ipsecesp_stack_t *espstack)
    883 {
    884 	ipha_t *ipha;
    885 	ip6_t *ip6h;
    886 	uint_t divpoint;
    887 	mblk_t *scratch;
    888 	uint8_t nexthdr, padlen;
    889 	uint8_t lastpad;
    890 	ipsec_stack_t	*ipss = espstack->ipsecesp_netstack->netstack_ipsec;
    891 	uint8_t *lastbyte;
    892 
    893 	/*
    894 	 * Strip ESP data and fix IP header.
    895 	 *
    896 	 * XXX In case the beginning of esp_inbound() changes to not do a
    897 	 * pullup, this part of the code can remain unchanged.
    898 	 */
    899 	if (isv4) {
    900 		ASSERT((data_mp->b_wptr - data_mp->b_rptr) >= sizeof (ipha_t));
    901 		ipha = (ipha_t *)data_mp->b_rptr;
    902 		ASSERT((data_mp->b_wptr - data_mp->b_rptr) >= sizeof (esph_t) +
    903 		    IPH_HDR_LENGTH(ipha));
    904 		divpoint = IPH_HDR_LENGTH(ipha);
    905 	} else {
    906 		ASSERT((data_mp->b_wptr - data_mp->b_rptr) >= sizeof (ip6_t));
    907 		ip6h = (ip6_t *)data_mp->b_rptr;
    908 		divpoint = ip_hdr_length_v6(data_mp, ip6h);
    909 	}
    910 
    911 	scratch = data_mp;
    912 	while (scratch->b_cont != NULL)
    913 		scratch = scratch->b_cont;
    914 
    915 	ASSERT((scratch->b_wptr - scratch->b_rptr) >= 3);
    916 
    917 	/*
    918 	 * "Next header" and padding length are the last two bytes in the
    919 	 * ESP-protected datagram, thus the explicit - 1 and - 2.
    920 	 * lastpad is the last byte of the padding, which can be used for
    921 	 * a quick check to see if the padding is correct.
    922 	 */
    923 	lastbyte = scratch->b_wptr - 1;
    924 	nexthdr = *lastbyte--;
    925 	padlen = *lastbyte--;
    926 
    927 	if (isv4) {
    928 		/* Fix part of the IP header. */
    929 		ipha->ipha_protocol = nexthdr;
    930 		/*
    931 		 * Reality check the padlen.  The explicit - 2 is for the
    932 		 * padding length and the next-header bytes.
    933 		 */
    934 		if (padlen >= ntohs(ipha->ipha_length) - sizeof (ipha_t) - 2 -
    935 		    sizeof (esph_t) - ivlen) {
    936 			ESP_BUMP_STAT(espstack, bad_decrypt);
    937 			ipsec_rl_strlog(espstack->ipsecesp_netstack,
    938 			    info.mi_idnum, 0, 0,
    939 			    SL_ERROR | SL_WARN,
    940 			    "Corrupt ESP packet (padlen too big).\n");
    941 			esp1dbg(espstack, ("padlen (%d) is greater than:\n",
    942 			    padlen));
    943 			esp1dbg(espstack, ("pkt len(%d) - ip hdr - esp "
    944 			    "hdr - ivlen(%d) = %d.\n",
    945 			    ntohs(ipha->ipha_length), ivlen,
    946 			    (int)(ntohs(ipha->ipha_length) - sizeof (ipha_t) -
    947 			    2 - sizeof (esph_t) - ivlen)));
    948 			*counter = DROPPER(ipss, ipds_esp_bad_padlen);
    949 			return (B_FALSE);
    950 		}
    951 
    952 		/*
    953 		 * Fix the rest of the header.  The explicit - 2 is for the
    954 		 * padding length and the next-header bytes.
    955 		 */
    956 		ipha->ipha_length = htons(ntohs(ipha->ipha_length) - padlen -
    957 		    2 - sizeof (esph_t) - ivlen);
    958 		ipha->ipha_hdr_checksum = 0;
    959 		ipha->ipha_hdr_checksum = (uint16_t)ip_csum_hdr(ipha);
    960 	} else {
    961 		if (ip6h->ip6_nxt == IPPROTO_ESP) {
    962 			ip6h->ip6_nxt = nexthdr;
    963 		} else {
    964 			ip6_pkt_t ipp;
    965 
    966 			bzero(&ipp, sizeof (ipp));
    967 			(void) ip_find_hdr_v6(data_mp, ip6h, &ipp, NULL);
    968 			if (ipp.ipp_dstopts != NULL) {
    969 				ipp.ipp_dstopts->ip6d_nxt = nexthdr;
    970 			} else if (ipp.ipp_rthdr != NULL) {
    971 				ipp.ipp_rthdr->ip6r_nxt = nexthdr;
    972 			} else if (ipp.ipp_hopopts != NULL) {
    973 				ipp.ipp_hopopts->ip6h_nxt = nexthdr;
    974 			} else {
    975 				/* Panic a DEBUG kernel. */
    976 				ASSERT(ipp.ipp_hopopts != NULL);
    977 				/* Otherwise, pretend it's IP + ESP. */
    978