Home | History | Annotate | Download | only in ip
      1 /*
      2  * CDDL HEADER START
      3  *
      4  * The contents of this file are subject to the terms of the
      5  * Common Development and Distribution License (the "License").
      6  * You may not use this file except in compliance with the License.
      7  *
      8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
      9  * or http://www.opensolaris.org/os/licensing.
     10  * See the License for the specific language governing permissions
     11  * and limitations under the License.
     12  *
     13  * When distributing Covered Code, include this CDDL HEADER in each
     14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
     15  * If applicable, add the following below this CDDL HEADER, with the
     16  * fields enclosed by brackets "[]" replaced with your own identifying
     17  * information: Portions Copyright [yyyy] [name of copyright owner]
     18  *
     19  * CDDL HEADER END
     20  */
     21 /*
     22  * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
     23  * Use is subject to license terms.
     24  */
     25 
     26 #pragma ident	"%Z%%M%	%I%	%E% SMI"
     27 
     28 /*
     29  * IP interface to squeues.
     30  *
     31  * IP creates an squeue instance for each CPU. The squeue pointer is saved in
     32  * cpu_squeue field of the cpu structure. Each squeue is associated with a
     33  * connection instance (conn_t).
     34  *
     35  * For CPUs available at system startup time the squeue creation and association
     36  * with CPU happens at MP initialization time. For CPUs added during dynamic
     37  * reconfiguration, the initialization happens when the new CPU is configured in
     38  * the system. The squeue is chosen using IP_SQUEUE_GET macro which will either
     39  * return per-CPU squeue or random squeue based on the ip_squeue_fanout
     40  * variable.
     41  *
     42  * There are two modes of associating connection with squeues. The first mode
     43  * associates each connection with the CPU that creates the connection (either
     44  * during open time or during accept time). The second mode associates each
     45  * connection with a random CPU, effectively distributing load over all CPUs
     46  * and all squeues in the system. The mode is controlled by the
     47  * ip_squeue_fanout variable.
     48  *
     49  * NOTE: The fact that there is an association between each connection and
     50  * squeue and squeue and CPU does not mean that each connection is always
     51  * processed on this CPU and on this CPU only. Any thread calling squeue_enter()
     52  * may process the connection on whatever CPU it is scheduled. The squeue to CPU
     53  * binding is only relevant for the worker thread.
     54  *
     55  * The list of all created squeues is kept in squeue_set structure. This list is
     56  * used when ip_squeue_fanout is set and the load is distributed across all
     57  * squeues.
     58  *
     59  * INTERFACE:
     60  *
     61  * squeue_t *ip_squeue_get(hint)
     62  *
     63  * 	Find an squeue based on the 'hint' value. The hint is used as an index
     64  * 	in the array of IP squeues available. The way hint is computed may
     65  * 	affect the effectiveness of the squeue distribution. Currently squeues
     66  * 	are assigned in round-robin fashion using lbolt as a hint.
     67  *
     68  *
     69  * DR Notes
     70  * ========
     71  *
     72  * The ip_squeue_init() registers a call-back function with the CPU DR
     73  * subsystem using register_cpu_setup_func(). The call-back function does two
     74  * things:
     75  *
     76  * o When the CPU is going off-line or unconfigured, the worker thread is
     77  *	unbound from the CPU. This allows the CPU unconfig code to move it to
     78  *	another CPU.
     79  *
     80  * o When the CPU is going online, it creates a new squeue for this CPU if
     81  *	necessary and binds the squeue worker thread to this CPU.
     82  *
     83  * TUNABLES:
     84  *
     85  * ip_squeue_bind: if set to 1 each squeue worker thread is bound to the CPU
     86  * 	associated with an squeue instance.
     87  *
     88  * ip_squeue_profile: if set to 1 squeue profiling is enabled. NOTE: squeue.c
     89  *	should be compiled with SQUEUE_PROFILE enabled for this variable to have
     90  *	an impact.
     91  *
     92  * ip_squeue_fanout: if set to 1 use ip_squeue_get() to find an squeue,
     93  *	otherwise get it from CPU->cpu_squeue.
     94  *
     95  * ip_squeue_bind, ip_squeue_profile and ip_squeue_fanout can be accessed and
     96  * changed using ndd on /dev/tcp or /dev/ip.
     97  *
     98  * ip_squeue_worker_wait: global value for the sq_wait field for all squeues
     99  *	created. This is the time squeue code waits before waking up the worker
    100  *	thread after queuing a request.
    101  */
    102 
    103 #include <sys/types.h>
    104 #include <sys/debug.h>
    105 #include <sys/kmem.h>
    106 #include <sys/cpuvar.h>
    107 
    108 #include <sys/cmn_err.h>
    109 
    110 #include <inet/common.h>
    111 #include <inet/ip.h>
    112 #include <inet/ip_if.h>
    113 #include <inet/nd.h>
    114 #include <inet/ipclassifier.h>
    115 #include <sys/types.h>
    116 #include <sys/conf.h>
    117 #include <sys/sunddi.h>
    118 #include <sys/dlpi.h>
    119 #include <sys/squeue_impl.h>
    120 
    121 /*
    122  * We allow multiple NICs to bind to the same CPU but want to preserve 1 <-> 1
    123  * mapping between squeue and NIC (or Rx ring) for performance reasons so
    124  * each squeue can uniquely own a NIC or a Rx ring and do polling
    125  * (PSARC 2004/630). So we allow up to  MAX_SQUEUES_PER_CPU squeues per CPU.
    126  * We start by creating MIN_SQUEUES_PER_CPU squeues per CPU but more squeues
    127  * can be created dynamically as needed.
    128  */
    129 #define	MAX_SQUEUES_PER_CPU	32
    130 #define	MIN_SQUEUES_PER_CPU	1
    131 uint_t	ip_squeues_per_cpu = MIN_SQUEUES_PER_CPU;
    132 
    133 #define	IP_NUM_SOFT_RINGS	2
    134 uint_t ip_soft_rings_cnt = IP_NUM_SOFT_RINGS;
    135 
    136 /*
    137  * List of all created squeue sets. The size is protected by cpu_lock
    138  */
    139 squeue_set_t	**sqset_global_list;
    140 uint_t		sqset_global_size;
    141 
    142 int ip_squeue_bind = B_TRUE;
    143 int ip_squeue_profile = B_TRUE;
    144 static void (*ip_squeue_create_callback)(squeue_t *) = NULL;
    145 
    146 /*
    147  * ip_squeue_worker_wait: global value for the sq_wait field for all squeues
    148  *	created. This is the time squeue code waits before waking up the worker
    149  *	thread after queuing a request.
    150  */
    151 uint_t ip_squeue_worker_wait = 10;
    152 
    153 static squeue_set_t *ip_squeue_set_create(cpu_t *, boolean_t);
    154 static int ip_squeue_cpu_setup(cpu_setup_t, int, void *);
    155 
    156 static void ip_squeue_set_bind(squeue_set_t *);
    157 static void ip_squeue_set_unbind(squeue_set_t *);
    158 static squeue_t *ip_find_unused_squeue(squeue_set_t *, boolean_t);
    159 static void ip_squeue_clean(void *, mblk_t *, void *);
    160 static void ip_squeue_clean_ring(ill_t *, ill_rx_ring_t *);
    161 
    162 #define	CPU_ISON(c) (c != NULL && CPU_ACTIVE(c) && (c->cpu_flags & CPU_EXISTS))
    163 
    164 /*
    165  * Create squeue set containing ip_squeues_per_cpu number of squeues
    166  * for this CPU and bind them all to the CPU.
    167  */
    168 static squeue_set_t *
    169 ip_squeue_set_create(cpu_t *cp, boolean_t reuse)
    170 {
    171 	int i;
    172 	squeue_set_t	*sqs;
    173 	squeue_t 	*sqp;
    174 	char 		sqname[64];
    175 	processorid_t 	id = cp->cpu_id;
    176 
    177 	if (reuse) {
    178 		int i;
    179 
    180 		/*
    181 		 * We may already have an squeue created for this CPU. Try to
    182 		 * find one and reuse it if possible.
    183 		 */
    184 		for (i = 0; i < sqset_global_size; i++) {
    185 			sqs = sqset_global_list[i];
    186 			if (id == sqs->sqs_bind)
    187 				return (sqs);
    188 		}
    189 	}
    190 
    191 	sqs = kmem_zalloc(sizeof (squeue_set_t) +
    192 	    (sizeof (squeue_t *) * MAX_SQUEUES_PER_CPU), KM_SLEEP);
    193 	mutex_init(&sqs->sqs_lock, NULL, MUTEX_DEFAULT, NULL);
    194 	sqs->sqs_list = (squeue_t **)&sqs[1];
    195 	sqs->sqs_max_size = MAX_SQUEUES_PER_CPU;
    196 	sqs->sqs_bind = id;
    197 
    198 	for (i = 0; i < ip_squeues_per_cpu; i++) {
    199 		bzero(sqname, sizeof (sqname));
    200 
    201 		(void) snprintf(sqname, sizeof (sqname),
    202 		    "ip_squeue_cpu_%d/%d/%d", cp->cpu_seqid,
    203 		    cp->cpu_id, i);
    204 
    205 		sqp = squeue_create(sqname, id, ip_squeue_worker_wait,
    206 		    minclsyspri);
    207 
    208 		/*
    209 		 * The first squeue in each squeue_set is the DEFAULT
    210 		 * squeue.
    211 		 */
    212 		sqp->sq_state |= SQS_DEFAULT;
    213 
    214 		ASSERT(sqp != NULL);
    215 
    216 		squeue_profile_enable(sqp);
    217 		sqs->sqs_list[sqs->sqs_size++] = sqp;
    218 
    219 		if (ip_squeue_create_callback != NULL)
    220 			ip_squeue_create_callback(sqp);
    221 	}
    222 
    223 	if (ip_squeue_bind && cpu_is_online(cp))
    224 		ip_squeue_set_bind(sqs);
    225 
    226 	sqset_global_list[sqset_global_size++] = sqs;
    227 	ASSERT(sqset_global_size <= NCPU);
    228 	return (sqs);
    229 }
    230 
    231 /*
    232  * Initialize IP squeues.
    233  */
    234 void
    235 ip_squeue_init(void (*callback)(squeue_t *))
    236 {
    237 	int i;
    238 
    239 	ASSERT(sqset_global_list == NULL);
    240 
    241 	if (ip_squeues_per_cpu < MIN_SQUEUES_PER_CPU)
    242 		ip_squeues_per_cpu = MIN_SQUEUES_PER_CPU;
    243 	else if (ip_squeues_per_cpu > MAX_SQUEUES_PER_CPU)
    244 		ip_squeues_per_cpu = MAX_SQUEUES_PER_CPU;
    245 
    246 	ip_squeue_create_callback = callback;
    247 	squeue_init();
    248 	sqset_global_list =
    249 	    kmem_zalloc(sizeof (squeue_set_t *) * NCPU, KM_SLEEP);
    250 	sqset_global_size = 0;
    251 	mutex_enter(&cpu_lock);
    252 
    253 	/* Create squeue for each active CPU available */
    254 	for (i = 0; i < NCPU; i++) {
    255 		cpu_t *cp = cpu[i];
    256 		if (CPU_ISON(cp) && cp->cpu_squeue_set == NULL) {
    257 			cp->cpu_squeue_set = ip_squeue_set_create(cp, B_FALSE);
    258 		}
    259 	}
    260 
    261 	register_cpu_setup_func(ip_squeue_cpu_setup, NULL);
    262 
    263 	mutex_exit(&cpu_lock);
    264 
    265 	if (ip_squeue_profile)
    266 		squeue_profile_start();
    267 }
    268 
    269 /*
    270  * Get squeue_t structure based on index.
    271  * Since the squeue list can only grow, no need to grab any lock.
    272  */
    273 squeue_t *
    274 ip_squeue_random(uint_t index)
    275 {
    276 	squeue_set_t *sqs;
    277 
    278 	sqs = sqset_global_list[index % sqset_global_size];
    279 	return (sqs->sqs_list[index % sqs->sqs_size]);
    280 }
    281 
    282 /* ARGSUSED */
    283 static void
    284 ip_squeue_clean(void *arg1, mblk_t *mp, void *arg2)
    285 {
    286 	squeue_t	*sqp = arg2;
    287 	ill_rx_ring_t	*ring = (ill_rx_ring_t *)mp->b_wptr;
    288 	ill_t		*ill;
    289 
    290 	ASSERT(sqp != NULL);
    291 	mp->b_wptr = NULL;
    292 
    293 	if (ring == NULL) {
    294 		return;
    295 	}
    296 
    297 	/*
    298 	 * Clean up squeue
    299 	 */
    300 	mutex_enter(&sqp->sq_lock);
    301 	sqp->sq_state &= ~(SQS_ILL_BOUND|SQS_POLL_CAPAB);
    302 	sqp->sq_rx_ring = NULL;
    303 	mutex_exit(&sqp->sq_lock);
    304 
    305 	ill = ring->rr_ill;
    306 	if (ill->ill_capabilities & ILL_CAPAB_SOFT_RING) {
    307 		ASSERT(ring->rr_handle != NULL);
    308 		ill->ill_dls_capab->ill_dls_unbind(ring->rr_handle);
    309 	}
    310 
    311 	/*
    312 	 * Cleanup the ring
    313 	 */
    314 
    315 	ring->rr_blank = NULL;
    316 	ring->rr_handle = NULL;
    317 	ring->rr_sqp = NULL;
    318 
    319 	/*
    320 	 * Signal ill that cleanup is done
    321 	 */
    322 	mutex_enter(&ill->ill_lock);
    323 	ring->rr_ring_state = ILL_RING_FREE;
    324 	cv_signal(&ill->ill_cv);
    325 	mutex_exit(&ill->ill_lock);
    326 }
    327 
    328 /*
    329  * Clean up one squeue element. ill_inuse_ref is protected by ill_lock.
    330  * The real cleanup happens behind the squeue via ip_squeue_clean function but
    331  * we need to protect ourselves from 2 threads trying to cleanup at the same
    332  * time (possible with one port going down for aggr and someone tearing down the
    333  * entire aggr simultaneously). So we use ill_inuse_ref protected by ill_lock
    334  * to indicate when the cleanup has started (1 ref) and when the cleanup
    335  * is done (0 ref). When a new ring gets assigned to squeue, we start by
    336  * putting 2 ref on ill_inuse_ref.
    337  */
    338 static void
    339 ip_squeue_clean_ring(ill_t *ill, ill_rx_ring_t *rx_ring)
    340 {
    341 	conn_t *connp;
    342 	squeue_t *sqp;
    343 	mblk_t *mp;
    344 
    345 	ASSERT(rx_ring != NULL);
    346 
    347 	/* Just clean one squeue */
    348 	mutex_enter(&ill->ill_lock);
    349 	/*
    350 	 * Reset the ILL_SOFT_RING_ASSIGN bit so that
    351 	 * ip_squeue_soft_ring_affinty() will not go
    352 	 * ahead with assigning rings.
    353 	 */
    354 	ill->ill_state_flags &= ~ILL_SOFT_RING_ASSIGN;
    355 	while (rx_ring->rr_ring_state == ILL_RING_INPROC)
    356 		/* Some operations pending on the ring. Wait */
    357 		cv_wait(&ill->ill_cv, &ill->ill_lock);
    358 
    359 	if (rx_ring->rr_ring_state != ILL_RING_INUSE) {
    360 		/*
    361 		 * Someone already trying to clean
    362 		 * this squeue or it's already been cleaned.
    363 		 */
    364 		mutex_exit(&ill->ill_lock);
    365 		return;
    366 	}
    367 	sqp = rx_ring->rr_sqp;
    368 
    369 	if (sqp == NULL) {
    370 		/*
    371 		 * The rx_ring never had a squeue assigned to it.
    372 		 * We are under ill_lock so we can clean it up
    373 		 * here itself since no one can get to it.
    374 		 */
    375 		rx_ring->rr_blank = NULL;
    376 		rx_ring->rr_handle = NULL;
    377 		rx_ring->rr_sqp = NULL;
    378 		rx_ring->rr_ring_state = ILL_RING_FREE;
    379 		mutex_exit(&ill->ill_lock);
    380 		return;
    381 	}
    382 
    383 	/* Indicate that it's being cleaned */
    384 	rx_ring->rr_ring_state = ILL_RING_BEING_FREED;
    385 	ASSERT(sqp != NULL);
    386 	mutex_exit(&ill->ill_lock);
    387 
    388 	/*
    389 	 * Use the preallocated ill_unbind_conn for this purpose
    390 	 */
    391 	connp = ill->ill_dls_capab->ill_unbind_conn;
    392 
    393 	if (connp->conn_tcp->tcp_closemp.b_prev == NULL) {
    394 		connp->conn_tcp->tcp_closemp_used = B_TRUE;
    395 	} else {
    396 		cmn_err(CE_PANIC, "ip_squeue_clean_ring: "
    397 		    "concurrent use of tcp_closemp_used: connp %p tcp %p\n",
    398 		    (void *)connp, (void *)connp->conn_tcp);
    399 	}
    400 
    401 	TCP_DEBUG_GETPCSTACK(connp->conn_tcp->tcmp_stk, 15);
    402 	mp = &connp->conn_tcp->tcp_closemp;
    403 	CONN_INC_REF(connp);
    404 
    405 	/*
    406 	 * Since the field sq_rx_ring for default squeue is NULL,
    407 	 * ip_squeue_clean() will have no way to get the ring if we
    408 	 * don't pass the pointer to it. We use b_wptr to do so
    409 	 * as use of b_wptr for any other purpose is not expected.
    410 	 */
    411 
    412 	ASSERT(mp->b_wptr == NULL);
    413 	mp->b_wptr = (unsigned char *)rx_ring;
    414 	squeue_enter(sqp, mp, ip_squeue_clean, connp, NULL);
    415 
    416 	mutex_enter(&ill->ill_lock);
    417 	while (rx_ring->rr_ring_state != ILL_RING_FREE)
    418 		cv_wait(&ill->ill_cv, &ill->ill_lock);
    419 	mutex_exit(&ill->ill_lock);
    420 }
    421 
    422 void
    423 ip_squeue_clean_all(ill_t *ill)
    424 {
    425 	int idx;
    426 
    427 	/*
    428 	 * No need to clean if poll_capab isn't set for this ill
    429 	 */
    430 	if (!(ill->ill_capabilities & (ILL_CAPAB_POLL|ILL_CAPAB_SOFT_RING)))
    431 		return;
    432 
    433 	for (idx = 0; idx < ILL_MAX_RINGS; idx++) {
    434 		ill_rx_ring_t *ipr = &ill->ill_dls_capab->ill_ring_tbl[idx];
    435 
    436 		ip_squeue_clean_ring(ill, ipr);
    437 	}
    438 
    439 	ill->ill_capabilities &= ~(ILL_CAPAB_POLL|ILL_CAPAB_SOFT_RING);
    440 }
    441 
    442 typedef struct ip_taskq_arg {
    443 	ill_t		*ip_taskq_ill;
    444 	ill_rx_ring_t	*ip_taskq_ill_rx_ring;
    445 	cpu_t		*ip_taskq_cpu;
    446 } ip_taskq_arg_t;
    447 
    448 /*
    449  * Do a Rx ring to squeue binding. Find a unique squeue that is not
    450  * managing a receive ring. If no such squeue exists, dynamically
    451  * create a new one in the squeue set.
    452  *
    453  * The function runs via the system taskq. The ill passed as an
    454  * argument can't go away since we hold a ref. The lock order is
    455  * ill_lock -> sqs_lock -> sq_lock.
    456  *
    457  * If we are binding a Rx ring to a squeue attached to the offline CPU,
    458  * no need to check that because squeues are never destroyed once
    459  * created.
    460  */
    461 /* ARGSUSED */
    462 static void
    463 ip_squeue_extend(void *arg)
    464 {
    465 	ip_taskq_arg_t	*sq_arg = (ip_taskq_arg_t *)arg;
    466 	ill_t		*ill = sq_arg->ip_taskq_ill;
    467 	ill_rx_ring_t	*ill_rx_ring = sq_arg->ip_taskq_ill_rx_ring;
    468 	cpu_t		*intr_cpu = sq_arg->ip_taskq_cpu;
    469 	squeue_set_t 	*sqs;
    470 	squeue_t 	*sqp = NULL;
    471 
    472 	ASSERT(ill != NULL);
    473 	ASSERT(ill_rx_ring != NULL);
    474 	kmem_free(arg, sizeof (ip_taskq_arg_t));
    475 
    476 	/*
    477 	 * Make sure the CPU that originally took the interrupt still
    478 	 * exists.
    479 	 */
    480 	if (!CPU_ISON(intr_cpu))
    481 		intr_cpu = CPU;
    482 
    483 	sqs = intr_cpu->cpu_squeue_set;
    484 
    485 	/*
    486 	 * If this ill represents link aggregation, then there might be
    487 	 * multiple NICs trying to register them selves at the same time
    488 	 * and in order to ensure that test and assignment of free rings
    489 	 * is sequential, we need to hold the ill_lock.
    490 	 */
    491 	mutex_enter(&ill->ill_lock);
    492 	sqp = ip_find_unused_squeue(sqs, B_FALSE);
    493 	if (sqp == NULL) {
    494 		/*
    495 		 * We hit the max limit of squeues allowed per CPU.
    496 		 * Assign this rx_ring to DEFAULT squeue of the
    497 		 * interrupted CPU but the squeue will not manage
    498 		 * the ring. Also print a warning.
    499 		 */
    500 		cmn_err(CE_NOTE, "ip_squeue_extend: CPU/sqset = %d/%p already "
    501 		    "has max number of squeues. System performance might "
    502 		    "become suboptimal\n", sqs->sqs_bind, (void *)sqs);
    503 
    504 		/* the first squeue in the list is the default squeue */
    505 		sqp = sqs->sqs_list[0];
    506 		ASSERT(sqp != NULL);
    507 		ill_rx_ring->rr_sqp = sqp;
    508 		ill_rx_ring->rr_ring_state = ILL_RING_INUSE;
    509 
    510 		mutex_exit(&ill->ill_lock);
    511 		ill_waiter_dcr(ill);
    512 		return;
    513 	}
    514 
    515 	ASSERT(MUTEX_HELD(&sqp->sq_lock));
    516 	sqp->sq_rx_ring = ill_rx_ring;
    517 	ill_rx_ring->rr_sqp = sqp;
    518 	ill_rx_ring->rr_ring_state = ILL_RING_INUSE;
    519 
    520 	sqp->sq_state |= (SQS_ILL_BOUND|SQS_POLL_CAPAB);
    521 	mutex_exit(&sqp->sq_lock);
    522 
    523 	mutex_exit(&ill->ill_lock);
    524 
    525 	/* ill_waiter_dcr will also signal any waiters on ill_ring_state */
    526 	ill_waiter_dcr(ill);
    527 }
    528 
    529 /*
    530  * Do a Rx ring to squeue binding. Find a unique squeue that is not
    531  * managing a receive ring. If no such squeue exists, dynamically
    532  * create a new one in the squeue set.
    533  *
    534  * The function runs via the system taskq. The ill passed as an
    535  * argument can't go away since we hold a ref. The lock order is
    536  * ill_lock -> sqs_lock -> sq_lock.
    537  *
    538  * If we are binding a Rx ring to a squeue attached to the offline CPU,
    539  * no need to check that because squeues are never destroyed once
    540  * created.
    541  */
    542 /* ARGSUSED */
    543 static void
    544 ip_squeue_soft_ring_affinity(void *arg)
    545 {
    546 	ip_taskq_arg_t		*sq_arg = (ip_taskq_arg_t *)arg;
    547 	ill_t			*ill = sq_arg->ip_taskq_ill;
    548 	ill_dls_capab_t	*ill_soft_ring = ill->ill_dls_capab;
    549 	ill_rx_ring_t		*ill_rx_ring = sq_arg->ip_taskq_ill_rx_ring;
    550 	cpu_t			*intr_cpu = sq_arg->ip_taskq_cpu;
    551 	cpu_t			*bind_cpu;
    552 	int			cpu_id = intr_cpu->cpu_id;
    553 	int			min_cpu_id, max_cpu_id;
    554 	boolean_t		enough_uniq_cpus = B_FALSE;
    555 	boolean_t		enough_cpus = B_FALSE;
    556 	squeue_set_t 		*sqs, *last_sqs;
    557 	squeue_t 		*sqp = NULL;
    558 	int			i, j;
    559 
    560 	ASSERT(ill != NULL);
    561 	kmem_free(arg, sizeof (ip_taskq_arg_t));
    562 
    563 	/*
    564 	 * Make sure the CPU that originally took the interrupt still
    565 	 * exists.
    566 	 */
    567 	if (!CPU_ISON(intr_cpu)) {
    568 		intr_cpu = CPU;
    569 		cpu_id = intr_cpu->cpu_id;
    570 	}
    571 
    572 	/*
    573 	 * If this ill represents link aggregation, then there might be
    574 	 * multiple NICs trying to register them selves at the same time
    575 	 * and in order to ensure that test and assignment of free rings
    576 	 * is sequential, we need to hold the ill_lock.
    577 	 */
    578 	mutex_enter(&ill->ill_lock);
    579 
    580 	if (!(ill->ill_state_flags & ILL_SOFT_RING_ASSIGN)) {
    581 		mutex_exit(&ill->ill_lock);
    582 		return;
    583 	}
    584 	/*
    585 	 * We need to fanout the interrupts from the NIC. We do that by
    586 	 * telling the driver underneath to create soft rings and use
    587 	 * worker threads (if the driver advertized SOFT_RING capability)
    588 	 * Its still a big performance win to if we can fanout to the
    589 	 * threads on the same core that is taking interrupts.
    590 	 *
    591 	 * Since we don't know the interrupt to CPU binding, we don't
    592 	 * assign any squeues or affinity to worker threads in the NIC.
    593 	 * At the time of the first interrupt, we know which CPU is
    594 	 * taking interrupts and try to find other threads on the same
    595 	 * core. Assuming, ip_threads_per_cpu is correct and cpus are
    596 	 * numbered sequentially for each core (XXX need something better
    597 	 * than this in future), find the lowest number and highest
    598 	 * number thread for that core.
    599 	 *
    600 	 * If we have one more thread per core than number of soft rings,
    601 	 * then don't assign any worker threads to the H/W thread (cpu)
    602 	 * taking interrupts (capability negotiation tries to ensure this)
    603 	 *
    604 	 * If the number of threads per core are same as the number of
    605 	 * soft rings, then assign the worker affinity and squeue to
    606 	 * the same cpu.
    607 	 *
    608 	 * Otherwise, just fanout to higher number CPUs starting from
    609 	 * the interrupted CPU.
    610 	 */
    611 
    612 	min_cpu_id = (cpu_id / ip_threads_per_cpu) * ip_threads_per_cpu;
    613 	max_cpu_id = min_cpu_id + ip_threads_per_cpu;
    614 
    615 	/*
    616 	 * Quickly check if there are enough CPUs present for fanout
    617 	 * and also max_cpu_id is less than the id of the active CPU.
    618 	 * We use the cpu_id stored in the last squeue_set to get
    619 	 * an idea. The scheme is by no means perfect since it doesn't
    620 	 * take into account CPU DR operations and the fact that
    621 	 * interrupts themselves might change. An ideal scenario
    622 	 * would be to ensure that interrupts run cpus by themselves
    623 	 * and worker threads never have affinity to those CPUs. If
    624 	 * the interrupts move to CPU which had a worker thread, it
    625 	 * should be changed. Probably callbacks similar to CPU offline
    626 	 * are needed to make it work perfectly.
    627 	 */
    628 	last_sqs = sqset_global_list[sqset_global_size - 1];
    629 	if (ip_threads_per_cpu <= ncpus && max_cpu_id <= last_sqs->sqs_bind) {
    630 		if ((max_cpu_id - min_cpu_id) >
    631 		    ill_soft_ring->ill_dls_soft_ring_cnt)
    632 			enough_uniq_cpus = B_TRUE;
    633 		else if ((max_cpu_id - min_cpu_id) >=
    634 		    ill_soft_ring->ill_dls_soft_ring_cnt)
    635 			enough_cpus = B_TRUE;
    636 	}
    637 
    638 	j = 0;
    639 	for (i = 0; i < (ill_soft_ring->ill_dls_soft_ring_cnt + j); i++) {
    640 		if (enough_uniq_cpus) {
    641 			if ((min_cpu_id + i) == cpu_id) {
    642 				j++;
    643 				continue;
    644 			}
    645 			bind_cpu = cpu[min_cpu_id + i];
    646 		} else if (enough_cpus) {
    647 			bind_cpu = cpu[min_cpu_id + i];
    648 		} else {
    649 			/* bind_cpu = cpu[(cpu_id + i) % last_sqs->sqs_bind]; */
    650 			bind_cpu = cpu[(cpu_id + i) % ncpus];
    651 		}
    652 
    653 		/*
    654 		 * Check if the CPU actually exist and active. If not,
    655 		 * use the interrupted CPU. ip_find_unused_squeue() will
    656 		 * find the right CPU to fanout anyway.
    657 		 */
    658 		if (!CPU_ISON(bind_cpu))
    659 			bind_cpu = intr_cpu;
    660 
    661 		sqs = bind_cpu->cpu_squeue_set;
    662 		ASSERT(sqs != NULL);
    663 		ill_rx_ring = &ill_soft_ring->ill_ring_tbl[i - j];
    664 
    665 		sqp = ip_find_unused_squeue(sqs, B_TRUE);
    666 		if (sqp == NULL) {
    667 			/*
    668 			 * We hit the max limit of squeues allowed per CPU.
    669 			 * Assign this rx_ring to DEFAULT squeue of the
    670 			 * interrupted CPU but thesqueue will not manage
    671 			 * the ring. Also print a warning.
    672 			 */
    673 			cmn_err(CE_NOTE, "ip_squeue_soft_ring: CPU/sqset = "
    674 			    "%d/%p already has max number of squeues. System "
    675 			    "performance might become suboptimal\n",
    676 			    sqs->sqs_bind, (void *)sqs);
    677 
    678 			/* the first squeue in the list is the default squeue */
    679 			sqp = intr_cpu->cpu_squeue_set->sqs_list[0];
    680 			ASSERT(sqp != NULL);
    681 
    682 			ill_rx_ring->rr_sqp = sqp;
    683 			ill_rx_ring->rr_ring_state = ILL_RING_INUSE;
    684 			continue;
    685 
    686 		}
    687 		ASSERT(MUTEX_HELD(&sqp->sq_lock));
    688 		ill_rx_ring->rr_sqp = sqp;
    689 		sqp->sq_rx_ring = ill_rx_ring;
    690 		ill_rx_ring->rr_ring_state = ILL_RING_INUSE;
    691 		sqp->sq_state |= SQS_ILL_BOUND;
    692 
    693 		/* assign affinity to soft ring */
    694 		if (ip_squeue_bind && (sqp->sq_state & SQS_BOUND)) {
    695 			ill_soft_ring->ill_dls_bind(ill_rx_ring->rr_handle,
    696 			    sqp->sq_bind);
    697 		}
    698 		mutex_exit(&sqp->sq_lock);
    699 	}
    700 	mutex_exit(&ill->ill_lock);
    701 
    702 	ill_soft_ring->ill_dls_change_status(ill_soft_ring->ill_tx_handle,
    703 	    SOFT_RING_FANOUT);
    704 
    705 	mutex_enter(&ill->ill_lock);
    706 	ill->ill_state_flags &= ~ILL_SOFT_RING_ASSIGN;
    707 	mutex_exit(&ill->ill_lock);
    708 
    709 	/* ill_waiter_dcr will also signal any waiters on ill_ring_state */
    710 	ill_waiter_dcr(ill);
    711 }
    712 
    713 /* ARGSUSED */
    714 void
    715 ip_soft_ring_assignment(ill_t *ill, ill_rx_ring_t *ip_ring,
    716     mblk_t *mp_chain, struct mac_header_info_s *mhip)
    717 {
    718 	ip_taskq_arg_t	*taskq_arg;
    719 	boolean_t	refheld;
    720 
    721 	mutex_enter(&ill->ill_lock);
    722 	if (!(ill->ill_state_flags & ILL_SOFT_RING_ASSIGN)) {
    723 		taskq_arg = (ip_taskq_arg_t *)
    724 		    kmem_zalloc(sizeof (ip_taskq_arg_t), KM_NOSLEEP);
    725 
    726 		if (taskq_arg == NULL)
    727 			goto out;
    728 
    729 		taskq_arg->ip_taskq_ill = ill;
    730 		taskq_arg->ip_taskq_ill_rx_ring = NULL;
    731 		taskq_arg->ip_taskq_cpu = CPU;
    732 
    733 		/*
    734 		 * Set ILL_SOFT_RING_ASSIGN flag. We don't want
    735 		 * the next interrupt to schedule a task for calling
    736 		 * ip_squeue_soft_ring_affinity();
    737 		 */
    738 		ill->ill_state_flags |= ILL_SOFT_RING_ASSIGN;
    739 	} else {
    740 		mutex_exit(&ill->ill_lock);
    741 		goto out;
    742 	}
    743 	mutex_exit(&ill->ill_lock);
    744 	refheld = ill_waiter_inc(ill);
    745 	if (refheld) {
    746 		if (taskq_dispatch(system_taskq,
    747 		    ip_squeue_soft_ring_affinity, taskq_arg, TQ_NOSLEEP))
    748 			goto out;
    749 
    750 		/* release ref on ill if taskq dispatch fails */
    751 		ill_waiter_dcr(ill);
    752 	}
    753 	/*
    754 	 * Turn on CAPAB_SOFT_RING so that affinity assignment
    755 	 * can be tried again later.
    756 	 */
    757 	mutex_enter(&ill->ill_lock);
    758 	ill->ill_state_flags &= ~ILL_SOFT_RING_ASSIGN;
    759 	mutex_exit(&ill->ill_lock);
    760 	kmem_free(taskq_arg, sizeof (ip_taskq_arg_t));
    761 
    762 out:
    763 	ip_input(ill, NULL, mp_chain, mhip);
    764 }
    765 
    766 static squeue_t *
    767 ip_find_unused_squeue(squeue_set_t *sqs, boolean_t fanout)
    768 {
    769 	int 		i;
    770 	squeue_set_t	*best_sqs = NULL;
    771 	squeue_set_t	*curr_sqs = NULL;
    772 	int		min_sq = 0;
    773 	squeue_t 	*sqp = NULL;
    774 	char		sqname[64];
    775 	cpu_t		*bind_cpu;
    776 
    777 	/*
    778 	 * If fanout is set and the passed squeue_set already has some
    779 	 * squeues which are managing the NICs, try to find squeues on
    780 	 * unused CPU.
    781 	 */
    782 	if (sqs->sqs_size > 1 && fanout) {
    783 		/*
    784 		 * First check to see if any squeue on the CPU passed
    785 		 * is managing a NIC.
    786 		 */
    787 		for (i = 0; i < sqs->sqs_size; i++) {
    788 			mutex_enter(&sqs->sqs_list[i]->sq_lock);
    789 			if ((sqs->sqs_list[i]->sq_state & SQS_ILL_BOUND) &&
    790 			    !(sqs->sqs_list[i]->sq_state & SQS_DEFAULT)) {
    791 				mutex_exit(&sqs->sqs_list[i]->sq_lock);
    792 				break;
    793 			}
    794 			mutex_exit(&sqs->sqs_list[i]->sq_lock);
    795 		}
    796 		if (i != sqs->sqs_size) {
    797 			best_sqs = NULL;
    798 
    799 			for (i = sqset_global_size - 1; i >= 0; i--) {
    800 				curr_sqs = sqset_global_list[i];
    801 				/*
    802 				 * Check and make sure the CPU that sqs
    803 				 * is bound to is valid. There could be
    804 				 * sqs's around whose CPUs could have
    805 				 * been DR'd out.
    806 				 */
    807 				mutex_enter(&cpu_lock);
    808 				if (cpu_get(curr_sqs->sqs_bind) != NULL) {
    809 					if (best_sqs == NULL) {
    810 						best_sqs = curr_sqs;
    811 						min_sq = curr_sqs->sqs_size;
    812 					} else if (curr_sqs->sqs_size <
    813 					    min_sq) {
    814 						best_sqs = curr_sqs;
    815 						min_sq = curr_sqs->sqs_size;
    816 					}
    817 				}
    818 				mutex_exit(&cpu_lock);
    819 			}
    820 
    821 			ASSERT(best_sqs != NULL);
    822 			sqs = best_sqs;
    823 		}
    824 	}
    825 
    826 	mutex_enter(&sqs->sqs_lock);
    827 
    828 	for (i = 0; i < sqs->sqs_size; i++) {
    829 		mutex_enter(&sqs->sqs_list[i]->sq_lock);
    830 		if ((sqs->sqs_list[i]->sq_state &
    831 		    (SQS_DEFAULT|SQS_ILL_BOUND)) == 0) {
    832 			sqp = sqs->sqs_list[i];
    833 			break;
    834 		}
    835 		mutex_exit(&sqs->sqs_list[i]->sq_lock);
    836 	}
    837 
    838 	if (sqp == NULL) {
    839 		/* Need to create a new squeue */
    840 		if (sqs->sqs_size == sqs->sqs_max_size) {
    841 			/*
    842 			 * Reached the max limit for squeue
    843 			 * we can allocate on this CPU.
    844 			 */
    845 			mutex_exit(&sqs->sqs_lock);
    846 			return (NULL);
    847 		}
    848 
    849 		mutex_enter(&cpu_lock);
    850 		if ((bind_cpu = cpu_get(sqs->sqs_bind)) == NULL) {
    851 			/* Too bad, CPU got DR'd out, return NULL */
    852 			mutex_exit(&cpu_lock);
    853 			mutex_exit(&sqs->sqs_lock);
    854 			return (NULL);
    855 		}
    856 
    857 		bzero(sqname, sizeof (sqname));
    858 		(void) snprintf(sqname, sizeof (sqname),
    859 		    "ip_squeue_cpu_%d/%d/%d", bind_cpu->cpu_seqid,
    860 		    bind_cpu->cpu_id, sqs->sqs_size);
    861 		mutex_exit(&cpu_lock);
    862 
    863 		sqp = squeue_create(sqname, sqs->sqs_bind,
    864 		    ip_squeue_worker_wait, minclsyspri);
    865 
    866 		ASSERT(sqp != NULL);
    867 
    868 		squeue_profile_enable(sqp);
    869 		sqs->sqs_list[sqs->sqs_size++] = sqp;
    870 
    871 		if (ip_squeue_create_callback != NULL)
    872 			ip_squeue_create_callback(sqp);
    873 
    874 		if (ip_squeue_bind) {
    875 			mutex_enter(&cpu_lock);
    876 			bind_cpu = cpu_get(sqs->sqs_bind);
    877 			if (bind_cpu != NULL && cpu_is_online(bind_cpu)) {
    878 				squeue_bind(sqp, -1);
    879 			}
    880 			mutex_exit(&cpu_lock);
    881 		}
    882 		mutex_enter(&sqp->sq_lock);
    883 	}
    884 
    885 	mutex_exit(&sqs->sqs_lock);
    886 	ASSERT(sqp != NULL);
    887 	return (sqp);
    888 }
    889 
    890 /*
    891  * Find the squeue assigned to manage this Rx ring. If the Rx ring is not
    892  * owned by a squeue yet, do the assignment. When the NIC registers it
    893  * Rx rings with IP, we don't know where the interrupts will land and
    894  * hence we need to wait till this point to do the assignment.
    895  */
    896 squeue_t *
    897 ip_squeue_get(ill_rx_ring_t *ill_rx_ring)
    898 {
    899 	squeue_t 	*sqp;
    900 	ill_t 		*ill;
    901 	int		interrupt;
    902 	ip_taskq_arg_t	*taskq_arg;
    903 	boolean_t	refheld;
    904 
    905 	if (ill_rx_ring == NULL)
    906 		return (IP_SQUEUE_GET(lbolt));
    907 
    908 	sqp = ill_rx_ring->rr_sqp;
    909 	/*
    910 	 * Do a quick check. If it's not NULL, we are done.
    911 	 * Squeues are never destroyed so worse we will bind
    912 	 * this connection to a suboptimal squeue.
    913 	 *
    914 	 * This is the fast path case.
    915 	 */
    916 	if (sqp != NULL)
    917 		return (sqp);
    918 
    919 	ill = ill_rx_ring->rr_ill;
    920 	ASSERT(ill != NULL);
    921 
    922 	interrupt = servicing_interrupt();
    923 	taskq_arg = (ip_taskq_arg_t *)kmem_zalloc(sizeof (ip_taskq_arg_t),
    924 	    KM_NOSLEEP);
    925 
    926 	mutex_enter(&ill->ill_lock);
    927 	/*
    928 	 * Check sqp under the lock again for atomicity. Possible race with
    929 	 * a previously scheduled ip_squeue_get -> ip_squeue_extend.
    930 	 * Do the ring to squeue binding only if we are in interrupt context
    931 	 * AND the ring is not already bound AND there is no one else trying
    932 	 * the bind already.
    933 	 */
    934 	sqp = ill_rx_ring->rr_sqp;
    935 	if (sqp != NULL || !interrupt ||
    936 	    ill_rx_ring->rr_ring_state != ILL_RING_INUSE || taskq_arg == NULL) {
    937 		/*
    938 		 * Note that the ring might get bound once we drop the lock
    939 		 * below, if a previous request is in progress i.e. if the ring
    940 		 * state is ILL_RING_INPROC. The incoming connection on whose
    941 		 * behalf we are currently here might get a suboptimal squeue
    942 		 * via the call to IP_SQUEUE_GET below, but there is no
    943 		 * correctness issue.
    944 		 */
    945 		mutex_exit(&ill->ill_lock);
    946 		if (taskq_arg != NULL)
    947 			kmem_free(taskq_arg, sizeof (ip_taskq_arg_t));
    948 		if (sqp != NULL)
    949 			return (sqp);
    950 		return (IP_SQUEUE_GET(lbolt));
    951 	}
    952 
    953 	/*
    954 	 * No sqp assigned yet. Can't really do that in interrupt
    955 	 * context. Assign the default sqp to this connection and
    956 	 * trigger creation of new sqp and binding it to this ring
    957 	 * via taskq. Need to make sure ill stays around.
    958 	 */
    959 	taskq_arg->ip_taskq_ill = ill;
    960 	taskq_arg->ip_taskq_ill_rx_ring = ill_rx_ring;
    961 	taskq_arg->ip_taskq_cpu = CPU;
    962 	ill_rx_ring->rr_ring_state = ILL_RING_INPROC;
    963 	mutex_exit(&ill->ill_lock);
    964 	refheld = ill_waiter_inc(ill);
    965 	if (refheld) {
    966 		if (taskq_dispatch(system_taskq, ip_squeue_extend,
    967 		    taskq_arg, TQ_NOSLEEP) != NULL) {
    968 			return (IP_SQUEUE_GET(lbolt));
    969 		}
    970 	}
    971 	/*
    972 	 * The ill is closing and we could not get a reference on the ill OR
    973 	 * taskq_dispatch failed probably due to memory allocation failure.
    974 	 * We will try again next time.
    975 	 */
    976 	mutex_enter(&ill->ill_lock);
    977 	ill_rx_ring->rr_ring_state = ILL_RING_INUSE;
    978 	mutex_exit(&ill->ill_lock);
    979 	kmem_free(taskq_arg, sizeof (ip_taskq_arg_t));
    980 	if (refheld)
    981 		ill_waiter_dcr(ill);
    982 
    983 	return (IP_SQUEUE_GET(lbolt));
    984 }
    985 
    986 /*
    987  * NDD hooks for setting ip_squeue_xxx tuneables.
    988  */
    989 
    990 /* ARGSUSED */
    991 int
    992 ip_squeue_bind_set(queue_t *q, mblk_t *mp, char *value,
    993     caddr_t addr, cred_t *cr)
    994 {
    995 	int *bind_enabled = (int *)addr;
    996 	long new_value;
    997 	int i;
    998 
    999 	if (ddi_strtol(value, NULL, 10, &new_value) != 0)
   1000 		return (EINVAL);
   1001 
   1002 	if (ip_squeue_bind == new_value)
   1003 		return (0);
   1004 
   1005 	*bind_enabled = new_value;
   1006 	mutex_enter(&cpu_lock);
   1007 	if (new_value == 0) {
   1008 		for (i = 0; i < sqset_global_size; i++)
   1009 			ip_squeue_set_unbind(sqset_global_list[i]);
   1010 	} else {
   1011 		for (i = 0; i < sqset_global_size; i++)
   1012 			ip_squeue_set_bind(sqset_global_list[i]);
   1013 	}
   1014 
   1015 	mutex_exit(&cpu_lock);
   1016 	return (0);
   1017 }
   1018 
   1019 /*
   1020  * Set squeue profiling.
   1021  * 0 means "disable"
   1022  * 1 means "enable"
   1023  * 2 means "enable and reset"
   1024  */
   1025 /* ARGSUSED */
   1026 int
   1027 ip_squeue_profile_set(queue_t *q, mblk_t *mp, char *value, caddr_t cp,
   1028     cred_t *cr)
   1029 {
   1030 	int *profile_enabled = (int *)cp;
   1031 	long new_value;
   1032 	squeue_set_t *sqs;
   1033 
   1034 	if (ddi_strtol(value, NULL, 10, &new_value) != 0)
   1035 		return (EINVAL);
   1036 
   1037 	if (new_value == 0)
   1038 		squeue_profile_stop();
   1039 	else if (new_value == 1)
   1040 		squeue_profile_start();
   1041 	else if (new_value == 2) {
   1042 		int i, j;
   1043 
   1044 		squeue_profile_stop();
   1045 		mutex_enter(&cpu_lock);
   1046 		for (i = 0; i < sqset_global_size; i++) {
   1047 			sqs = sqset_global_list[i];
   1048 			for (j = 0; j < sqs->sqs_size; j++) {
   1049 				squeue_profile_reset(sqs->sqs_list[j]);
   1050 			}
   1051 		}
   1052 		mutex_exit(&cpu_lock);
   1053 
   1054 		new_value = 1;
   1055 		squeue_profile_start();
   1056 	}
   1057 	*profile_enabled = new_value;
   1058 
   1059 	return (0);
   1060 }
   1061 
   1062 /*
   1063  * Reconfiguration callback
   1064  */
   1065 
   1066 /* ARGSUSED */
   1067 static int
   1068 ip_squeue_cpu_setup(cpu_setup_t what, int id, void *arg)
   1069 {
   1070 	cpu_t *cp = cpu[id];
   1071 
   1072 	ASSERT(MUTEX_HELD(&cpu_lock));
   1073 	switch (what) {
   1074 	case CPU_CONFIG:
   1075 		/*
   1076 		 * A new CPU is added. Create an squeue for it but do not bind
   1077 		 * it yet.
   1078 		 */
   1079 		if (cp->cpu_squeue_set == NULL)
   1080 			cp->cpu_squeue_set = ip_squeue_set_create(cp, B_TRUE);
   1081 		break;
   1082 	case CPU_ON:
   1083 	case CPU_INIT:
   1084 	case CPU_CPUPART_IN:
   1085 		if (cp->cpu_squeue_set == NULL) {
   1086 			cp->cpu_squeue_set = ip_squeue_set_create(cp, B_TRUE);
   1087 		}
   1088 		if (ip_squeue_bind)
   1089 			ip_squeue_set_bind(cp->cpu_squeue_set);
   1090 		break;
   1091 	case CPU_UNCONFIG:
   1092 	case CPU_OFF:
   1093 	case CPU_CPUPART_OUT:
   1094 		ASSERT((cp->cpu_squeue_set != NULL) ||
   1095 		    (cp->cpu_flags & CPU_OFFLINE));
   1096 
   1097 		if (cp->cpu_squeue_set != NULL) {
   1098 			ip_squeue_set_unbind(cp->cpu_squeue_set);
   1099 		}
   1100 		break;
   1101 	default:
   1102 		break;
   1103 	}
   1104 	return (0);
   1105 }
   1106 
   1107 /* ARGSUSED */
   1108 static void
   1109 ip_squeue_set_bind(squeue_set_t *sqs)
   1110 {
   1111 	int i;
   1112 	squeue_t *sqp;
   1113 
   1114 	if (!ip_squeue_bind)
   1115 		return;
   1116 
   1117 	mutex_enter(&sqs->sqs_lock);
   1118 	for (i = 0; i < sqs->sqs_size; i++) {
   1119 		sqp = sqs->sqs_list[i];
   1120 		if (sqp->sq_state & SQS_BOUND)
   1121 			continue;
   1122 		squeue_bind(sqp, -1);
   1123 	}
   1124 	mutex_exit(&sqs->sqs_lock);
   1125 }
   1126 
   1127 static void
   1128 ip_squeue_set_unbind(squeue_set_t *sqs)
   1129 {
   1130 	int i;
   1131 	squeue_t *sqp;
   1132 
   1133 	mutex_enter(&sqs->sqs_lock);
   1134 	for (i = 0; i < sqs->sqs_size; i++) {
   1135 		sqp = sqs->sqs_list[i];
   1136 
   1137 		/*
   1138 		 * CPU is going offline. Remove the thread affinity
   1139 		 * for any soft ring threads the sque