Home | History | Annotate | Download | only in ip
      1      0      stevel /*
      2      0      stevel  * CDDL HEADER START
      3      0      stevel  *
      4      0      stevel  * The contents of this file are subject to the terms of the
      5   1503    ericheng  * Common Development and Distribution License (the "License").
      6   1503    ericheng  * You may not use this file except in compliance with the License.
      7      0      stevel  *
      8      0      stevel  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
      9      0      stevel  * or http://www.opensolaris.org/os/licensing.
     10      0      stevel  * See the License for the specific language governing permissions
     11      0      stevel  * and limitations under the License.
     12      0      stevel  *
     13      0      stevel  * When distributing Covered Code, include this CDDL HEADER in each
     14      0      stevel  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
     15      0      stevel  * If applicable, add the following below this CDDL HEADER, with the
     16      0      stevel  * fields enclosed by brackets "[]" replaced with your own identifying
     17      0      stevel  * information: Portions Copyright [yyyy] [name of copyright owner]
     18      0      stevel  *
     19      0      stevel  * CDDL HEADER END
     20      0      stevel  */
     21      0      stevel /*
     22   9210  Thirumalai  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
     23      0      stevel  * Use is subject to license terms.
     24      0      stevel  */
     25      0      stevel 
     26      0      stevel /*
     27      0      stevel  * IP interface to squeues.
     28      0      stevel  *
     29   8275        Eric  * IP uses squeues to force serialization of packets, both incoming and
     30   8275        Eric  * outgoing. Each squeue is associated with a connection instance (conn_t)
     31   8275        Eric  * above, and a soft ring (if enabled) below. Each CPU will have a default
     32   8275        Eric  * squeue for outbound connections, and each soft ring of an interface will
     33   8275        Eric  * have an squeue to which it sends incoming packets. squeues are never
     34   8275        Eric  * destroyed, and if they become unused they are kept around against future
     35   8275        Eric  * needs.
     36      0      stevel  *
     37   8275        Eric  * IP organizes its squeues using squeue sets (squeue_set_t). For each CPU
     38   8275        Eric  * in the system there will be one squeue set, all of whose squeues will be
     39   8275        Eric  * bound to that CPU, plus one additional set known as the unbound set. Sets
     40   8275        Eric  * associated with CPUs will have one default squeue, for outbound
     41   8275        Eric  * connections, and a linked list of squeues used by various NICs for inbound
     42   8275        Eric  * packets. The unbound set also has a linked list of squeues, but no default
     43   8275        Eric  * squeue.
     44   8275        Eric  *
 * When a CPU goes offline its squeue set is destroyed, and all its squeues
 * are moved to the unbound set. When a CPU comes online, a new squeue set is
 * created and the unbound set is searched for a default squeue formerly bound
 * to this CPU. If none is found, a free non-default squeue from the unbound
 * set is used as the default; failing that, a new squeue is created.
     49   8275        Eric  *
     50   8275        Eric  * Two fields of the squeue_t, namely sq_next and sq_set, are owned by IP
     51   8275        Eric  * and not the squeue code. squeue.c will not touch them, and we can modify
     52   8275        Eric  * them without holding the squeue lock because of the guarantee that squeues
     53   8275        Eric  * are never destroyed. ip_squeue locks must be held, however.
     54   8275        Eric  *
     55   8275        Eric  * All the squeue sets are protected by a single lock, the sqset_lock. This
     56   8275        Eric  * is also used to protect the sq_next and sq_set fields of an squeue_t.
     57   8275        Eric  *
     58   8275        Eric  * The lock order is: cpu_lock --> ill_lock --> sqset_lock --> sq_lock
     59      0      stevel  *
     60      0      stevel  * There are two modes of associating connection with squeues. The first mode
     61      0      stevel  * associates each connection with the CPU that creates the connection (either
     62      0      stevel  * during open time or during accept time). The second mode associates each
     63      0      stevel  * connection with a random CPU, effectively distributing load over all CPUs
     64      0      stevel  * and all squeues in the system. The mode is controlled by the
     65      0      stevel  * ip_squeue_fanout variable.
     66      0      stevel  *
 * NOTE: The association between a connection and its squeue, and between a
 * squeue and a CPU, does not mean that the connection is always processed on
 * that CPU and on that CPU only. Any thread calling squeue_enter() may
 * process the connection on whatever CPU it happens to be scheduled on. The
 * squeue to CPU binding is only relevant for the worker thread.
     72      0      stevel  *
     73      0      stevel  * INTERFACE:
     74      0      stevel  *
     75   8275        Eric  * squeue_t *ip_squeue_get(ill_rx_ring_t)
     76      0      stevel  *
     77   8275        Eric  * Returns the squeue associated with an ill receive ring. If the ring is
     78   8275        Eric  * not bound to a CPU, and we're currently servicing the interrupt which
     79   8275        Eric  * generated the packet, then bind the squeue to CPU.
     80      0      stevel  *
     81      0      stevel  *
     82      0      stevel  * DR Notes
     83      0      stevel  * ========
     84      0      stevel  *
     85      0      stevel  * The ip_squeue_init() registers a call-back function with the CPU DR
     86      0      stevel  * subsystem using register_cpu_setup_func(). The call-back function does two
     87      0      stevel  * things:
     88      0      stevel  *
     89      0      stevel  * o When the CPU is going off-line or unconfigured, the worker thread is
     90      0      stevel  *	unbound from the CPU. This allows the CPU unconfig code to move it to
     91      0      stevel  *	another CPU.
     92      0      stevel  *
     93      0      stevel  * o When the CPU is going online, it creates a new squeue for this CPU if
     94      0      stevel  *	necessary and binds the squeue worker thread to this CPU.
     95      0      stevel  *
     96   8275        Eric  * TUNABLES:
     97      0      stevel  *
     98   8275        Eric  * ip_squeue_fanout: used when TCP calls IP_SQUEUE_GET(). If 1, then
     99   8275        Eric  * pick the default squeue from a random CPU, otherwise use our CPU's default
    100   8275        Eric  * squeue.
    101      0      stevel  *
    102   8275        Eric  * ip_squeue_fanout can be accessed and changed using ndd on /dev/tcp or
    103   8275        Eric  * /dev/ip.
    104      0      stevel  *
 * ip_squeue_worker_wait: global value for the sq_wait field for all squeues
 * created. This is the time squeue code waits before waking up the worker
 * thread after queuing a request.
    108      0      stevel  */
    109      0      stevel 
    110      0      stevel #include <sys/types.h>
    111      0      stevel #include <sys/debug.h>
    112      0      stevel #include <sys/kmem.h>
    113      0      stevel #include <sys/cpuvar.h>
    114      0      stevel #include <sys/cmn_err.h>
    115      0      stevel 
    116      0      stevel #include <inet/common.h>
    117      0      stevel #include <inet/ip.h>
    118   8275        Eric #include <netinet/ip6.h>
    119      0      stevel #include <inet/ip_if.h>
    120   8275        Eric #include <inet/ip_ire.h>
    121      0      stevel #include <inet/nd.h>
    122      0      stevel #include <inet/ipclassifier.h>
    123      0      stevel #include <sys/types.h>
    124      0      stevel #include <sys/conf.h>
    125      0      stevel #include <sys/sunddi.h>
    126   2546    carlsonj #include <sys/dlpi.h>
    127      0      stevel #include <sys/squeue_impl.h>
    128   8275        Eric #include <sys/tihdr.h>
    129   8275        Eric #include <inet/udp_impl.h>
    130   8275        Eric #include <sys/strsubr.h>
    131   8275        Eric #include <sys/zone.h>
    132   8275        Eric #include <sys/dld.h>
    133   8130      George #include <sys/atomic.h>
    134      0      stevel 
    135      0      stevel /*
    136   8275        Eric  * List of all created squeue sets. The list and its size are protected by
    137   8275        Eric  * sqset_lock.
    138      0      stevel  */
    139   8275        Eric static squeue_set_t	**sqset_global_list; /* list 0 is the unbound list */
    140   8275        Eric static uint_t		sqset_global_size;
    141   8275        Eric kmutex_t		sqset_lock;
    142   1184      krgopi 
    143      0      stevel static void (*ip_squeue_create_callback)(squeue_t *) = NULL;
    144      0      stevel 
    145      0      stevel /*
    146      0      stevel  * ip_squeue_worker_wait: global value for the sq_wait field for all squeues
    147      0      stevel  *	created. This is the time squeue code waits before waking up the worker
    148      0      stevel  *	thread after queuing a request.
    149      0      stevel  */
    150      0      stevel uint_t ip_squeue_worker_wait = 10;
    151      0      stevel 
    152   8275        Eric static squeue_t *ip_squeue_create(pri_t);
    153   8275        Eric static squeue_set_t *ip_squeue_set_create(processorid_t);
    154      0      stevel static int ip_squeue_cpu_setup(cpu_setup_t, int, void *);
    155   8275        Eric static void ip_squeue_set_move(squeue_t *, squeue_set_t *);
    156   8275        Eric static void ip_squeue_set_destroy(cpu_t *);
    157   4360        meem static void ip_squeue_clean(void *, mblk_t *, void *);
    158      0      stevel 
    159      0      stevel #define	CPU_ISON(c) (c != NULL && CPU_ACTIVE(c) && (c->cpu_flags & CPU_EXISTS))
    160      0      stevel 
    161   8275        Eric static squeue_t *
    162   8275        Eric ip_squeue_create(pri_t pri)
    163   8275        Eric {
    164   8275        Eric 	squeue_t *sqp;
    165   8275        Eric 
    166   8275        Eric 	sqp = squeue_create(ip_squeue_worker_wait, pri);
    167   8275        Eric 	ASSERT(sqp != NULL);
    168   8275        Eric 	if (ip_squeue_create_callback != NULL)
    169   8275        Eric 		ip_squeue_create_callback(sqp);
    170   8275        Eric 	return (sqp);
    171   8275        Eric }
    172   8275        Eric 
    173      0      stevel /*
    174   8275        Eric  * Create a new squeue_set. If id == -1, then we're creating the unbound set,
    175   8275        Eric  * which should only happen once when we are first initialized. Otherwise id
    176   8275        Eric  * is the id of the CPU that needs a set, either because we are initializing
    177   8275        Eric  * or because the CPU has come online.
    178   8275        Eric  *
    179   8275        Eric  * If id != -1, then we need at a minimum to provide a default squeue for the
    180   8275        Eric  * new set. We search the unbound set for candidates, and if none are found we
    181   8275        Eric  * create a new one.
    182      0      stevel  */
    183      0      stevel static squeue_set_t *
    184   8275        Eric ip_squeue_set_create(processorid_t id)
    185      0      stevel {
    186      0      stevel 	squeue_set_t	*sqs;
    187   8275        Eric 	squeue_set_t	*src = sqset_global_list[0];
    188   8275        Eric 	squeue_t	**lastsqp, *sq;
    189   8275        Eric 	squeue_t	**defaultq_lastp = NULL;
    190      0      stevel 
    191   8275        Eric 	sqs = kmem_zalloc(sizeof (squeue_set_t), KM_SLEEP);
    192   8275        Eric 	sqs->sqs_cpuid = id;
    193      0      stevel 
    194   8275        Eric 	if (id == -1) {
    195   8275        Eric 		ASSERT(sqset_global_size == 0);
    196   8275        Eric 		sqset_global_list[0] = sqs;
    197   8275        Eric 		sqset_global_size = 1;
    198   8275        Eric 		return (sqs);
    199      0      stevel 	}
    200      0      stevel 
    201   8275        Eric 	/*
    202   8275        Eric 	 * When we create an squeue set id != -1, we need to give it a
    203   8275        Eric 	 * default squeue, in order to support fanout of conns across
    204   8275        Eric 	 * CPUs. Try to find a former default squeue that matches this
    205   8275        Eric 	 * cpu id on the unbound squeue set. If no such squeue is found,
    206   9979  Thirumalai 	 * find some non-default TCP squeue that is free. If still no such
    207   8275        Eric 	 * candidate is found, create a new squeue.
    208   8275        Eric 	 */
    209      0      stevel 
    210   8275        Eric 	ASSERT(MUTEX_HELD(&cpu_lock));
    211   8275        Eric 	mutex_enter(&sqset_lock);
    212   8275        Eric 	lastsqp = &src->sqs_head;
    213      0      stevel 
    214   8275        Eric 	while (*lastsqp) {
    215   8275        Eric 		if ((*lastsqp)->sq_bind == id &&
    216   8275        Eric 		    (*lastsqp)->sq_state & SQS_DEFAULT) {
    217   9979  Thirumalai 			/*
    218   9979  Thirumalai 			 * Exact match. Former default squeue of cpu 'id'
    219   9979  Thirumalai 			 */
    220   9979  Thirumalai 			ASSERT(!((*lastsqp)->sq_state & SQS_ILL_BOUND));
    221   8275        Eric 			defaultq_lastp = lastsqp;
    222   8275        Eric 			break;
    223   8275        Eric 		}
    224   8275        Eric 		if (defaultq_lastp == NULL &&
    225   9979  Thirumalai 		    !((*lastsqp)->sq_state & (SQS_ILL_BOUND | SQS_DEFAULT))) {
    226   9979  Thirumalai 			/*
    227   9979  Thirumalai 			 * A free non-default TCP squeue
    228   9979  Thirumalai 			 */
    229   8275        Eric 			defaultq_lastp = lastsqp;
    230   8275        Eric 		}
    231   8275        Eric 		lastsqp = &(*lastsqp)->sq_next;
    232   9979  Thirumalai 	}
    233      0      stevel 
    234   9979  Thirumalai 	if (defaultq_lastp != NULL) {
    235   8275        Eric 		/* Remove from src set and set SQS_DEFAULT */
    236   8275        Eric 		sq = *defaultq_lastp;
    237   8275        Eric 		*defaultq_lastp = sq->sq_next;
    238   8275        Eric 		sq->sq_next = NULL;
    239   8275        Eric 		if (!(sq->sq_state & SQS_DEFAULT)) {
    240   8275        Eric 			mutex_enter(&sq->sq_lock);
    241   8275        Eric 			sq->sq_state |= SQS_DEFAULT;
    242   8275        Eric 			mutex_exit(&sq->sq_lock);
    243   8275        Eric 		}
    244   8275        Eric 	} else {
    245   8275        Eric 		sq = ip_squeue_create(SQUEUE_DEFAULT_PRIORITY);
    246   8275        Eric 		sq->sq_state |= SQS_DEFAULT;
    247      0      stevel 	}
    248      0      stevel 
    249   8275        Eric 	sq->sq_set = sqs;
    250   8275        Eric 	sqs->sqs_default = sq;
    251   8275        Eric 	squeue_bind(sq, id); /* this locks squeue mutex */
    252      0      stevel 
    253   8275        Eric 	ASSERT(sqset_global_size <= NCPU);
    254      0      stevel 	sqset_global_list[sqset_global_size++] = sqs;
    255   8275        Eric 	mutex_exit(&sqset_lock);
    256      0      stevel 	return (sqs);
    257   8275        Eric }
    258   8275        Eric 
    259   8275        Eric /*
    260   8275        Eric  * Called by ill_ring_add() to find an squeue to associate with a new ring.
    261   8275        Eric  */
    262   8275        Eric 
    263   8275        Eric squeue_t *
    264   8275        Eric ip_squeue_getfree(pri_t pri)
    265   8275        Eric {
    266   8275        Eric 	squeue_set_t	*sqs = sqset_global_list[0];
    267   8275        Eric 	squeue_t	*sq;
    268   8275        Eric 
    269   8275        Eric 	mutex_enter(&sqset_lock);
    270   8275        Eric 	for (sq = sqs->sqs_head; sq != NULL; sq = sq->sq_next) {
    271   8275        Eric 		/*
    272   9979  Thirumalai 		 * Select a non-default TCP squeue that is free i.e. not
    273   9979  Thirumalai 		 * bound to any ill.
    274   8275        Eric 		 */
    275   8275        Eric 		if (!(sq->sq_state & (SQS_DEFAULT | SQS_ILL_BOUND)))
    276   8275        Eric 			break;
    277   8275        Eric 	}
    278   8275        Eric 
    279   8275        Eric 	if (sq == NULL) {
    280   8275        Eric 		sq = ip_squeue_create(pri);
    281   8275        Eric 		sq->sq_set = sqs;
    282   8275        Eric 		sq->sq_next = sqs->sqs_head;
    283   8275        Eric 		sqs->sqs_head = sq;
    284   8275        Eric 	}
    285   8275        Eric 
    286   8275        Eric 	ASSERT(!(sq->sq_state & (SQS_POLL_THR_CONTROL | SQS_WORKER_THR_CONTROL |
    287   8275        Eric 	    SQS_POLL_CLEANUP_DONE | SQS_POLL_QUIESCE_DONE |
    288   8275        Eric 	    SQS_POLL_THR_QUIESCED)));
    289   8275        Eric 
    290   8275        Eric 	mutex_enter(&sq->sq_lock);
    291   8275        Eric 	sq->sq_state |= SQS_ILL_BOUND;
    292   8275        Eric 	mutex_exit(&sq->sq_lock);
    293   8275        Eric 	mutex_exit(&sqset_lock);
    294   8275        Eric 
    295   8275        Eric 	if (sq->sq_priority != pri) {
    296   8275        Eric 		thread_lock(sq->sq_worker);
    297   8275        Eric 		(void) thread_change_pri(sq->sq_worker, pri, 0);
    298   8275        Eric 		thread_unlock(sq->sq_worker);
    299   8275        Eric 
    300   8275        Eric 		thread_lock(sq->sq_poll_thr);
    301   8275        Eric 		(void) thread_change_pri(sq->sq_poll_thr, pri, 0);
    302   8275        Eric 		thread_unlock(sq->sq_poll_thr);
    303   8275        Eric 
    304   8275        Eric 		sq->sq_priority = pri;
    305   8275        Eric 	}
    306   8275        Eric 	return (sq);
    307      0      stevel }
    308      0      stevel 
    309      0      stevel /*
    310      0      stevel  * Initialize IP squeues.
    311      0      stevel  */
    312      0      stevel void
    313      0      stevel ip_squeue_init(void (*callback)(squeue_t *))
    314      0      stevel {
    315      0      stevel 	int i;
    316   8275        Eric 	squeue_set_t	*sqs;
    317      0      stevel 
    318      0      stevel 	ASSERT(sqset_global_list == NULL);
    319      0      stevel 
    320      0      stevel 	ip_squeue_create_callback = callback;
    321      0      stevel 	squeue_init();
    322   8275        Eric 	mutex_init(&sqset_lock, NULL, MUTEX_DEFAULT, NULL);
    323      0      stevel 	sqset_global_list =
    324   8275        Eric 	    kmem_zalloc(sizeof (squeue_set_t *) * (NCPU+1), KM_SLEEP);
    325      0      stevel 	sqset_global_size = 0;
    326   8275        Eric 	/*
    327   8275        Eric 	 * We are called at system boot time and we don't
    328   8275        Eric 	 * expect memory allocation failure.
    329   8275        Eric 	 */
    330   8275        Eric 	sqs = ip_squeue_set_create(-1);
    331   8275        Eric 	ASSERT(sqs != NULL);
    332   8275        Eric 
    333      0      stevel 	mutex_enter(&cpu_lock);
    334      0      stevel 	/* Create squeue for each active CPU available */
    335      0      stevel 	for (i = 0; i < NCPU; i++) {
    336   8275        Eric 		cpu_t *cp = cpu_get(i);
    337      0      stevel 		if (CPU_ISON(cp) && cp->cpu_squeue_set == NULL) {
    338   8275        Eric 			/*
    339   8275        Eric 			 * We are called at system boot time and we don't
    340   8275        Eric 			 * expect memory allocation failure then
    341   8275        Eric 			 */
    342   8275        Eric 			cp->cpu_squeue_set = ip_squeue_set_create(cp->cpu_id);
    343   8275        Eric 			ASSERT(cp->cpu_squeue_set != NULL);
    344      0      stevel 		}
    345      0      stevel 	}
    346      0      stevel 
    347      0      stevel 	register_cpu_setup_func(ip_squeue_cpu_setup, NULL);
    348      0      stevel 	mutex_exit(&cpu_lock);
    349      0      stevel }
    350      0      stevel 
    351      0      stevel /*
    352   8275        Eric  * Get a default squeue, either from the current CPU or a CPU derived by hash
    353   8275        Eric  * from the index argument, depending upon the setting of ip_squeue_fanout.
    354      0      stevel  */
    355      0      stevel squeue_t *
    356      0      stevel ip_squeue_random(uint_t index)
    357      0      stevel {
    358   8275        Eric 	squeue_set_t *sqs = NULL;
    359   8275        Eric 	squeue_t *sq;
    360      0      stevel 
    361   8275        Eric 	/*
    362   8275        Eric 	 * The minimum value of sqset_global_size is 2, one for the unbound
    363   8275        Eric 	 * squeue set and another for the squeue set of the zeroth CPU.
    364   8275        Eric 	 * Even though the value could be changing, it can never go below 2,
    365   8275        Eric 	 * so the assert does not need the lock protection.
    366   8275        Eric 	 */
    367   8275        Eric 	ASSERT(sqset_global_size > 1);
    368   8275        Eric 
    369   8275        Eric 	/* Protect against changes to sqset_global_list */
    370   8275        Eric 	mutex_enter(&sqset_lock);
    371   8275        Eric 
    372   8275        Eric 	if (!ip_squeue_fanout)
    373   8275        Eric 		sqs = CPU->cpu_squeue_set;
    374   8275        Eric 
    375   8275        Eric 	/*
    376   8275        Eric 	 * sqset_global_list[0] corresponds to the unbound squeue set.
    377   8275        Eric 	 * The computation below picks a set other than the unbound set.
    378   8275        Eric 	 */
    379   8275        Eric 	if (sqs == NULL)
    380   8275        Eric 		sqs = sqset_global_list[(index % (sqset_global_size - 1)) + 1];
    381   8275        Eric 	sq = sqs->sqs_default;
    382   8275        Eric 
    383   8275        Eric 	mutex_exit(&sqset_lock);
    384   8275        Eric 	ASSERT(sq);
    385   8275        Eric 	return (sq);
    386      0      stevel }
    387      0      stevel 
    388   8275        Eric /*
    389   8275        Eric  * Move squeue from its current set to newset. Not used for default squeues.
    390   8275        Eric  * Bind or unbind the worker thread as appropriate.
    391   8275        Eric  */
    392   8275        Eric 
    393   4360        meem static void
    394   8275        Eric ip_squeue_set_move(squeue_t *sq, squeue_set_t *newset)
    395      0      stevel {
    396   8275        Eric 	squeue_set_t	*set;
    397   8275        Eric 	squeue_t	**lastsqp;
    398   8275        Eric 	processorid_t	cpuid = newset->sqs_cpuid;
    399      0      stevel 
    400   8275        Eric 	ASSERT(!(sq->sq_state & SQS_DEFAULT));
    401   8275        Eric 	ASSERT(!MUTEX_HELD(&sq->sq_lock));
    402   8275        Eric 	ASSERT(MUTEX_HELD(&sqset_lock));
    403      0      stevel 
    404   8275        Eric 	set = sq->sq_set;
    405   8275        Eric 	if (set == newset)
    406   8275        Eric 		return;
    407   8275        Eric 
    408   8275        Eric 	lastsqp = &set->sqs_head;
    409   8275        Eric 	while (*lastsqp != sq)
    410   8275        Eric 		lastsqp = &(*lastsqp)->sq_next;
    411   8275        Eric 
    412   8275        Eric 	*lastsqp = sq->sq_next;
    413   8275        Eric 	sq->sq_next = newset->sqs_head;
    414   8275        Eric 	newset->sqs_head = sq;
    415   8275        Eric 	sq->sq_set = newset;
    416   8275        Eric 	if (cpuid == -1)
    417   8275        Eric 		squeue_unbind(sq);
    418   8275        Eric 	else
    419   8275        Eric 		squeue_bind(sq, cpuid);
    420   8275        Eric }
    421   8275        Eric 
    422   8275        Eric /*
    423   8275        Eric  * Move squeue from its current set to cpuid's set and bind to cpuid.
    424   8275        Eric  */
    425   8275        Eric 
    426   8275        Eric int
    427   8275        Eric ip_squeue_cpu_move(squeue_t *sq, processorid_t cpuid)
    428   8275        Eric {
    429   8275        Eric 	cpu_t *cpu;
    430   8275        Eric 	squeue_set_t *set;
    431   8275        Eric 
    432   8275        Eric 	if (sq->sq_state & SQS_DEFAULT)
    433   8275        Eric 		return (-1);
    434   8275        Eric 
    435   8275        Eric 	ASSERT(MUTEX_HELD(&cpu_lock));
    436   8275        Eric 
    437   8275        Eric 	cpu = cpu_get(cpuid);
    438   8275        Eric 	if (!CPU_ISON(cpu))
    439   8275        Eric 		return (-1);
    440   8275        Eric 
    441   8275        Eric 	mutex_enter(&sqset_lock);
    442   8275        Eric 	set = cpu->cpu_squeue_set;
    443   8275        Eric 	if (set != NULL)
    444   8275        Eric 		ip_squeue_set_move(sq, set);
    445   8275        Eric 	mutex_exit(&sqset_lock);
    446   8275        Eric 	return ((set == NULL) ? -1 : 0);
    447   8275        Eric }
    448   8275        Eric 
    449   8275        Eric /*
    450   8275        Eric  * The mac layer is calling, asking us to move an squeue to a
    451   8275        Eric  * new CPU. This routine is called with cpu_lock held.
    452   8275        Eric  */
    453   8275        Eric void
    454   8275        Eric ip_squeue_bind_ring(ill_t *ill, ill_rx_ring_t *rx_ring, processorid_t cpuid)
    455   8275        Eric {
    456   8275        Eric 	ASSERT(ILL_MAC_PERIM_HELD(ill));
    457   8275        Eric 	ASSERT(rx_ring->rr_ill == ill);
    458   8275        Eric 
    459   8275        Eric 	mutex_enter(&ill->ill_lock);
    460   8275        Eric 	if (rx_ring->rr_ring_state == RR_FREE ||
    461   8275        Eric 	    rx_ring->rr_ring_state == RR_FREE_INPROG) {
    462   8275        Eric 		mutex_exit(&ill->ill_lock);
    463      0      stevel 		return;
    464      0      stevel 	}
    465      0      stevel 
    466   8275        Eric 	if (ip_squeue_cpu_move(rx_ring->rr_sqp, cpuid) != -1)
    467   8275        Eric 		rx_ring->rr_ring_state = RR_SQUEUE_BOUND;
    468   8275        Eric 
    469   8275        Eric 	mutex_exit(&ill->ill_lock);
    470   8275        Eric }
    471   8275        Eric 
    472   8275        Eric void *
    473   8275        Eric ip_squeue_add_ring(ill_t *ill, void *mrp)
    474   8275        Eric {
    475   8275        Eric 	mac_rx_fifo_t		*mrfp = (mac_rx_fifo_t *)mrp;
    476   8275        Eric 	ill_rx_ring_t		*rx_ring, *ring_tbl;
    477   8275        Eric 	int			ip_rx_index;
    478   8275        Eric 	squeue_t		*sq = NULL;
    479   8275        Eric 	pri_t			pri;
    480   8275        Eric 
    481   8275        Eric 	ASSERT(ILL_MAC_PERIM_HELD(ill));
    482   8275        Eric 	ASSERT(mrfp->mrf_type == MAC_RX_FIFO);
    483   8275        Eric 	ASSERT(ill->ill_dld_capab != NULL);
    484   8275        Eric 
    485   8275        Eric 	ring_tbl = ill->ill_dld_capab->idc_poll.idp_ring_tbl;
    486   8275        Eric 
    487   8275        Eric 	mutex_enter(&ill->ill_lock);
    488   8275        Eric 	for (ip_rx_index = 0; ip_rx_index < ILL_MAX_RINGS; ip_rx_index++) {
    489   8275        Eric 		rx_ring = &ring_tbl[ip_rx_index];
    490   8275        Eric 		if (rx_ring->rr_ring_state == RR_FREE)
    491   8275        Eric 			break;
    492   8275        Eric 	}
    493   8275        Eric 
    494   8275        Eric 	if (ip_rx_index == ILL_MAX_RINGS) {
    495   8275        Eric 		/*
    496   8275        Eric 		 * We ran out of ILL_MAX_RINGS worth rx_ring structures. If
    497   8275        Eric 		 * we have devices which can overwhelm this limit,
    498   8275        Eric 		 * ILL_MAX_RING should be made configurable. Meanwhile it
    499   8275        Eric 		 * cause no panic because driver will pass ip_input a NULL
    500   8275        Eric 		 * handle which will make IP allocate the default squeue and
    501   8275        Eric 		 * Polling mode will not be used for this ring.
    502   8275        Eric 		 */
    503   8275        Eric 		cmn_err(CE_NOTE,
    504   8275        Eric 		    "Reached maximum number of receiving rings (%d) for %s\n",
    505   8275        Eric 		    ILL_MAX_RINGS, ill->ill_name);
    506   8275        Eric 		mutex_exit(&ill->ill_lock);
    507   8275        Eric 		return (NULL);
    508   8275        Eric 	}
    509   8275        Eric 
    510   8275        Eric 	bzero(rx_ring, sizeof (ill_rx_ring_t));
    511   8275        Eric 	rx_ring->rr_rx = (ip_mac_rx_t)mrfp->mrf_receive;
    512   8275        Eric 	/* XXX: Hard code it to tcp accept for now */
    513   8275        Eric 	rx_ring->rr_ip_accept = (ip_accept_t)ip_accept_tcp;
    514   8275        Eric 
    515   8275        Eric 	rx_ring->rr_intr_handle = mrfp->mrf_intr_handle;
    516   8275        Eric 	rx_ring->rr_intr_enable = (ip_mac_intr_enable_t)mrfp->mrf_intr_enable;
    517   8275        Eric 	rx_ring->rr_intr_disable =
    518   8275        Eric 	    (ip_mac_intr_disable_t)mrfp->mrf_intr_disable;
    519   8275        Eric 	rx_ring->rr_rx_handle = mrfp->mrf_rx_arg;
    520   8275        Eric 	rx_ring->rr_ill = ill;
    521   8275        Eric 
    522   8275        Eric 	pri = mrfp->mrf_flow_priority;
    523   8275        Eric 
    524   8275        Eric 	sq = ip_squeue_getfree(pri);
    525   8275        Eric 
    526   8275        Eric 	mutex_enter(&sq->sq_lock);
    527   8275        Eric 	sq->sq_rx_ring = rx_ring;
    528   8275        Eric 	rx_ring->rr_sqp = sq;
    529   8275        Eric 
    530   8275        Eric 	sq->sq_state |= SQS_POLL_CAPAB;
    531   8275        Eric 
    532   8275        Eric 	rx_ring->rr_ring_state = RR_SQUEUE_UNBOUND;
    533   8275        Eric 	sq->sq_ill = ill;
    534   8275        Eric 	mutex_exit(&sq->sq_lock);
    535   8275        Eric 	mutex_exit(&ill->ill_lock);
    536   8275        Eric 
    537   8275        Eric 	DTRACE_PROBE4(ill__ring__add, char *, ill->ill_name, ill_t *, ill, int,
    538   8275        Eric 	    ip_rx_index, void *, mrfp->mrf_rx_arg);
    539   8275        Eric 
    540   8275        Eric 	/* Assign the squeue to the specified CPU as well */
    541   8275        Eric 	mutex_enter(&cpu_lock);
    542   8275        Eric 	(void) ip_squeue_bind_ring(ill, rx_ring, mrfp->mrf_cpu_id);
    543   8275        Eric 	mutex_exit(&cpu_lock);
    544   8275        Eric 
    545   8275        Eric 	return (rx_ring);
    546   8275        Eric }
    547   8275        Eric 
    548   8275        Eric /*
    549   8275        Eric  * sanitize the squeue etc. Some of the processing
    550   8275        Eric  * needs to be done from inside the perimeter.
    551   8275        Eric  */
    552   8275        Eric void
    553   8275        Eric ip_squeue_clean_ring(ill_t *ill, ill_rx_ring_t *rx_ring)
    554   8275        Eric {
    555   8275        Eric 	squeue_t *sqp;
    556   8275        Eric 
    557   8275        Eric 	ASSERT(ILL_MAC_PERIM_HELD(ill));
    558   8275        Eric 	ASSERT(rx_ring != NULL);
    559   8275        Eric 
    560   8275        Eric 	/* Just clean one squeue */
    561   8275        Eric 	mutex_enter(&ill->ill_lock);
    562   8275        Eric 	if (rx_ring->rr_ring_state == RR_FREE) {
    563   8275        Eric 		mutex_exit(&ill->ill_lock);
    564   8275        Eric 		return;
    565   8275        Eric 	}
    566   8275        Eric 	rx_ring->rr_ring_state = RR_FREE_INPROG;
    567   8275        Eric 	sqp = rx_ring->rr_sqp;
    568   8275        Eric 
    569      0      stevel 	mutex_enter(&sqp->sq_lock);
    570   8275        Eric 	sqp->sq_state |= SQS_POLL_CLEANUP;
    571   8275        Eric 	cv_signal(&sqp->sq_worker_cv);
    572   8275        Eric 	mutex_exit(&ill->ill_lock);
    573   8275        Eric 	while (!(sqp->sq_state & SQS_POLL_CLEANUP_DONE))
    574   8275        Eric 		cv_wait(&sqp->sq_ctrlop_done_cv, &sqp->sq_lock);
    575   9979  Thirumalai 	sqp->sq_state &= ~SQS_POLL_CLEANUP_DONE;
    576   8275        Eric 
    577   8275        Eric 	ASSERT(!(sqp->sq_state & (SQS_POLL_THR_CONTROL |
    578   8275        Eric 	    SQS_WORKER_THR_CONTROL | SQS_POLL_QUIESCE_DONE |
    579   8275        Eric 	    SQS_POLL_THR_QUIESCED)));
    580   8275        Eric 
    581   8275        Eric 	cv_signal(&sqp->sq_worker_cv);
    582      0      stevel 	mutex_exit(&sqp->sq_lock);
    583      0      stevel 
    584   8275        Eric 	/*
    585   9979  Thirumalai 	 * Move the squeue to sqset_global_list[0] which holds the set of
    586   9979  Thirumalai 	 * squeues not bound to any cpu. Note that the squeue is still
    587   9979  Thirumalai 	 * considered bound to an ill as long as SQS_ILL_BOUND is set.
    588   8275        Eric 	 */
    589   8275        Eric 	mutex_enter(&sqset_lock);
    590   8275        Eric 	ip_squeue_set_move(sqp, sqset_global_list[0]);
    591   8275        Eric 	mutex_exit(&sqset_lock);
    592   9979  Thirumalai 
    593   9979  Thirumalai 	/*
    594   9979  Thirumalai 	 * CPU going offline can also trigger a move of the squeue to the
    595   9979  Thirumalai 	 * unbound set sqset_global_list[0]. However the squeue won't be
    596   9979  Thirumalai 	 * recycled for the next use as long as the SQS_ILL_BOUND flag
    597   9979  Thirumalai 	 * is set. Hence we clear the SQS_ILL_BOUND flag only towards the
    598   9979  Thirumalai 	 * end after the move.
    599   9979  Thirumalai 	 */
    600   9979  Thirumalai 	mutex_enter(&sqp->sq_lock);
    601   9979  Thirumalai 	sqp->sq_state &= ~SQS_ILL_BOUND;
    602   9979  Thirumalai 	mutex_exit(&sqp->sq_lock);
    603      0      stevel 
    604      0      stevel 	mutex_enter(&ill->ill_lock);
    605   8275        Eric 	rx_ring->rr_ring_state = RR_FREE;
    606      0      stevel 	mutex_exit(&ill->ill_lock);
    607   4360        meem }
    608   4360        meem 
    609   4360        meem /*
    610   8275        Eric  * Stop the squeue from polling. This needs to be done
    611   8275        Eric  * from inside the perimeter.
    612   4360        meem  */
    613   8275        Eric void
    614   8275        Eric ip_squeue_quiesce_ring(ill_t *ill, ill_rx_ring_t *rx_ring)
    615   4360        meem {
    616   4360        meem 	squeue_t *sqp;
    617   4360        meem 
    618   8275        Eric 	ASSERT(ILL_MAC_PERIM_HELD(ill));
    619   4360        meem 	ASSERT(rx_ring != NULL);
    620   4360        meem 
    621   8275        Eric 	sqp = rx_ring->rr_sqp;
    622   8275        Eric 	mutex_enter(&sqp->sq_lock);
    623   8275        Eric 	sqp->sq_state |= SQS_POLL_QUIESCE;
    624   8275        Eric 	cv_signal(&sqp->sq_worker_cv);
    625   8275        Eric 	while (!(sqp->sq_state & SQS_POLL_QUIESCE_DONE))
    626   8275        Eric 		cv_wait(&sqp->sq_ctrlop_done_cv, &sqp->sq_lock);
    627   8275        Eric 
    628   8275        Eric 	mutex_exit(&sqp->sq_lock);
    629   8275        Eric }
    630   8275        Eric 
    631   8275        Eric /*
    632   8275        Eric  * Restart polling etc. Needs to be inside the perimeter to
    633   8275        Eric  * prevent races.
    634   8275        Eric  */
    635   8275        Eric void
    636   8275        Eric ip_squeue_restart_ring(ill_t *ill, ill_rx_ring_t *rx_ring)
    637   8275        Eric {
    638   8275        Eric 	squeue_t *sqp;
    639   8275        Eric 
    640   8275        Eric 	ASSERT(ILL_MAC_PERIM_HELD(ill));
    641   8275        Eric 	ASSERT(rx_ring != NULL);
    642   8275        Eric 
    643   8275        Eric 	sqp = rx_ring->rr_sqp;
    644   8275        Eric 	mutex_enter(&sqp->sq_lock);
    645   4360        meem 	/*
    646   8275        Eric 	 * Handle change in number of rings between the quiesce and
    647   8275        Eric 	 * restart operations by checking for a previous quiesce before
    648   8275        Eric 	 * attempting a restart.
    649   4360        meem 	 */
    650   8275        Eric 	if (!(sqp->sq_state & SQS_POLL_QUIESCE_DONE)) {
    651   8275        Eric 		mutex_exit(&sqp->sq_lock);
    652   4360        meem 		return;
    653   4360        meem 	}
    654   8275        Eric 	sqp->sq_state |= SQS_POLL_RESTART;
    655   8275        Eric 	cv_signal(&sqp->sq_worker_cv);
    656   8275        Eric 	while (!(sqp->sq_state & SQS_POLL_RESTART_DONE))
    657   8275        Eric 		cv_wait(&sqp->sq_ctrlop_done_cv, &sqp->sq_lock);
    658   8275        Eric 	sqp->sq_state &= ~SQS_POLL_RESTART_DONE;
    659   8275        Eric 	mutex_exit(&sqp->sq_lock);
    660   4360        meem }
    661   4360        meem 
    662   8275        Eric /*
    663   8275        Eric  * sanitize all squeues associated with the ill.
    664   8275        Eric  */
    665   4360        meem void
    666   4360        meem ip_squeue_clean_all(ill_t *ill)
    667   4360        meem {
    668   4360        meem 	int idx;
    669   8275        Eric 	ill_rx_ring_t	*rx_ring;
    670   4360        meem 
    671   4360        meem 	for (idx = 0; idx < ILL_MAX_RINGS; idx++) {
    672   8275        Eric 		rx_ring = &ill->ill_dld_capab->idc_poll.idp_ring_tbl[idx];
    673   8275        Eric 		ip_squeue_clean_ring(ill, rx_ring);
    674   4360        meem 	}
    675   1184      krgopi }
    676   1184      krgopi 
    677   1184      krgopi /*
    678   8275        Eric  * Used by IP to get the squeue associated with a ring. If the squeue isn't
    679   8275        Eric  * yet bound to a CPU, and we're being called directly from the NIC's
    680   8275        Eric  * interrupt, then we know what CPU we want to assign the squeue to, so
    681   8275        Eric  * dispatch that task to a taskq.
    682      0      stevel  */
    683      0      stevel squeue_t *
    684      0      stevel ip_squeue_get(ill_rx_ring_t *ill_rx_ring)
    685      0      stevel {
    686      0      stevel 	squeue_t 	*sqp;
    687      0      stevel 
    688   8275        Eric 	if ((ill_rx_ring == NULL) || ((sqp = ill_rx_ring->rr_sqp) == NULL))
    689  11066      rafael 		return (IP_SQUEUE_GET(CPU_PSEUDO_RANDOM()));
    690      0      stevel 
    691   8275        Eric 	return (sqp);
    692      0      stevel }
    693      0      stevel 
    694      0      stevel /*
    695   8275        Eric  * Called when a CPU goes offline. It's squeue_set_t is destroyed, and all
    696   8275        Eric  * squeues are unboudn and moved to the unbound set.
    697      0      stevel  */
    698   8275        Eric static void
    699   8275        Eric ip_squeue_set_destroy(cpu_t *cpu)
    700   8275        Eric {
    701   8275        Eric 	int i;
    702   8275        Eric 	squeue_t *sqp, *lastsqp = NULL;
    703   8275        Eric 	squeue_set_t *sqs, *unbound = sqset_global_list[0];
    704      0      stevel 
    705   8275        Eric 	mutex_enter(&sqset_lock);
    706   8275        Eric 	if ((sqs = cpu->cpu_squeue_set) == NULL) {
    707   8275        Eric 		mutex_exit(&sqset_lock);
    708   8275        Eric 		return;
    709      0      stevel 	}
    710      0      stevel 
    711   8275        Eric 	/* Move all squeues to unbound set */
    712      0      stevel 
    713   8275        Eric 	for (sqp = sqs->sqs_head; sqp; lastsqp = sqp, sqp = sqp->sq_next) {
    714   8275        Eric 		squeue_unbind(sqp);
    715   8275        Eric 		sqp->sq_set = unbound;
    716   8275        Eric 	}
    717   8275        Eric 	if (sqs->sqs_head) {
    718   8275        Eric 		lastsqp->sq_next = unbound->sqs_head;
    719   8275        Eric 		unbound->sqs_head = sqs->sqs_head;
    720   8275        Eric 	}
    721      0      stevel 
    722   8275        Eric 	/* Also move default squeue to unbound set */
    723      0      stevel 
    724   8275        Eric 	sqp = sqs->sqs_default;
    725   9979  Thirumalai 	ASSERT(sqp != NULL);
    726   8275        Eric 	ASSERT((sqp->sq_state & (SQS_DEFAULT|SQS_ILL_BOUND)) == SQS_DEFAULT);
    727      0      stevel 
    728   8275        Eric 	sqp->sq_next = unbound->sqs_head;
    729   8275        Eric 	unbound->sqs_head = sqp;
    730   8275        Eric 	squeue_unbind(sqp);
    731   8275        Eric 	sqp->sq_set = unbound;
    732      0      stevel 
    733   8275        Eric 	for (i = 1; i < sqset_global_size; i++)
    734   8275        Eric 		if (sqset_global_list[i] == sqs)
    735   8275        Eric 			break;
    736      0      stevel 
    737   8275        Eric 	ASSERT(i < sqset_global_size);
    738   8275        Eric 	sqset_global_list[i] = sqset_global_list[sqset_global_size - 1];
    739   8275        Eric 	sqset_global_list[sqset_global_size - 1] = NULL;
    740   8275        Eric 	sqset_global_size--;
    741   8275        Eric 
    742   8275        Eric 	mutex_exit(&sqset_lock);
    743   8275        Eric 	kmem_free(sqs, sizeof (*sqs));
    744      0      stevel }
    745      0      stevel 
    746      0      stevel /*
    747      0      stevel  * Reconfiguration callback
    748      0      stevel  */
    749      0      stevel /* ARGSUSED */
    750      0      stevel static int
    751      0      stevel ip_squeue_cpu_setup(cpu_setup_t what, int id, void *arg)
    752      0      stevel {
    753   8275        Eric 	cpu_t *cp = cpu_get(id);
    754      0      stevel 
    755      0      stevel 	ASSERT(MUTEX_HELD(&cpu_lock));
    756      0      stevel 	switch (what) {
    757    405       akolb 	case CPU_CONFIG:
    758      0      stevel 	case CPU_ON:
    759      0      stevel 	case CPU_INIT:
    760      0      stevel 	case CPU_CPUPART_IN:
    761   9210  Thirumalai 		if (CPU_ISON(cp) && cp->cpu_squeue_set == NULL)
    762   8275        Eric 			cp->cpu_squeue_set = ip_squeue_set_create(cp->cpu_id);
    763      0      stevel 		break;
    764      0      stevel 	case CPU_UNCONFIG:
    765      0      stevel 	case CPU_OFF:
    766      0      stevel 	case CPU_CPUPART_OUT:
    767      0      stevel 		if (cp->cpu_squeue_set != NULL) {
    768   8275        Eric 			ip_squeue_set_destroy(cp);
    769   8275        Eric 			cp->cpu_squeue_set = NULL;
    770      0      stevel 		}
    771      0      stevel 		break;
    772      0      stevel 	default:
    773      0      stevel 		break;
    774      0      stevel 	}
    775      0      stevel 	return (0);
    776      0      stevel }
    777