Home | History | Annotate | Download | only in io
      1 /*
      2  * CDDL HEADER START
      3  *
      4  * The contents of this file are subject to the terms of the
      5  * Common Development and Distribution License (the "License").
      6  * You may not use this file except in compliance with the License.
      7  *
      8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
      9  * or http://www.opensolaris.org/os/licensing.
     10  * See the License for the specific language governing permissions
     11  * and limitations under the License.
     12  *
     13  * When distributing Covered Code, include this CDDL HEADER in each
     14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
     15  * If applicable, add the following below this CDDL HEADER, with the
     16  * fields enclosed by brackets "[]" replaced with your own identifying
     17  * information: Portions Copyright [yyyy] [name of copyright owner]
     18  *
     19  * CDDL HEADER END
     20  */
     21 
     22 /*
     23  * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.
     24  * Use is subject to license terms.
     25  */
     26 
     27 /*
     28  *
     29  * Copyright (c) 2004 Christian Limpach.
     30  * All rights reserved.
     31  *
     32  * Redistribution and use in source and binary forms, with or without
     33  * modification, are permitted provided that the following conditions
     34  * are met:
     35  * 1. Redistributions of source code must retain the above copyright
     36  *    notice, this list of conditions and the following disclaimer.
     37  * 2. Redistributions in binary form must reproduce the above copyright
     38  *    notice, this list of conditions and the following disclaimer in the
     39  *    documentation and/or other materials provided with the distribution.
     40  * 3. This section intentionally left blank.
     41  * 4. The name of the author may not be used to endorse or promote products
     42  *    derived from this software without specific prior written permission.
     43  *
     44  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
     45  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
     46  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
     47  * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
     48  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
     49  * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
     50  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
     51  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
     52  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
     53  * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
     54  */
     55 /*
     56  * Section 3 of the above license was updated in response to bug 6379571.
     57  */
     58 
     59 /*
     60  * xnf.c - GLDv3 network driver for domU.
     61  */
     62 
     63 /*
     64  * This driver uses four per-instance locks:
     65  *
     66  * xnf_gref_lock:
     67  *
     68  *    Protects access to the grant reference list stored in
     69  *    xnf_gref_head. Grant references should be acquired and released
     70  *    using gref_get() and gref_put() respectively.
     71  *
     72  * xnf_schedlock:
     73  *
     74  *    Protects:
     75  *    xnf_need_sched - used to record that a previous transmit attempt
     76  *       failed (and consequently it will be necessary to call
     77  *       mac_tx_update() when transmit resources are available).
     78  *    xnf_pending_multicast - the number of multicast requests that
     79  *       have been submitted to the backend for which we have not
     80  *       processed responses.
     81  *
     82  * xnf_txlock:
     83  *
     84  *    Protects the transmit ring (xnf_tx_ring) and associated
     85  *    structures (notably xnf_tx_pkt_id and xnf_tx_pkt_id_head).
     86  *
     87  * xnf_rxlock:
     88  *
     89  *    Protects the receive ring (xnf_rx_ring) and associated
     90  *    structures (notably xnf_rx_pkt_info).
     91  *
     92  * If driver-global state that affects both the transmit and receive
     93  * rings is manipulated, both xnf_txlock and xnf_rxlock should be
     94  * held, in that order.
     95  *
     96  * xnf_schedlock is acquired both whilst holding xnf_txlock and
     97  * without. It should always be acquired after xnf_txlock if both are
     98  * held.
     99  *
    100  * Notes:
    101  * - atomic_add_64() is used to manipulate counters where we require
    102  *   accuracy. For counters intended only for observation by humans,
    103  *   post increment/decrement are used instead.
    104  */
    105 
    106 #include <sys/types.h>
    107 #include <sys/errno.h>
    108 #include <sys/param.h>
    109 #include <sys/sysmacros.h>
    110 #include <sys/systm.h>
    111 #include <sys/stream.h>
    112 #include <sys/strsubr.h>
    113 #include <sys/strsun.h>
    114 #include <sys/conf.h>
    115 #include <sys/ddi.h>
    116 #include <sys/devops.h>
    117 #include <sys/sunddi.h>
    118 #include <sys/sunndi.h>
    119 #include <sys/dlpi.h>
    120 #include <sys/ethernet.h>
    121 #include <sys/strsun.h>
    122 #include <sys/pattr.h>
    123 #include <inet/ip.h>
    124 #include <inet/ip_impl.h>
    125 #include <sys/gld.h>
    126 #include <sys/modctl.h>
    127 #include <sys/mac_provider.h>
    128 #include <sys/mac_ether.h>
    129 #include <sys/bootinfo.h>
    130 #include <sys/mach_mmu.h>
    131 #ifdef	XPV_HVM_DRIVER
    132 #include <sys/xpv_support.h>
    133 #include <sys/hypervisor.h>
    134 #else
    135 #include <sys/hypervisor.h>
    136 #include <sys/evtchn_impl.h>
    137 #include <sys/balloon_impl.h>
    138 #endif
    139 #include <xen/public/io/netif.h>
    140 #include <sys/gnttab.h>
    141 #include <xen/sys/xendev.h>
    142 #include <sys/sdt.h>
    143 #include <sys/note.h>
    144 #include <sys/debug.h>
    145 
    146 #include <io/xnf.h>
    147 
    148 #if defined(DEBUG) || defined(__lint)
    149 #define	XNF_DEBUG
    150 #endif
    151 
#ifdef XNF_DEBUG
/*
 * Bitmask selecting which classes of debug message are printed; it is
 * tested against XNF_DEBUG_* flags (e.g. XNF_DEBUG_DDI in xnf_attach()).
 */
int xnf_debug = 0;
/*
 * Set by xnf_attach() to the first instance that attaches successfully
 * while this is still NULL.
 */
xnf_t *xnf_debug_instance = NULL;
#endif
    156 
    157 /*
    158  * On a 32 bit PAE system physical and machine addresses are larger
    159  * than 32 bits.  ddi_btop() on such systems take an unsigned long
    160  * argument, and so addresses above 4G are truncated before ddi_btop()
    161  * gets to see them.  To avoid this, code the shift operation here.
    162  */
    163 #define	xnf_btop(addr)	((addr) >> PAGESHIFT)
    164 
/*
 * NOTE(review): presumably the largest number of fragments a transmit
 * packet may have before it is handled differently; no use of this
 * variable is visible in this part of the file -- confirm.
 */
unsigned int	xnf_max_tx_frags = 1;
    166 
    167 /*
    168  * Should we use the multicast control feature if the backend provides
    169  * it?
    170  */
    171 boolean_t xnf_multicast_control = B_TRUE;
    172 
    173 /*
    174  * Received packets below this size are copied to a new streams buffer
    175  * rather than being desballoc'ed.
    176  *
    177  * This value is chosen to accommodate traffic where there are a large
    178  * number of small packets. For data showing a typical distribution,
    179  * see:
    180  *
    181  * Sinha07a:
    182  *	Rishi Sinha, Christos Papadopoulos, and John
    183  *	Heidemann. Internet Packet Size Distributions: Some
    184  *	Observations. Technical Report ISI-TR-2007-643,
 *	USC/Information Sciences Institute, May, 2007. Originally
    186  *	released October 2005 as web page
    187  *	http://netweb.usc.edu/~sinha/pkt-sizes/.
    188  *	<http://www.isi.edu/~johnh/PAPERS/Sinha07a.html>.
    189  */
    190 size_t xnf_rx_copy_limit = 64;
    191 
/* Sentinel ("none") values: -1 converted to the respective type. */
#define	INVALID_GRANT_HANDLE	((grant_handle_t)-1)
#define	INVALID_GRANT_REF	((grant_ref_t)-1)
#define	INVALID_TX_ID		((uint16_t)-1)

/* Map transmit id `id' to its entry in (p)->xnf_tx_pkt_id[]. */
#define	TX_ID_TO_TXID(p, id) (&((p)->xnf_tx_pkt_id[(id)]))
/* True if `i' is not the sentinel and is below NET_TX_RING_SIZE. */
#define	TX_ID_VALID(i) (((i) != INVALID_TX_ID) && ((i) < NET_TX_RING_SIZE))
    198 
    199 /* Required system entry points */
    200 static int	xnf_attach(dev_info_t *, ddi_attach_cmd_t);
    201 static int	xnf_detach(dev_info_t *, ddi_detach_cmd_t);
    202 
    203 /* Required driver entry points for Nemo */
    204 static int	xnf_start(void *);
    205 static void	xnf_stop(void *);
    206 static int	xnf_set_mac_addr(void *, const uint8_t *);
    207 static int	xnf_set_multicast(void *, boolean_t, const uint8_t *);
    208 static int	xnf_set_promiscuous(void *, boolean_t);
    209 static mblk_t	*xnf_send(void *, mblk_t *);
    210 static uint_t	xnf_intr(caddr_t);
    211 static int	xnf_stat(void *, uint_t, uint64_t *);
    212 static boolean_t xnf_getcapab(void *, mac_capab_t, void *);
    213 
    214 /* Driver private functions */
    215 static int xnf_alloc_dma_resources(xnf_t *);
    216 static void xnf_release_dma_resources(xnf_t *);
    217 static void xnf_release_mblks(xnf_t *);
    218 
    219 static int xnf_buf_constructor(void *, void *, int);
    220 static void xnf_buf_destructor(void *, void *);
    221 static xnf_buf_t *xnf_buf_get(xnf_t *, int, boolean_t);
    222 #pragma inline(xnf_buf_get)
    223 static void xnf_buf_put(xnf_t *, xnf_buf_t *, boolean_t);
    224 #pragma inline(xnf_buf_put)
    225 static void xnf_buf_refresh(xnf_buf_t *);
    226 #pragma inline(xnf_buf_refresh)
    227 static void xnf_buf_recycle(xnf_buf_t *);
    228 
    229 static int xnf_tx_buf_constructor(void *, void *, int);
    230 static void xnf_tx_buf_destructor(void *, void *);
    231 
    232 static grant_ref_t gref_get(xnf_t *);
    233 #pragma inline(gref_get)
    234 static void gref_put(xnf_t *, grant_ref_t);
    235 #pragma inline(gref_put)
    236 
    237 static xnf_txid_t *txid_get(xnf_t *);
    238 #pragma inline(txid_get)
    239 static void txid_put(xnf_t *, xnf_txid_t *);
    240 #pragma inline(txid_put)
    241 
    242 void xnf_send_driver_status(int, int);
    243 static void xnf_rxbuf_hang(xnf_t *, xnf_buf_t *);
    244 static int xnf_tx_clean_ring(xnf_t  *);
    245 static void oe_state_change(dev_info_t *, ddi_eventcookie_t,
    246     void *, void *);
    247 static boolean_t xnf_kstat_init(xnf_t *);
    248 static void xnf_rx_collect(xnf_t *);
    249 
/*
 * MAC callback vector for this driver.  xnf_attach() hands it to the
 * MAC framework through the m_callbacks field of its mac_register_t.
 */
static mac_callbacks_t xnf_callbacks = {
	MC_GETCAPAB,	/* NOTE(review): presumably flags xnf_getcapab as set */
	xnf_stat,
	xnf_start,
	xnf_stop,
	xnf_set_promiscuous,
	xnf_set_multicast,
	xnf_set_mac_addr,
	xnf_send,
	NULL,		/* no handler supplied for this slot */
	xnf_getcapab
};
    262 
/*
 * DMA attributes for network ring buffer: a single segment, aligned
 * to MMU_PAGESIZE, with no restriction on the address range.
 */
static ddi_dma_attr_t ringbuf_dma_attr = {
	DMA_ATTR_V0,		/* version of this structure */
	0,			/* lowest usable address */
	0xffffffffffffffffULL,	/* highest usable address */
	0x7fffffff,		/* maximum DMAable byte count */
	MMU_PAGESIZE,		/* alignment in bytes */
	0x7ff,			/* bitmap of burst sizes */
	1,			/* minimum transfer */
	0xffffffffU,		/* maximum transfer */
	0xffffffffffffffffULL,	/* maximum segment length */
	1,			/* maximum number of segments */
	1,			/* granularity */
	0,			/* flags (reserved) */
};
    278 
/*
 * DMA attributes for transmit and receive data: a single segment,
 * aligned to MMU_PAGESIZE, with no restriction on the address range.
 */
static ddi_dma_attr_t buf_dma_attr = {
	DMA_ATTR_V0,		/* version of this structure */
	0,			/* lowest usable address */
	0xffffffffffffffffULL,	/* highest usable address */
	0x7fffffff,		/* maximum DMAable byte count */
	MMU_PAGESIZE,		/* alignment in bytes */
	0x7ff,			/* bitmap of burst sizes */
	1,			/* minimum transfer */
	0xffffffffU,		/* maximum transfer */
	0xffffffffffffffffULL,	/* maximum segment length */
	1,			/* maximum number of segments */
	1,			/* granularity */
	0,			/* flags (reserved) */
};
    294 
/*
 * DMA access attributes for registers and descriptors: little-endian
 * structure access with strict ordering.
 */
static ddi_device_acc_attr_t accattr = {
	DDI_DEVICE_ATTR_V0,
	DDI_STRUCTURE_LE_ACC,	/* This is a little-endian device */
	DDI_STRICTORDER_ACC
};

/*
 * DMA access attributes for data: NOT to be byte swapped, strict
 * ordering.
 */
static ddi_device_acc_attr_t data_accattr = {
	DDI_DEVICE_ATTR_V0,
	DDI_NEVERSWAP_ACC,
	DDI_STRICTORDER_ACC
};
    308 
/*
 * Device operations for the driver.  xnf_attach() and xnf_detach() are
 * the only driver-supplied handlers; the remaining slots are filled
 * with nulldev/nodev/NULL and quiesce is declared as not supported.
 */
DDI_DEFINE_STREAM_OPS(xnf_dev_ops, nulldev, nulldev, xnf_attach, xnf_detach,
    nodev, NULL, D_MP, NULL, ddi_quiesce_not_supported);

/* Driver module description, referring to the device operations above. */
static struct modldrv xnf_modldrv = {
	&mod_driverops,
	"Virtual Ethernet driver",
	&xnf_dev_ops
};

/* Module linkage passed to mod_install() and mod_info(). */
static struct modlinkage modlinkage = {
	MODREV_1, &xnf_modldrv, NULL
};
    321 
    322 int
    323 _init(void)
    324 {
    325 	int r;
    326 
    327 	mac_init_ops(&xnf_dev_ops, "xnf");
    328 	r = mod_install(&modlinkage);
    329 	if (r != DDI_SUCCESS)
    330 		mac_fini_ops(&xnf_dev_ops);
    331 
    332 	return (r);
    333 }
    334 
/*
 * Module unload entry point.  Unconditionally returns EBUSY, i.e. the
 * request to unload is always refused.
 */
int
_fini(void)
{
	return (EBUSY); /* XXPV should be removable */
}
    340 
    341 int
    342 _info(struct modinfo *modinfop)
    343 {
    344 	return (mod_info(&modlinkage, modinfop));
    345 }
    346 
    347 /*
    348  * Acquire a grant reference.
    349  */
    350 static grant_ref_t
    351 gref_get(xnf_t *xnfp)
    352 {
    353 	grant_ref_t gref;
    354 
    355 	mutex_enter(&xnfp->xnf_gref_lock);
    356 
    357 	do {
    358 		gref = gnttab_claim_grant_reference(&xnfp->xnf_gref_head);
    359 
    360 	} while ((gref == INVALID_GRANT_REF) &&
    361 	    (gnttab_alloc_grant_references(16, &xnfp->xnf_gref_head) == 0));
    362 
    363 	mutex_exit(&xnfp->xnf_gref_lock);
    364 
    365 	if (gref == INVALID_GRANT_REF) {
    366 		xnfp->xnf_stat_gref_failure++;
    367 	} else {
    368 		atomic_add_64(&xnfp->xnf_stat_gref_outstanding, 1);
    369 		if (xnfp->xnf_stat_gref_outstanding > xnfp->xnf_stat_gref_peak)
    370 			xnfp->xnf_stat_gref_peak =
    371 			    xnfp->xnf_stat_gref_outstanding;
    372 	}
    373 
    374 	return (gref);
    375 }
    376 
/*
 * Release a grant reference.
 *
 * `gref' must be a valid reference (asserted).  It is handed back to
 * the per-instance list under xnf_gref_lock, after which the count of
 * outstanding references is decremented atomically.
 */
static void
gref_put(xnf_t *xnfp, grant_ref_t gref)
{
	ASSERT(gref != INVALID_GRANT_REF);

	mutex_enter(&xnfp->xnf_gref_lock);
	gnttab_release_grant_reference(&xnfp->xnf_gref_head, gref);
	mutex_exit(&xnfp->xnf_gref_lock);

	atomic_add_64(&xnfp->xnf_stat_gref_outstanding, -1);
}
    391 
    392 /*
    393  * Acquire a transmit id.
    394  */
    395 static xnf_txid_t *
    396 txid_get(xnf_t *xnfp)
    397 {
    398 	xnf_txid_t *tidp;
    399 
    400 	ASSERT(MUTEX_HELD(&xnfp->xnf_txlock));
    401 
    402 	if (xnfp->xnf_tx_pkt_id_head == INVALID_TX_ID)
    403 		return (NULL);
    404 
    405 	ASSERT(TX_ID_VALID(xnfp->xnf_tx_pkt_id_head));
    406 
    407 	tidp = TX_ID_TO_TXID(xnfp, xnfp->xnf_tx_pkt_id_head);
    408 	xnfp->xnf_tx_pkt_id_head = tidp->next;
    409 	tidp->next = INVALID_TX_ID;
    410 
    411 	ASSERT(tidp->txbuf == NULL);
    412 
    413 	return (tidp);
    414 }
    415 
/*
 * Release a transmit id.
 *
 * The caller must hold xnf_txlock, and `tidp' must carry a valid id
 * and not be linked into the free list (next == INVALID_TX_ID); all
 * three conditions are asserted.  The entry's txbuf pointer is cleared
 * and the entry is pushed onto the front of the free list.
 */
static void
txid_put(xnf_t *xnfp, xnf_txid_t *tidp)
{
	ASSERT(MUTEX_HELD(&xnfp->xnf_txlock));
	ASSERT(TX_ID_VALID(tidp->id));
	ASSERT(tidp->next == INVALID_TX_ID);

	tidp->txbuf = NULL;
	tidp->next = xnfp->xnf_tx_pkt_id_head;
	xnfp->xnf_tx_pkt_id_head = tidp->id;
}
    430 
    431 /*
    432  * Get `wanted' slots in the transmit ring, waiting for at least that
    433  * number if `wait' is B_TRUE. Force the ring to be cleaned by setting
    434  * `wanted' to zero.
    435  *
    436  * Return the number of slots available.
    437  */
    438 static int
    439 tx_slots_get(xnf_t *xnfp, int wanted, boolean_t wait)
    440 {
    441 	int slotsfree;
    442 	boolean_t forced_clean = (wanted == 0);
    443 
    444 	ASSERT(MUTEX_HELD(&xnfp->xnf_txlock));
    445 
    446 	/* LINTED: constant in conditional context */
    447 	while (B_TRUE) {
    448 		slotsfree = RING_FREE_REQUESTS(&xnfp->xnf_tx_ring);
    449 
    450 		if ((slotsfree < wanted) || forced_clean)
    451 			slotsfree = xnf_tx_clean_ring(xnfp);
    452 
    453 		/*
    454 		 * If there are more than we need free, tell other
    455 		 * people to come looking again. We hold txlock, so we
    456 		 * are able to take our slots before anyone else runs.
    457 		 */
    458 		if (slotsfree > wanted)
    459 			cv_broadcast(&xnfp->xnf_cv_tx_slots);
    460 
    461 		if (slotsfree >= wanted)
    462 			break;
    463 
    464 		if (!wait)
    465 			break;
    466 
    467 		cv_wait(&xnfp->xnf_cv_tx_slots, &xnfp->xnf_txlock);
    468 	}
    469 
    470 	ASSERT(slotsfree <= RING_SIZE(&(xnfp->xnf_tx_ring)));
    471 
    472 	return (slotsfree);
    473 }
    474 
    475 static int
    476 xnf_setup_rings(xnf_t *xnfp)
    477 {
    478 	domid_t			oeid;
    479 	struct xenbus_device	*xsd;
    480 	RING_IDX		i;
    481 	int			err;
    482 	xnf_txid_t		*tidp;
    483 	xnf_buf_t **bdescp;
    484 
    485 	oeid = xvdi_get_oeid(xnfp->xnf_devinfo);
    486 	xsd = xvdi_get_xsd(xnfp->xnf_devinfo);
    487 
    488 	if (xnfp->xnf_tx_ring_ref != INVALID_GRANT_REF)
    489 		gnttab_end_foreign_access(xnfp->xnf_tx_ring_ref, 0, 0);
    490 
    491 	err = gnttab_grant_foreign_access(oeid,
    492 	    xnf_btop(pa_to_ma(xnfp->xnf_tx_ring_phys_addr)), 0);
    493 	if (err <= 0) {
    494 		err = -err;
    495 		xenbus_dev_error(xsd, err, "granting access to tx ring page");
    496 		goto out;
    497 	}
    498 	xnfp->xnf_tx_ring_ref = (grant_ref_t)err;
    499 
    500 	if (xnfp->xnf_rx_ring_ref != INVALID_GRANT_REF)
    501 		gnttab_end_foreign_access(xnfp->xnf_rx_ring_ref, 0, 0);
    502 
    503 	err = gnttab_grant_foreign_access(oeid,
    504 	    xnf_btop(pa_to_ma(xnfp->xnf_rx_ring_phys_addr)), 0);
    505 	if (err <= 0) {
    506 		err = -err;
    507 		xenbus_dev_error(xsd, err, "granting access to rx ring page");
    508 		goto out;
    509 	}
    510 	xnfp->xnf_rx_ring_ref = (grant_ref_t)err;
    511 
    512 	mutex_enter(&xnfp->xnf_txlock);
    513 
    514 	/*
    515 	 * Setup/cleanup the TX ring.  Note that this can lose packets
    516 	 * after a resume, but we expect to stagger on.
    517 	 */
    518 	xnfp->xnf_tx_pkt_id_head = INVALID_TX_ID; /* I.e. emtpy list. */
    519 	for (i = 0, tidp = &xnfp->xnf_tx_pkt_id[0];
    520 	    i < NET_TX_RING_SIZE;
    521 	    i++, tidp++) {
    522 		xnf_txbuf_t *txp;
    523 
    524 		tidp->id = i;
    525 
    526 		txp = tidp->txbuf;
    527 		if (txp == NULL) {
    528 			tidp->next = INVALID_TX_ID; /* Appease txid_put(). */
    529 			txid_put(xnfp, tidp);
    530 			continue;
    531 		}
    532 
    533 		ASSERT(txp->tx_txreq.gref != INVALID_GRANT_REF);
    534 		ASSERT(txp->tx_mp != NULL);
    535 
    536 		switch (txp->tx_type) {
    537 		case TX_DATA:
    538 			VERIFY(gnttab_query_foreign_access(txp->tx_txreq.gref)
    539 			    == 0);
    540 
    541 			if (txp->tx_bdesc == NULL) {
    542 				(void) gnttab_end_foreign_access_ref(
    543 				    txp->tx_txreq.gref, 1);
    544 				gref_put(xnfp, txp->tx_txreq.gref);
    545 				(void) ddi_dma_unbind_handle(
    546 				    txp->tx_dma_handle);
    547 			} else {
    548 				xnf_buf_put(xnfp, txp->tx_bdesc, B_TRUE);
    549 			}
    550 
    551 			freemsg(txp->tx_mp);
    552 			txid_put(xnfp, tidp);
    553 			kmem_cache_free(xnfp->xnf_tx_buf_cache, txp);
    554 
    555 			break;
    556 
    557 		case TX_MCAST_REQ:
    558 			txp->tx_type = TX_MCAST_RSP;
    559 			txp->tx_status = NETIF_RSP_DROPPED;
    560 			cv_broadcast(&xnfp->xnf_cv_multicast);
    561 
    562 			/*
    563 			 * The request consumed two slots in the ring,
    564 			 * yet only a single xnf_txid_t is used. Step
    565 			 * over the empty slot.
    566 			 */
    567 			i++;
    568 			ASSERT(i < NET_TX_RING_SIZE);
    569 
    570 			break;
    571 
    572 		case TX_MCAST_RSP:
    573 			break;
    574 		}
    575 	}
    576 
    577 	/* LINTED: constant in conditional context */
    578 	SHARED_RING_INIT(xnfp->xnf_tx_ring.sring);
    579 	/* LINTED: constant in conditional context */
    580 	FRONT_RING_INIT(&xnfp->xnf_tx_ring,
    581 	    xnfp->xnf_tx_ring.sring, PAGESIZE);
    582 
    583 	mutex_exit(&xnfp->xnf_txlock);
    584 
    585 	mutex_enter(&xnfp->xnf_rxlock);
    586 
    587 	/*
    588 	 * Clean out any buffers currently posted to the receive ring
    589 	 * before we reset it.
    590 	 */
    591 	for (i = 0, bdescp = &xnfp->xnf_rx_pkt_info[0];
    592 	    i < NET_RX_RING_SIZE;
    593 	    i++, bdescp++) {
    594 		if (*bdescp != NULL) {
    595 			xnf_buf_put(xnfp, *bdescp, B_FALSE);
    596 			*bdescp = NULL;
    597 		}
    598 	}
    599 
    600 	/* LINTED: constant in conditional context */
    601 	SHARED_RING_INIT(xnfp->xnf_rx_ring.sring);
    602 	/* LINTED: constant in conditional context */
    603 	FRONT_RING_INIT(&xnfp->xnf_rx_ring,
    604 	    xnfp->xnf_rx_ring.sring, PAGESIZE);
    605 
    606 	/*
    607 	 * Fill the ring with buffers.
    608 	 */
    609 	for (i = 0; i < NET_RX_RING_SIZE; i++) {
    610 		xnf_buf_t *bdesc;
    611 
    612 		bdesc = xnf_buf_get(xnfp, KM_SLEEP, B_FALSE);
    613 		VERIFY(bdesc != NULL);
    614 		xnf_rxbuf_hang(xnfp, bdesc);
    615 	}
    616 
    617 	/* LINTED: constant in conditional context */
    618 	RING_PUSH_REQUESTS(&xnfp->xnf_rx_ring);
    619 
    620 	mutex_exit(&xnfp->xnf_rxlock);
    621 
    622 	return (0);
    623 
    624 out:
    625 	if (xnfp->xnf_tx_ring_ref != INVALID_GRANT_REF)
    626 		gnttab_end_foreign_access(xnfp->xnf_tx_ring_ref, 0, 0);
    627 	xnfp->xnf_tx_ring_ref = INVALID_GRANT_REF;
    628 
    629 	if (xnfp->xnf_rx_ring_ref != INVALID_GRANT_REF)
    630 		gnttab_end_foreign_access(xnfp->xnf_rx_ring_ref, 0, 0);
    631 	xnfp->xnf_rx_ring_ref = INVALID_GRANT_REF;
    632 
    633 	return (err);
    634 }
    635 
    636 /*
    637  * Connect driver to back end, called to set up communication with
    638  * back end driver both initially and on resume after restore/migrate.
    639  */
    640 void
    641 xnf_be_connect(xnf_t *xnfp)
    642 {
    643 	const char	*message;
    644 	xenbus_transaction_t xbt;
    645 	struct		xenbus_device *xsd;
    646 	char		*xsname;
    647 	int		err;
    648 
    649 	ASSERT(!xnfp->xnf_connected);
    650 
    651 	xsd = xvdi_get_xsd(xnfp->xnf_devinfo);
    652 	xsname = xvdi_get_xsname(xnfp->xnf_devinfo);
    653 
    654 	err = xnf_setup_rings(xnfp);
    655 	if (err != 0) {
    656 		cmn_err(CE_WARN, "failed to set up tx/rx rings");
    657 		xenbus_dev_error(xsd, err, "setting up ring");
    658 		return;
    659 	}
    660 
    661 again:
    662 	err = xenbus_transaction_start(&xbt);
    663 	if (err != 0) {
    664 		xenbus_dev_error(xsd, EIO, "starting transaction");
    665 		return;
    666 	}
    667 
    668 	err = xenbus_printf(xbt, xsname, "tx-ring-ref", "%u",
    669 	    xnfp->xnf_tx_ring_ref);
    670 	if (err != 0) {
    671 		message = "writing tx ring-ref";
    672 		goto abort_transaction;
    673 	}
    674 
    675 	err = xenbus_printf(xbt, xsname, "rx-ring-ref", "%u",
    676 	    xnfp->xnf_rx_ring_ref);
    677 	if (err != 0) {
    678 		message = "writing rx ring-ref";
    679 		goto abort_transaction;
    680 	}
    681 
    682 	err = xenbus_printf(xbt, xsname, "event-channel", "%u",
    683 	    xnfp->xnf_evtchn);
    684 	if (err != 0) {
    685 		message = "writing event-channel";
    686 		goto abort_transaction;
    687 	}
    688 
    689 	err = xenbus_printf(xbt, xsname, "feature-rx-notify", "%d", 1);
    690 	if (err != 0) {
    691 		message = "writing feature-rx-notify";
    692 		goto abort_transaction;
    693 	}
    694 
    695 	err = xenbus_printf(xbt, xsname, "request-rx-copy", "%d", 1);
    696 	if (err != 0) {
    697 		message = "writing request-rx-copy";
    698 		goto abort_transaction;
    699 	}
    700 
    701 	if (xnfp->xnf_be_mcast_control) {
    702 		err = xenbus_printf(xbt, xsname, "request-multicast-control",
    703 		    "%d", 1);
    704 		if (err != 0) {
    705 			message = "writing request-multicast-control";
    706 			goto abort_transaction;
    707 		}
    708 	}
    709 
    710 	err = xvdi_switch_state(xnfp->xnf_devinfo, xbt, XenbusStateConnected);
    711 	if (err != 0) {
    712 		message = "switching state to XenbusStateConnected";
    713 		goto abort_transaction;
    714 	}
    715 
    716 	err = xenbus_transaction_end(xbt, 0);
    717 	if (err != 0) {
    718 		if (err == EAGAIN)
    719 			goto again;
    720 		xenbus_dev_error(xsd, err, "completing transaction");
    721 	}
    722 
    723 	return;
    724 
    725 abort_transaction:
    726 	(void) xenbus_transaction_end(xbt, 1);
    727 	xenbus_dev_error(xsd, err, "%s", message);
    728 }
    729 
    730 /*
    731  * Read configuration information from xenstore.
    732  */
    733 void
    734 xnf_read_config(xnf_t *xnfp)
    735 {
    736 	int err, be_cap;
    737 	char mac[ETHERADDRL * 3];
    738 	char *oename = xvdi_get_oename(xnfp->xnf_devinfo);
    739 
    740 	err = xenbus_scanf(XBT_NULL, oename, "mac",
    741 	    "%s", (char *)&mac[0]);
    742 	if (err != 0) {
    743 		/*
    744 		 * bad: we're supposed to be set up with a proper mac
    745 		 * addr. at this point
    746 		 */
    747 		cmn_err(CE_WARN, "%s%d: no mac address",
    748 		    ddi_driver_name(xnfp->xnf_devinfo),
    749 		    ddi_get_instance(xnfp->xnf_devinfo));
    750 			return;
    751 	}
    752 	if (ether_aton(mac, xnfp->xnf_mac_addr) != ETHERADDRL) {
    753 		err = ENOENT;
    754 		xenbus_dev_error(xvdi_get_xsd(xnfp->xnf_devinfo), ENOENT,
    755 		    "parsing %s/mac", xvdi_get_xsname(xnfp->xnf_devinfo));
    756 		return;
    757 	}
    758 
    759 	err = xenbus_scanf(XBT_NULL, oename,
    760 	    "feature-rx-copy", "%d", &be_cap);
    761 	/*
    762 	 * If we fail to read the store we assume that the key is
    763 	 * absent, implying an older domain at the far end.  Older
    764 	 * domains cannot do HV copy.
    765 	 */
    766 	if (err != 0)
    767 		be_cap = 0;
    768 	xnfp->xnf_be_rx_copy = (be_cap != 0);
    769 
    770 	err = xenbus_scanf(XBT_NULL, oename,
    771 	    "feature-multicast-control", "%d", &be_cap);
    772 	/*
    773 	 * If we fail to read the store we assume that the key is
    774 	 * absent, implying an older domain at the far end.  Older
    775 	 * domains do not support multicast control.
    776 	 */
    777 	if (err != 0)
    778 		be_cap = 0;
    779 	xnfp->xnf_be_mcast_control = (be_cap != 0) && xnf_multicast_control;
    780 }
    781 
    782 /*
    783  *  attach(9E) -- Attach a device to the system
    784  */
    785 static int
    786 xnf_attach(dev_info_t *devinfo, ddi_attach_cmd_t cmd)
    787 {
    788 	mac_register_t *macp;
    789 	xnf_t *xnfp;
    790 	int err;
    791 	char cachename[32];
    792 
    793 #ifdef XNF_DEBUG
    794 	if (xnf_debug & XNF_DEBUG_DDI)
    795 		printf("xnf%d: attach(0x%p)\n", ddi_get_instance(devinfo),
    796 		    (void *)devinfo);
    797 #endif
    798 
    799 	switch (cmd) {
    800 	case DDI_RESUME:
    801 		xnfp = ddi_get_driver_private(devinfo);
    802 		xnfp->xnf_gen++;
    803 
    804 		(void) xvdi_resume(devinfo);
    805 		(void) xvdi_alloc_evtchn(devinfo);
    806 		xnfp->xnf_evtchn = xvdi_get_evtchn(devinfo);
    807 #ifdef XPV_HVM_DRIVER
    808 		ec_bind_evtchn_to_handler(xnfp->xnf_evtchn, IPL_VIF, xnf_intr,
    809 		    xnfp);
    810 #else
    811 		(void) ddi_add_intr(devinfo, 0, NULL, NULL, xnf_intr,
    812 		    (caddr_t)xnfp);
    813 #endif
    814 		return (DDI_SUCCESS);
    815 
    816 	case DDI_ATTACH:
    817 		break;
    818 
    819 	default:
    820 		return (DDI_FAILURE);
    821 	}
    822 
    823 	/*
    824 	 *  Allocate gld_mac_info_t and xnf_instance structures
    825 	 */
    826 	macp = mac_alloc(MAC_VERSION);
    827 	if (macp == NULL)
    828 		return (DDI_FAILURE);
    829 	xnfp = kmem_zalloc(sizeof (*xnfp), KM_SLEEP);
    830 
    831 	macp->m_dip = devinfo;
    832 	macp->m_driver = xnfp;
    833 	xnfp->xnf_devinfo = devinfo;
    834 
    835 	macp->m_type_ident = MAC_PLUGIN_IDENT_ETHER;
    836 	macp->m_src_addr = xnfp->xnf_mac_addr;
    837 	macp->m_callbacks = &xnf_callbacks;
    838 	macp->m_min_sdu = 0;
    839 	macp->m_max_sdu = XNF_MAXPKT;
    840 
    841 	xnfp->xnf_running = B_FALSE;
    842 	xnfp->xnf_connected = B_FALSE;
    843 	xnfp->xnf_be_rx_copy = B_FALSE;
    844 	xnfp->xnf_be_mcast_control = B_FALSE;
    845 	xnfp->xnf_need_sched = B_FALSE;
    846 
    847 	xnfp->xnf_rx_head = NULL;
    848 	xnfp->xnf_rx_tail = NULL;
    849 	xnfp->xnf_rx_new_buffers_posted = B_FALSE;
    850 
    851 #ifdef XPV_HVM_DRIVER
    852 	/*
    853 	 * Report our version to dom0.
    854 	 */
    855 	if (xenbus_printf(XBT_NULL, "guest/xnf", "version", "%d",
    856 	    HVMPV_XNF_VERS))
    857 		cmn_err(CE_WARN, "xnf: couldn't write version\n");
    858 #endif
    859 
    860 	/*
    861 	 * Get the iblock cookie with which to initialize the mutexes.
    862 	 */
    863 	if (ddi_get_iblock_cookie(devinfo, 0, &xnfp->xnf_icookie)
    864 	    != DDI_SUCCESS)
    865 		goto failure;
    866 
    867 	mutex_init(&xnfp->xnf_txlock,
    868 	    NULL, MUTEX_DRIVER, xnfp->xnf_icookie);
    869 	mutex_init(&xnfp->xnf_rxlock,
    870 	    NULL, MUTEX_DRIVER, xnfp->xnf_icookie);
    871 	mutex_init(&xnfp->xnf_schedlock,
    872 	    NULL, MUTEX_DRIVER, xnfp->xnf_icookie);
    873 	mutex_init(&xnfp->xnf_gref_lock,
    874 	    NULL, MUTEX_DRIVER, xnfp->xnf_icookie);
    875 
    876 	cv_init(&xnfp->xnf_cv_state, NULL, CV_DEFAULT, NULL);
    877 	cv_init(&xnfp->xnf_cv_multicast, NULL, CV_DEFAULT, NULL);
    878 	cv_init(&xnfp->xnf_cv_tx_slots, NULL, CV_DEFAULT, NULL);
    879 
    880 	(void) sprintf(cachename, "xnf_buf_cache_%d",
    881 	    ddi_get_instance(devinfo));
    882 	xnfp->xnf_buf_cache = kmem_cache_create(cachename,
    883 	    sizeof (xnf_buf_t), 0,
    884 	    xnf_buf_constructor, xnf_buf_destructor,
    885 	    NULL, xnfp, NULL, 0);
    886 	if (xnfp->xnf_buf_cache == NULL)
    887 		goto failure_0;
    888 
    889 	(void) sprintf(cachename, "xnf_tx_buf_cache_%d",
    890 	    ddi_get_instance(devinfo));
    891 	xnfp->xnf_tx_buf_cache = kmem_cache_create(cachename,
    892 	    sizeof (xnf_txbuf_t), 0,
    893 	    xnf_tx_buf_constructor, xnf_tx_buf_destructor,
    894 	    NULL, xnfp, NULL, 0);
    895 	if (xnfp->xnf_tx_buf_cache == NULL)
    896 		goto failure_1;
    897 
    898 	xnfp->xnf_gref_head = INVALID_GRANT_REF;
    899 
    900 	if (xnf_alloc_dma_resources(xnfp) == DDI_FAILURE) {
    901 		cmn_err(CE_WARN, "xnf%d: failed to allocate and initialize "
    902 		    "driver data structures",
    903 		    ddi_get_instance(xnfp->xnf_devinfo));
    904 		goto failure_2;
    905 	}
    906 
    907 	xnfp->xnf_rx_ring.sring->rsp_event =
    908 	    xnfp->xnf_tx_ring.sring->rsp_event = 1;
    909 
    910 	xnfp->xnf_tx_ring_ref = INVALID_GRANT_REF;
    911 	xnfp->xnf_rx_ring_ref = INVALID_GRANT_REF;
    912 
    913 	/* set driver private pointer now */
    914 	ddi_set_driver_private(devinfo, xnfp);
    915 
    916 	if (!xnf_kstat_init(xnfp))
    917 		goto failure_3;
    918 
    919 	/*
    920 	 * Allocate an event channel, add the interrupt handler and
    921 	 * bind it to the event channel.
    922 	 */
    923 	(void) xvdi_alloc_evtchn(devinfo);
    924 	xnfp->xnf_evtchn = xvdi_get_evtchn(devinfo);
    925 #ifdef XPV_HVM_DRIVER
    926 	ec_bind_evtchn_to_handler(xnfp->xnf_evtchn, IPL_VIF, xnf_intr, xnfp);
    927 #else
    928 	(void) ddi_add_intr(devinfo, 0, NULL, NULL, xnf_intr, (caddr_t)xnfp);
    929 #endif
    930 
    931 	err = mac_register(macp, &xnfp->xnf_mh);
    932 	mac_free(macp);
    933 	macp = NULL;
    934 	if (err != 0)
    935 		goto failure_4;
    936 
    937 	if (xvdi_add_event_handler(devinfo, XS_OE_STATE, oe_state_change, NULL)
    938 	    != DDI_SUCCESS)
    939 		goto failure_5;
    940 
    941 #ifdef XPV_HVM_DRIVER
    942 	/*
    943 	 * In the HVM case, this driver essentially replaces a driver for
    944 	 * a 'real' PCI NIC. Without the "model" property set to
    945 	 * "Ethernet controller", like the PCI code does, netbooting does
    946 	 * not work correctly, as strplumb_get_netdev_path() will not find
    947 	 * this interface.
    948 	 */
    949 	(void) ndi_prop_update_string(DDI_DEV_T_NONE, devinfo, "model",
    950 	    "Ethernet controller");
    951 #endif
    952 
    953 #ifdef XNF_DEBUG
    954 	if (xnf_debug_instance == NULL)
    955 		xnf_debug_instance = xnfp;
    956 #endif
    957 
    958 	return (DDI_SUCCESS);
    959 
    960 failure_5:
    961 	(void) mac_unregister(xnfp->xnf_mh);
    962 
    963 failure_4:
    964 #ifdef XPV_HVM_DRIVER
    965 	ec_unbind_evtchn(xnfp->xnf_evtchn);
    966 	xvdi_free_evtchn(devinfo);
    967 #else
    968 	ddi_remove_intr(devinfo, 0, xnfp->xnf_icookie);
    969 #endif
    970 	xnfp->xnf_evtchn = INVALID_EVTCHN;
    971 	kstat_delete(xnfp->xnf_kstat_aux);
    972 
    973 failure_3:
    974 	xnf_release_dma_resources(xnfp);
    975 
    976 failure_2:
    977 	kmem_cache_destroy(xnfp->xnf_tx_buf_cache);
    978 
    979 failure_1:
    980 	kmem_cache_destroy(xnfp->xnf_buf_cache);
    981 
    982 failure_0:
    983 	cv_destroy(&xnfp->xnf_cv_tx_slots);
    984 	cv_destroy(&xnfp->xnf_cv_multicast);
    985 	cv_destroy(&xnfp->xnf_cv_state);
    986 
    987 	mutex_destroy(&xnfp->xnf_gref_lock);
    988 	mutex_destroy(&xnfp->xnf_schedlock);
    989 	mutex_destroy(&xnfp->xnf_rxlock);
    990 	mutex_destroy(&xnfp->xnf_txlock);
    991 
    992 failure:
    993 	kmem_free(xnfp, sizeof (*xnfp));
    994 	if (macp != NULL)
    995 		mac_free(macp);
    996 
    997 	return (DDI_FAILURE);
    998 }
    999 
   1000 /*  detach(9E) -- Detach a device from the system */
   1001 static int
   1002 xnf_detach(dev_info_t *devinfo, ddi_detach_cmd_t cmd)
   1003 {
   1004 	xnf_t *xnfp;		/* Our private device info */
   1005 
   1006 #ifdef XNF_DEBUG
   1007 	if (xnf_debug & XNF_DEBUG_DDI)
   1008 		printf("xnf_detach(0x%p)\n", (void *)devinfo);
   1009 #endif
   1010 
   1011 	xnfp = ddi_get_driver_private(devinfo);
   1012 
   1013 	switch (cmd) {
   1014 	case DDI_SUSPEND:
   1015 #ifdef XPV_HVM_DRIVER
   1016 		ec_unbind_evtchn(xnfp->xnf_evtchn);
   1017 		xvdi_free_evtchn(devinfo);
   1018 #else
   1019 		ddi_remove_intr(devinfo, 0, xnfp->xnf_icookie);
   1020 #endif
   1021 
   1022 		xvdi_suspend(devinfo);
   1023 
   1024 		mutex_enter(&xnfp->xnf_rxlock);
   1025 		mutex_enter(&xnfp->xnf_txlock);
   1026 
   1027 		xnfp->xnf_evtchn = INVALID_EVTCHN;
   1028 		xnfp->xnf_connected = B_FALSE;
   1029 		mutex_exit(&xnfp->xnf_txlock);
   1030 		mutex_exit(&xnfp->xnf_rxlock);
   1031 
   1032 		/* claim link to be down after disconnect */
   1033 		mac_link_update(xnfp->xnf_mh, LINK_STATE_DOWN);
   1034 		return (DDI_SUCCESS);
   1035 
   1036 	case DDI_DETACH:
   1037 		break;
   1038 
   1039 	default:
   1040 		return (DDI_FAILURE);
   1041 	}
   1042 
   1043 	if (xnfp->xnf_connected)
   1044 		return (DDI_FAILURE);
   1045 
   1046 	/*
   1047 	 * Cannot detach if we have xnf_buf_t outstanding.
   1048 	 */
   1049 	if (xnfp->xnf_stat_buf_allocated > 0)
   1050 		return (DDI_FAILURE);
   1051 
   1052 	if (mac_unregister(xnfp->xnf_mh) != 0)
   1053 		return (DDI_FAILURE);
   1054 
   1055 	kstat_delete(xnfp->xnf_kstat_aux);
   1056 
   1057 	/* Stop the receiver */
   1058 	xnf_stop(xnfp);
   1059 
   1060 	xvdi_remove_event_handler(devinfo, XS_OE_STATE);
   1061 
   1062 	/* Remove the interrupt */
   1063 #ifdef XPV_HVM_DRIVER
   1064 	ec_unbind_evtchn(xnfp->xnf_evtchn);
   1065 	xvdi_free_evtchn(devinfo);
   1066 #else
   1067 	ddi_remove_intr(devinfo, 0, xnfp->xnf_icookie);
   1068 #endif
   1069 
   1070 	/* Release any pending xmit mblks */
   1071 	xnf_release_mblks(xnfp);
   1072 
   1073 	/* Release all DMA resources */
   1074 	xnf_release_dma_resources(xnfp);
   1075 
   1076 	cv_destroy(&xnfp->xnf_cv_tx_slots);
   1077 	cv_destroy(&xnfp->xnf_cv_multicast);
   1078 	cv_destroy(&xnfp->xnf_cv_state);
   1079 
   1080 	kmem_cache_destroy(xnfp->xnf_tx_buf_cache);
   1081 	kmem_cache_destroy(xnfp->xnf_buf_cache);
   1082 
   1083 	mutex_destroy(&xnfp->xnf_gref_lock);
   1084 	mutex_destroy(&xnfp->xnf_schedlock);
   1085 	mutex_destroy(&xnfp->xnf_rxlock);
   1086 	mutex_destroy(&xnfp->xnf_txlock);
   1087 
   1088 	kmem_free(xnfp, sizeof (*xnfp));
   1089 
   1090 	return (DDI_SUCCESS);
   1091 }
   1092 
   1093 /*
   1094  *  xnf_set_mac_addr() -- set the physical network address on the board.
   1095  */
   1096 static int
   1097 xnf_set_mac_addr(void *arg, const uint8_t *macaddr)
   1098 {
   1099 	_NOTE(ARGUNUSED(arg, macaddr));
   1100 
   1101 	/*
   1102 	 * We can't set our macaddr.
   1103 	 */
   1104 	return (ENOTSUP);
   1105 }
   1106 
   1107 /*
   1108  *  xnf_set_multicast() -- set (enable) or disable a multicast address.
   1109  *
   1110  *  Program the hardware to enable/disable the multicast address
   1111  *  in "mca".  Enable if "add" is true, disable if false.
   1112  */
   1113 static int
   1114 xnf_set_multicast(void *arg, boolean_t add, const uint8_t *mca)
   1115 {
   1116 	xnf_t *xnfp = arg;
   1117 	xnf_txbuf_t *txp;
   1118 	int n_slots;
   1119 	RING_IDX slot;
   1120 	xnf_txid_t *tidp;
   1121 	netif_tx_request_t *txrp;
   1122 	struct netif_extra_info *erp;
   1123 	boolean_t notify, result;
   1124 
   1125 	/*
   1126 	 * If the backend does not support multicast control then we
   1127 	 * must assume that the right packets will just arrive.
   1128 	 */
   1129 	if (!xnfp->xnf_be_mcast_control)
   1130 		return (0);
   1131 
   1132 	txp = kmem_cache_alloc(xnfp->xnf_tx_buf_cache, KM_SLEEP);
   1133 
   1134 	mutex_enter(&xnfp->xnf_txlock);
   1135 
   1136 	/*
   1137 	 * If we're not yet connected then claim success. This is
   1138 	 * acceptable because we refresh the entire set of multicast
   1139 	 * addresses when we get connected.
   1140 	 *
   1141 	 * We can't wait around here because the MAC layer expects
   1142 	 * this to be a non-blocking operation - waiting ends up
   1143 	 * causing a deadlock during resume.
   1144 	 */
   1145 	if (!xnfp->xnf_connected) {
   1146 		mutex_exit(&xnfp->xnf_txlock);
   1147 		return (0);
   1148 	}
   1149 
   1150 	/*
   1151 	 * 1. Acquire two slots in the ring.
   1152 	 * 2. Fill in the slots.
   1153 	 * 3. Request notification when the operation is done.
   1154 	 * 4. Kick the peer.
   1155 	 * 5. Wait for the response via xnf_tx_clean_ring().
   1156 	 */
   1157 
   1158 	n_slots = tx_slots_get(xnfp, 2, B_TRUE);
   1159 	ASSERT(n_slots >= 2);
   1160 
   1161 	slot = xnfp->xnf_tx_ring.req_prod_pvt;
   1162 	tidp = txid_get(xnfp);
   1163 	VERIFY(tidp != NULL);
   1164 
   1165 	txp->tx_type = TX_MCAST_REQ;
   1166 	txp->tx_slot = slot;
   1167 
   1168 	txrp = RING_GET_REQUEST(&xnfp->xnf_tx_ring, slot);
   1169 	erp = (struct netif_extra_info *)
   1170 	    RING_GET_REQUEST(&xnfp->xnf_tx_ring, slot + 1);
   1171 
   1172 	txrp->gref = 0;
   1173 	txrp->size = 0;
   1174 	txrp->offset = 0;
   1175 	/* Set tx_txreq.id to appease xnf_tx_clean_ring(). */
   1176 	txrp->id = txp->tx_txreq.id = tidp->id;
   1177 	txrp->flags = NETTXF_extra_info;
   1178 
   1179 	erp->type = add ? XEN_NETIF_EXTRA_TYPE_MCAST_ADD :
   1180 	    XEN_NETIF_EXTRA_TYPE_MCAST_DEL;
   1181 	bcopy((void *)mca, &erp->u.mcast.addr, ETHERADDRL);
   1182 
   1183 	tidp->txbuf = txp;
   1184 
   1185 	xnfp->xnf_tx_ring.req_prod_pvt = slot + 2;
   1186 
   1187 	mutex_enter(&xnfp->xnf_schedlock);
   1188 	xnfp->xnf_pending_multicast++;
   1189 	mutex_exit(&xnfp->xnf_schedlock);
   1190 
   1191 	/* LINTED: constant in conditional context */
   1192 	RING_PUSH_REQUESTS_AND_CHECK_NOTIFY(&xnfp->xnf_tx_ring,
   1193 	    notify);
   1194 	if (notify)
   1195 		ec_notify_via_evtchn(xnfp->xnf_evtchn);
   1196 
   1197 	while (txp->tx_type == TX_MCAST_REQ)
   1198 		cv_wait(&xnfp->xnf_cv_multicast,
   1199 		    &xnfp->xnf_txlock);
   1200 
   1201 	ASSERT(txp->tx_type == TX_MCAST_RSP);
   1202 
   1203 	mutex_enter(&xnfp->xnf_schedlock);
   1204 	xnfp->xnf_pending_multicast--;
   1205 	mutex_exit(&xnfp->xnf_schedlock);
   1206 
   1207 	result = (txp->tx_status == NETIF_RSP_OKAY);
   1208 
   1209 	txid_put(xnfp, tidp);
   1210 
   1211 	mutex_exit(&xnfp->xnf_txlock);
   1212 
   1213 	kmem_cache_free(xnfp->xnf_tx_buf_cache, txp);
   1214 
   1215 	return (result ? 0 : 1);
   1216 }
   1217 
   1218 /*
   1219  * xnf_set_promiscuous() -- set or reset promiscuous mode on the board
   1220  *
   1221  *  Program the hardware to enable/disable promiscuous mode.
   1222  */
   1223 static int
   1224 xnf_set_promiscuous(void *arg, boolean_t on)
   1225 {
   1226 	_NOTE(ARGUNUSED(arg, on));
   1227 
   1228 	/*
   1229 	 * We can't really do this, but we pretend that we can in
   1230 	 * order that snoop will work.
   1231 	 */
   1232 	return (0);
   1233 }
   1234 
   1235 /*
   1236  * Clean buffers that we have responses for from the transmit ring.
   1237  */
   1238 static int
   1239 xnf_tx_clean_ring(xnf_t *xnfp)
   1240 {
   1241 	boolean_t work_to_do;
   1242 
   1243 	ASSERT(MUTEX_HELD(&xnfp->xnf_txlock));
   1244 
   1245 loop:
   1246 	while (RING_HAS_UNCONSUMED_RESPONSES(&xnfp->xnf_tx_ring)) {
   1247 		RING_IDX cons, prod, i;
   1248 
   1249 		cons = xnfp->xnf_tx_ring.rsp_cons;
   1250 		prod = xnfp->xnf_tx_ring.sring->rsp_prod;
   1251 		membar_consumer();
   1252 		/*
   1253 		 * Clean tx requests from ring that we have responses
   1254 		 * for.
   1255 		 */
   1256 		DTRACE_PROBE2(xnf_tx_clean_range, int, cons, int, prod);
   1257 		for (i = cons; i != prod; i++) {
   1258 			netif_tx_response_t *trp;
   1259 			xnf_txid_t *tidp;
   1260 			xnf_txbuf_t *txp;
   1261 
   1262 			trp = RING_GET_RESPONSE(&xnfp->xnf_tx_ring, i);
   1263 			ASSERT(TX_ID_VALID(trp->id));
   1264 
   1265 			tidp = TX_ID_TO_TXID(xnfp, trp->id);
   1266 			ASSERT(tidp->id == trp->id);
   1267 			ASSERT(tidp->next == INVALID_TX_ID);
   1268 
   1269 			txp = tidp->txbuf;
   1270 			ASSERT(txp != NULL);
   1271 			ASSERT(txp->tx_txreq.id == trp->id);
   1272 
   1273 			switch (txp->tx_type) {
   1274 			case TX_DATA:
   1275 				if (gnttab_query_foreign_access(
   1276 				    txp->tx_txreq.gref) != 0)
   1277 					cmn_err(CE_PANIC,
   1278 					    "tx grant %d still in use by "
   1279 					    "backend domain",
   1280 					    txp->tx_txreq.gref);
   1281 
   1282 				if (txp->tx_bdesc == NULL) {
   1283 					(void) gnttab_end_foreign_access_ref(
   1284 					    txp->tx_txreq.gref, 1);
   1285 					gref_put(xnfp, txp->tx_txreq.gref);
   1286 					(void) ddi_dma_unbind_handle(
   1287 					    txp->tx_dma_handle);
   1288 				} else {
   1289 					xnf_buf_put(xnfp, txp->tx_bdesc,
   1290 					    B_TRUE);
   1291 				}
   1292 
   1293 				freemsg(txp->tx_mp);
   1294 				txid_put(xnfp, tidp);
   1295 				kmem_cache_free(xnfp->xnf_tx_buf_cache, txp);
   1296 
   1297 				break;
   1298 
   1299 			case TX_MCAST_REQ:
   1300 				txp->tx_type = TX_MCAST_RSP;
   1301 				txp->tx_status = trp->status;
   1302 				cv_broadcast(&xnfp->xnf_cv_multicast);
   1303 
   1304 				break;
   1305 
   1306 			case TX_MCAST_RSP:
   1307 				break;
   1308 
   1309 			default:
   1310 				cmn_err(CE_PANIC, "xnf_tx_clean_ring: "
   1311 				    "invalid xnf_txbuf_t type: %d",
   1312 				    txp->tx_type);
   1313 				break;
   1314 			}
   1315 		}
   1316 		/*
   1317 		 * Record the last response we dealt with so that we
   1318 		 * know where to start next time around.
   1319 		 */
   1320 		xnfp->xnf_tx_ring.rsp_cons = prod;
   1321 		membar_enter();
   1322 	}
   1323 
   1324 	/* LINTED: constant in conditional context */
   1325 	RING_FINAL_CHECK_FOR_RESPONSES(&xnfp->xnf_tx_ring, work_to_do);
   1326 	if (work_to_do)
   1327 		goto loop;
   1328 
   1329 	return (RING_FREE_REQUESTS(&xnfp->xnf_tx_ring));
   1330 }
   1331 
   1332 /*
   1333  * Allocate and fill in a look-aside buffer for the packet `mp'. Used
   1334  * to ensure that the packet is physically contiguous and contained
   1335  * within a single page.
   1336  */
   1337 static xnf_buf_t *
   1338 xnf_tx_pullup(xnf_t *xnfp, mblk_t *mp)
   1339 {
   1340 	xnf_buf_t *bd;
   1341 	caddr_t bp;
   1342 
   1343 	bd = xnf_buf_get(xnfp, KM_SLEEP, B_TRUE);
   1344 	if (bd == NULL)
   1345 		return (NULL);
   1346 
   1347 	bp = bd->buf;
   1348 	while (mp != NULL) {
   1349 		size_t len = MBLKL(mp);
   1350 
   1351 		bcopy(mp->b_rptr, bp, len);
   1352 		bp += len;
   1353 
   1354 		mp = mp->b_cont;
   1355 	}
   1356 
   1357 	ASSERT((bp - bd->buf) <= PAGESIZE);
   1358 
   1359 	xnfp->xnf_stat_tx_pullup++;
   1360 
   1361 	return (bd);
   1362 }
   1363 
   1364 /*
   1365  * Insert the pseudo-header checksum into the packet `buf'.
   1366  */
   1367 void
   1368 xnf_pseudo_cksum(caddr_t buf, int length)
   1369 {
   1370 	struct ether_header *ehp;
   1371 	uint16_t sap, len, *stuff;
   1372 	uint32_t cksum;
   1373 	size_t offset;
   1374 	ipha_t *ipha;
   1375 	ipaddr_t src, dst;
   1376 
   1377 	ASSERT(length >= sizeof (*ehp));
   1378 	ehp = (struct ether_header *)buf;
   1379 
   1380 	if (ntohs(ehp->ether_type) == VLAN_TPID) {
   1381 		struct ether_vlan_header *evhp;
   1382 
   1383 		ASSERT(length >= sizeof (*evhp));
   1384 		evhp = (struct ether_vlan_header *)buf;
   1385 		sap = ntohs(evhp->ether_type);
   1386 		offset = sizeof (*evhp);
   1387 	} else {
   1388 		sap = ntohs(ehp->ether_type);
   1389 		offset = sizeof (*ehp);
   1390 	}
   1391 
   1392 	ASSERT(sap == ETHERTYPE_IP);
   1393 
   1394 	/* Packet should have been pulled up by the caller. */
   1395 	if ((offset + sizeof (ipha_t)) > length) {
   1396 		cmn_err(CE_WARN, "xnf_pseudo_cksum: no room for checksum");
   1397 		return;
   1398 	}
   1399 
   1400 	ipha = (ipha_t *)(buf + offset);
   1401 
   1402 	ASSERT(IPH_HDR_LENGTH(ipha) == IP_SIMPLE_HDR_LENGTH);
   1403 
   1404 	len = ntohs(ipha->ipha_length) - IP_SIMPLE_HDR_LENGTH;
   1405 
   1406 	switch (ipha->ipha_protocol) {
   1407 	case IPPROTO_TCP:
   1408 		stuff = IPH_TCPH_CHECKSUMP(ipha, IP_SIMPLE_HDR_LENGTH);
   1409 		cksum = IP_TCP_CSUM_COMP;
   1410 		break;
   1411 	case IPPROTO_UDP:
   1412 		stuff = IPH_UDPH_CHECKSUMP(ipha, IP_SIMPLE_HDR_LENGTH);
   1413 		cksum = IP_UDP_CSUM_COMP;
   1414 		break;
   1415 	default:
   1416 		cmn_err(CE_WARN, "xnf_pseudo_cksum: unexpected protocol %d",
   1417 		    ipha->ipha_protocol);
   1418 		return;
   1419 	}
   1420 
   1421 	src = ipha->ipha_src;
   1422 	dst = ipha->ipha_dst;
   1423 
   1424 	cksum += (dst >> 16) + (dst & 0xFFFF);
   1425 	cksum += (src >> 16) + (src & 0xFFFF);
   1426 	cksum += htons(len);
   1427 
   1428 	cksum = (cksum >> 16) + (cksum & 0xFFFF);
   1429 	cksum = (cksum >> 16) + (cksum & 0xFFFF);
   1430 
   1431 	ASSERT(cksum <= 0xFFFF);
   1432 
   1433 	*stuff = (uint16_t)(cksum ? cksum : ~cksum);
   1434 }
   1435 
   1436 /*
   1437  * Push a list of prepared packets (`txp') into the transmit ring.
   1438  */
   1439 static xnf_txbuf_t *
   1440 tx_push_packets(xnf_t *xnfp, xnf_txbuf_t *txp)
   1441 {
   1442 	int slots_free;
   1443 	RING_IDX slot;
   1444 	boolean_t notify;
   1445 
   1446 	mutex_enter(&xnfp->xnf_txlock);
   1447 
   1448 	ASSERT(xnfp->xnf_running);
   1449 
   1450 	/*
   1451 	 * Wait until we are connected to the backend.
   1452 	 */
   1453 	while (!xnfp->xnf_connected)
   1454 		cv_wait(&xnfp->xnf_cv_state, &xnfp->xnf_txlock);
   1455 
   1456 	slots_free = tx_slots_get(xnfp, 1, B_FALSE);
   1457 	DTRACE_PROBE1(xnf_send_slotsfree, int, slots_free);
   1458 
   1459 	slot = xnfp->xnf_tx_ring.req_prod_pvt;
   1460 
   1461 	while ((txp != NULL) && (slots_free > 0)) {
   1462 		xnf_txid_t *tidp;
   1463 		netif_tx_request_t *txrp;
   1464 
   1465 		tidp = txid_get(xnfp);
   1466 		VERIFY(tidp != NULL);
   1467 
   1468 		txrp = RING_GET_REQUEST(&xnfp->xnf_tx_ring, slot);
   1469 
   1470 		txp->tx_slot = slot;
   1471 		txp->tx_txreq.id = tidp->id;
   1472 		*txrp = txp->tx_txreq;
   1473 
   1474 		tidp->txbuf = txp;
   1475 
   1476 		xnfp->xnf_stat_opackets++;
   1477 		xnfp->xnf_stat_obytes += txp->tx_txreq.size;
   1478 
   1479 		txp = txp->tx_next;
   1480 		slots_free--;
   1481 		slot++;
   1482 
   1483 	}
   1484 
   1485 	xnfp->xnf_tx_ring.req_prod_pvt = slot;
   1486 
   1487 	/*
   1488 	 * Tell the peer that we sent something, if it cares.
   1489 	 */
   1490 	/* LINTED: constant in conditional context */
   1491 	RING_PUSH_REQUESTS_AND_CHECK_NOTIFY(&xnfp->xnf_tx_ring,
   1492 	    notify);
   1493 	if (notify)
   1494 		ec_notify_via_evtchn(xnfp->xnf_evtchn);
   1495 
   1496 	mutex_exit(&xnfp->xnf_txlock);
   1497 
   1498 	return (txp);
   1499 }
   1500 
   1501 /*
   1502  * Send the chain of packets `mp'. Called by the MAC framework.
   1503  */
   1504 static mblk_t *
   1505 xnf_send(void *arg, mblk_t *mp)
   1506 {
   1507 	xnf_t *xnfp = arg;
   1508 	domid_t oeid;
   1509 	xnf_txbuf_t *head, *tail;
   1510 	mblk_t *ml;
   1511 	int prepared;
   1512 
   1513 	oeid = xvdi_get_oeid(xnfp->xnf_devinfo);
   1514 
   1515 	/*
   1516 	 * Prepare packets for transmission.
   1517 	 */
   1518 	head = tail = NULL;
   1519 	prepared = 0;
   1520 	while (mp != NULL) {
   1521 		xnf_txbuf_t *txp;
   1522 		int n_chunks, length;
   1523 		boolean_t page_oops;
   1524 		uint32_t pflags;
   1525 
   1526 		for (ml = mp, n_chunks = length = 0, page_oops = B_FALSE;
   1527 		    ml != NULL;
   1528 		    ml = ml->b_cont, n_chunks++) {
   1529 
   1530 			/*
   1531 			 * Test if this buffer includes a page
   1532 			 * boundary. The test assumes that the range
   1533 			 * b_rptr...b_wptr can include only a single
   1534 			 * boundary.
   1535 			 */
   1536 			if (xnf_btop((size_t)ml->b_rptr) !=
   1537 			    xnf_btop((size_t)ml->b_wptr)) {
   1538 				xnfp->xnf_stat_tx_pagebndry++;
   1539 				page_oops = B_TRUE;
   1540 			}
   1541 
   1542 			length += MBLKL(ml);
   1543 		}
   1544 		DTRACE_PROBE1(xnf_send_b_cont, int, n_chunks);
   1545 
   1546 		/*
   1547 		 * Make sure packet isn't too large.
   1548 		 */
   1549 		if (length > XNF_FRAMESIZE) {
   1550 			cmn_err(CE_WARN,
   1551 			    "xnf%d: oversized packet (%d bytes) dropped",
   1552 			    ddi_get_instance(xnfp->xnf_devinfo), length);
   1553 			freemsg(mp);
   1554 			continue;
   1555 		}
   1556 
   1557 		txp = kmem_cache_alloc(xnfp->xnf_tx_buf_cache, KM_SLEEP);
   1558 
   1559 		txp->tx_type = TX_DATA;
   1560 
   1561 		if ((n_chunks > xnf_max_tx_frags) || page_oops) {
   1562 			/*
   1563 			 * Loan a side buffer rather than the mblk
   1564 			 * itself.
   1565 			 */
   1566 			txp->tx_bdesc = xnf_tx_pullup(xnfp, mp);
   1567 			if (txp->tx_bdesc == NULL) {
   1568 				kmem_cache_free(xnfp->xnf_tx_buf_cache, txp);
   1569 				break;
   1570 			}
   1571 
   1572 			txp->tx_bufp = txp->tx_bdesc->buf;
   1573 			txp->tx_mfn = txp->tx_bdesc->buf_mfn;
   1574 			txp->tx_txreq.gref = txp->tx_bdesc->grant_ref;
   1575 
   1576 		} else {
   1577 			int rc;
   1578 			ddi_dma_cookie_t dma_cookie;
   1579 			uint_t ncookies;
   1580 
   1581 			rc = ddi_dma_addr_bind_handle(txp->tx_dma_handle,
   1582 			    NULL, (char *)mp->b_rptr, length,
   1583 			    DDI_DMA_WRITE | DDI_DMA_STREAMING,
   1584 			    DDI_DMA_DONTWAIT, 0, &dma_cookie,
   1585 			    &ncookies);
   1586 			if (rc != DDI_DMA_MAPPED) {
   1587 				ASSERT(rc != DDI_DMA_INUSE);
   1588 				ASSERT(rc != DDI_DMA_PARTIAL_MAP);
   1589 
   1590 #ifdef XNF_DEBUG
   1591 				if (rc != DDI_DMA_NORESOURCES)
   1592 					cmn_err(CE_WARN,
   1593 					    "xnf%d: bind_handle failed (%x)",
   1594 					    ddi_get_instance(xnfp->xnf_devinfo),
   1595 					    rc);
   1596 #endif
   1597 				kmem_cache_free(xnfp->xnf_tx_buf_cache, txp);
   1598 				break;
   1599 			}
   1600 			ASSERT(ncookies == 1);
   1601 
   1602 			txp->tx_bdesc = NULL;
   1603 			txp->tx_bufp = (caddr_t)mp->b_rptr;
   1604 			txp->tx_mfn =
   1605 			    xnf_btop(pa_to_ma(dma_cookie.dmac_laddress));
   1606 			txp->tx_txreq.gref = gref_get(xnfp);
   1607 			if (txp->tx_txreq.gref == INVALID_GRANT_REF) {
   1608 				(void) ddi_dma_unbind_handle(
   1609 				    txp->tx_dma_handle);
   1610 				kmem_cache_free(xnfp->xnf_tx_buf_cache, txp);
   1611 				break;
   1612 			}
   1613 			gnttab_grant_foreign_access_ref(txp->tx_txreq.gref,
   1614 			    oeid, txp->tx_mfn, 1);
   1615 		}
   1616 
   1617 		txp->tx_next = NULL;
   1618 		txp->tx_mp = mp;
   1619 		txp->tx_txreq.size = length;
   1620 		txp->tx_txreq.offset = (uintptr_t)txp->tx_bufp & PAGEOFFSET;
   1621 		txp->tx_txreq.flags = 0;
   1622 		hcksum_retrieve(mp, NULL, NULL, NULL, NULL, NULL, NULL,
   1623 		    &pflags);
   1624 		if (pflags != 0) {
   1625 			/*
   1626 			 * If the local protocol stack requests checksum
   1627 			 * offload we set the 'checksum blank' flag,
   1628 			 * indicating to the peer that we need the checksum
   1629 			 * calculated for us.
   1630 			 *
   1631 			 * We _don't_ set the validated flag, because we haven't
   1632 			 * validated that the data and the checksum match.
   1633 			 */
   1634 			xnf_pseudo_cksum(txp->tx_bufp, length);
   1635 			txp->tx_txreq.flags |= NETTXF_csum_blank;
   1636 
   1637 			xnfp->xnf_stat_tx_cksum_deferred++;
   1638 		}
   1639 
   1640 		if (head == NULL) {
   1641 			ASSERT(tail == NULL);
   1642 
   1643 			head = txp;
   1644 		} else {
   1645 			ASSERT(tail != NULL);
   1646 
   1647 			tail->tx_next = txp;
   1648 		}
   1649 		tail = txp;
   1650 
   1651 		mp = mp->b_next;
   1652 		prepared++;
   1653 
   1654 		/*
   1655 		 * There is no point in preparing more than
   1656 		 * NET_TX_RING_SIZE, as we won't be able to push them
   1657 		 * into the ring in one go and would hence have to
   1658 		 * un-prepare the extra.
   1659 		 */
   1660 		if (prepared == NET_TX_RING_SIZE)
   1661 			break;
   1662 	}
   1663 
   1664 	DTRACE_PROBE1(xnf_send_prepared, int, prepared);
   1665 
   1666 	if (mp != NULL) {
   1667 #ifdef XNF_DEBUG
   1668 		int notprepared = 0;
   1669 		mblk_t *l = mp;
   1670 
   1671 		while (l != NULL) {
   1672 			notprepared++;
   1673 			l = l->b_next;
   1674 		}
   1675 
   1676 		DTRACE_PROBE1(xnf_send_notprepared, int, notprepared);
   1677 #else /* !XNF_DEBUG */
   1678 		DTRACE_PROBE1(xnf_send_notprepared, int, -1);
   1679 #endif /* XNF_DEBUG */
   1680 	}
   1681 
   1682 	/*
   1683 	 * Push the packets we have prepared into the ring. They may
   1684 	 * not all go.
   1685 	 */
   1686 	if (head != NULL)
   1687 		head = tx_push_packets(xnfp, head);
   1688 
   1689 	/*
   1690 	 * If some packets that we prepared were not sent, unprepare
   1691 	 * them and add them back to the head of those we didn't
   1692 	 * prepare.
   1693 	 */
   1694 	{
   1695 		xnf_txbuf_t *loop;
   1696 		mblk_t *mp_head, *mp_tail;
   1697 		int unprepared = 0;
   1698 
   1699 		mp_head = mp_tail = NULL;
   1700 		loop = head;
   1701 
   1702 		while (loop != NULL) {
   1703 			xnf_txbuf_t *next = loop->tx_next;
   1704 
   1705 			if (loop->tx_bdesc == NULL) {
   1706 				(void) gnttab_end_foreign_access_ref(
   1707 				    loop->tx_txreq.gref, 1);
   1708 				gref_put(xnfp, loop->tx_txreq.gref);
   1709 				(void) ddi_dma_unbind_handle(
   1710 				    loop->tx_dma_handle);
   1711 			} else {
   1712 				xnf_buf_put(xnfp, loop->tx_bdesc, B_TRUE);
   1713 			}
   1714 
   1715 			ASSERT(loop->tx_mp != NULL);
   1716 			if (mp_head == NULL)
   1717 				mp_head = loop->tx_mp;
   1718 			mp_tail = loop->tx_mp;
   1719 
   1720 			kmem_cache_free(xnfp->xnf_tx_buf_cache, loop);
   1721 			loop = next;
   1722 			unprepared++;
   1723 		}
   1724 
   1725 		if (mp_tail == NULL) {
   1726 			ASSERT(mp_head == NULL);
   1727 		} else {
   1728 			ASSERT(mp_head != NULL);
   1729 
   1730 			mp_tail->b_next = mp;
   1731 			mp = mp_head;
   1732 		}
   1733 
   1734 		DTRACE_PROBE1(xnf_send_unprepared, int, unprepared);
   1735 	}
   1736 
   1737 	/*
   1738 	 * If any mblks are left then we have deferred for some reason
   1739 	 * and need to ask for a re-schedule later. This is typically
   1740 	 * due to the ring filling.
   1741 	 */
   1742 	if (mp != NULL) {
   1743 		mutex_enter(&xnfp->xnf_schedlock);
   1744 		xnfp->xnf_need_sched = B_TRUE;
   1745 		mutex_exit(&xnfp->xnf_schedlock);
   1746 
   1747 		xnfp->xnf_stat_tx_defer++;
   1748 	}
   1749 
   1750 	return (mp);
   1751 }
   1752 
   1753 /*
   1754  * Notification of RX packets. Currently no TX-complete interrupt is
   1755  * used, as we clean the TX ring lazily.
   1756  */
   1757 static uint_t
   1758 xnf_intr(caddr_t arg)
   1759 {
   1760 	xnf_t *xnfp = (xnf_t *)arg;
   1761 	mblk_t *mp;
   1762 	boolean_t need_sched, clean_ring;
   1763 
   1764 	mutex_enter(&xnfp->xnf_rxlock);
   1765 
   1766 	/*
   1767 	 * Interrupts before we are connected are spurious.
   1768 	 */
   1769 	if (!xnfp->xnf_connected) {
   1770 		mutex_exit(&xnfp->xnf_rxlock);
   1771 		xnfp->xnf_stat_unclaimed_interrupts++;
   1772 		return (DDI_INTR_UNCLAIMED);
   1773 	}
   1774 
   1775 	/*
   1776 	 * Receive side processing.
   1777 	 */
   1778 	do {
   1779 		/*
   1780 		 * Collect buffers from the ring.
   1781 		 */
   1782 		xnf_rx_collect(xnfp);
   1783 
   1784 		/*
   1785 		 * Interrupt me when the next receive buffer is consumed.
   1786 		 */
   1787 		xnfp->xnf_rx_ring.sring->rsp_event =
   1788 		    xnfp->xnf_rx_ring.rsp_cons + 1;
   1789 		xen_mb();
   1790 
   1791 	} while (RING_HAS_UNCONSUMED_RESPONSES(&xnfp->xnf_rx_ring));
   1792 
   1793 	if (xnfp->xnf_rx_new_buffers_posted) {
   1794 		boolean_t notify;
   1795 
   1796 		/*
   1797 		 * Indicate to the peer that we have re-filled the
   1798 		 * receive ring, if it cares.
   1799 		 */
   1800 		/* LINTED: constant in conditional context */
   1801 		RING_PUSH_REQUESTS_AND_CHECK_NOTIFY(&xnfp->xnf_rx_ring, notify);
   1802 		if (notify)
   1803 			ec_notify_via_evtchn(xnfp->xnf_evtchn);
   1804 		xnfp->xnf_rx_new_buffers_posted = B_FALSE;
   1805 	}
   1806 
   1807 	mp = xnfp->xnf_rx_head;
   1808 	xnfp->xnf_rx_head = xnfp->xnf_rx_tail = NULL;
   1809 
   1810 	xnfp->xnf_stat_interrupts++;
   1811 	mutex_exit(&xnfp->xnf_rxlock);
   1812 
   1813 	if (mp != NULL)
   1814 		mac_rx(xnfp->xnf_mh, NULL, mp);
   1815 
   1816 	/*
   1817 	 * Transmit side processing.
   1818 	 *
   1819 	 * If a previous transmit attempt failed or we have pending
   1820 	 * multicast requests, clean the ring.
   1821 	 *
   1822 	 * If we previously stalled transmission and cleaning produces
   1823 	 * some free slots, tell upstream to attempt sending again.
   1824 	 *
   1825 	 * The odd style is to avoid acquiring xnf_txlock unless we
   1826 	 * will actually look inside the tx machinery.
   1827 	 */
   1828 	mutex_enter(&xnfp->xnf_schedlock);
   1829 	need_sched = xnfp->xnf_need_sched;
   1830 	clean_ring = need_sched || (xnfp->xnf_pending_multicast > 0);
   1831 	mutex_exit(&xnfp->xnf_schedlock);
   1832 
   1833 	if (clean_ring) {
   1834 		int free_slots;
   1835 
   1836 		mutex_enter(&xnfp->xnf_txlock);
   1837 		free_slots = tx_slots_get(xnfp, 0, B_FALSE);
   1838 
   1839 		if (need_sched && (free_slots > 0)) {
   1840 			mutex_enter(&xnfp->xnf_schedlock);
   1841 			xnfp->xnf_need_sched = B_FALSE;
   1842 			mutex_exit(&xnfp->xnf_schedlock);
   1843 
   1844 			mac_tx_update(xnfp->xnf_mh);
   1845 		}
   1846 		mutex_exit(&xnfp->xnf_txlock);
   1847 	}
   1848 
   1849 	return (DDI_INTR_CLAIMED);
   1850 }
   1851 
   1852 /*
   1853  *  xnf_start() -- start the board receiving and enable interrupts.
   1854  */
   1855 static int
   1856 xnf_start(void *arg)
   1857 {
   1858 	xnf_t *xnfp = arg;
   1859 
   1860 #ifdef XNF_DEBUG
   1861 	if (xnf_debug & XNF_DEBUG_TRACE)
   1862 		printf("xnf%d start(0x%p)\n",
   1863 		    ddi_get_instance(xnfp->xnf_devinfo), (void *)xnfp);
   1864 #endif
   1865 
   1866 	mutex_enter(&xnfp->xnf_rxlock);
   1867 	mutex_enter(&xnfp->xnf_txlock);
   1868 
   1869 	/* Accept packets from above. */
   1870 	xnfp->xnf_running = B_TRUE;
   1871 
   1872 	mutex_exit(&xnfp->xnf_txlock);
   1873 	mutex_exit(&xnfp->xnf_rxlock);
   1874 
   1875 	return (0);
   1876 }
   1877 
   1878 /* xnf_stop() - disable hardware */
   1879 static void
   1880 xnf_stop(void *arg)
   1881 {
   1882 	xnf_t *xnfp = arg;
   1883 
   1884 #ifdef XNF_DEBUG
   1885 	if (xnf_debug & XNF_DEBUG_TRACE)
   1886 		printf("xnf%d stop(0x%p)\n",
   1887 		    ddi_get_instance(xnfp->xnf_devinfo), (void *)xnfp);
   1888 #endif
   1889 
   1890 	mutex_enter(&xnfp->xnf_rxlock);
   1891 	mutex_enter(&xnfp->xnf_txlock);
   1892 
   1893 	xnfp->xnf_running = B_FALSE;
   1894 
   1895 	mutex_exit(&xnfp->xnf_txlock);
   1896 	mutex_exit(&xnfp->xnf_rxlock);
   1897 }
   1898 
   1899 /*
   1900  * Hang buffer `bdesc' on the RX ring.
   1901  */
   1902 static void
   1903 xnf_rxbuf_hang(xnf_t *xnfp, xnf_buf_t *bdesc)
   1904 {
   1905 	netif_rx_request_t *reqp;
   1906 	RING_IDX hang_ix;
   1907 
   1908 	ASSERT(MUTEX_HELD(&xnfp->xnf_rxlock));
   1909 
   1910 	reqp = RING_GET_REQUEST(&xnfp->xnf_rx_ring,
   1911 	    xnfp->xnf_rx_ring.req_prod_pvt);
   1912 	hang_ix = (RING_IDX) (reqp - RING_GET_REQUEST(&xnfp->xnf_rx_ring, 0));
   1913 	ASSERT(xnfp->xnf_rx_pkt_info[hang_ix] == NULL);
   1914 
   1915 	reqp->id = bdesc->id = hang_ix;
   1916 	reqp->gref = bdesc->grant_ref;
   1917 
   1918 	xnfp->xnf_rx_pkt_info[hang_ix] = bdesc;
   1919 	xnfp->xnf_rx_ring.req_prod_pvt++;
   1920 
   1921 	xnfp->xnf_rx_new_buffers_posted = B_TRUE;
   1922 }
   1923 
   1924 /*
   1925  * Collect packets from the RX ring, storing them in `xnfp' for later
   1926  * use.
   1927  */
   1928 static void
   1929 xnf_rx_collect(xnf_t *xnfp)
   1930 {
   1931 	mblk_t *head, *tail;
   1932 
   1933 	ASSERT(MUTEX_HELD(&xnfp->xnf_rxlock));
   1934 
   1935 	/*
   1936 	 * Loop over unconsumed responses:
   1937 	 * 1. get a response
   1938 	 * 2. take corresponding buffer off recv. ring
   1939 	 * 3. indicate this by setting slot to NULL
   1940 	 * 4. create a new message and
   1941 	 * 5. copy data in, adjust ptr
   1942 	 */
   1943 
   1944 	head = tail = NULL;
   1945 
   1946 	while (RING_HAS_UNCONSUMED_RESPONSES(&xnfp->xnf_rx_ring)) {
   1947 		netif_rx_response_t *rxpkt;
   1948 		xnf_buf_t *bdesc;
   1949 		ssize_t len;
   1950 		size_t off;
   1951 		mblk_t *mp = NULL;
   1952 		boolean_t hwcsum = B_FALSE;
   1953 		grant_ref_t ref;
   1954 
   1955 		/* 1. */
   1956 		rxpkt = RING_GET_RESPONSE(&xnfp->xnf_rx_ring,
   1957 		    xnfp->xnf_rx_ring.rsp_cons);
   1958 
   1959 		DTRACE_PROBE4(xnf_rx_got_rsp, int, (int)rxpkt->id,
   1960 		    int, (int)rxpkt->offset,
   1961 		    int, (int)rxpkt->flags,
   1962 		    int, (int)rxpkt->status);
   1963 
   1964 		/*
   1965 		 * 2.
   1966 		 */
   1967 		bdesc = xnfp->xnf_rx_pkt_info[rxpkt->id];
   1968 
   1969 		/*
   1970 		 * 3.
   1971 		 */
   1972 		xnfp->xnf_rx_pkt_info[rxpkt->id] = NULL;
   1973 		ASSERT(bdesc->id == rxpkt->id);
   1974 
   1975 		ref = bdesc->grant_ref;
   1976 		off = rxpkt->offset;
   1977 		len = rxpkt->status;
   1978 
   1979 		if (!xnfp->xnf_running) {
   1980 			DTRACE_PROBE4(xnf_rx_not_running,
   1981 			    int, rxpkt->status,
   1982 			    char *, bdesc->buf, int, rxpkt->offset,
   1983 			    char *, ((char *)bdesc->buf) + rxpkt->offset);
   1984 
   1985 			xnfp->xnf_stat_drop++;
   1986 
   1987 		} else if (len <= 0) {
   1988 			DTRACE_PROBE4(xnf_rx_pkt_status_negative,
   1989 			    int, rxpkt->status,
   1990 			    char *, bdesc->buf, int, rxpkt->offset,
   1991 			    char *, ((char *)bdesc->buf) + rxpkt->offset);
   1992 
   1993 			xnfp->xnf_stat_errrx++;
   1994 
   1995 			switch (len) {
   1996 			case 0:
   1997 				xnfp->xnf_stat_runt++;
   1998 				break;
   1999 			case NETIF_RSP_ERROR:
   2000 				xnfp->xnf_stat_mac_rcv_error++;
   2001 				break;
   2002 			case NETIF_RSP_DROPPED:
   2003 				xnfp->xnf_stat_norxbuf++;
   2004 				break;
   2005 			}
   2006 
   2007 		} else if (bdesc->grant_ref == INVALID_GRANT_REF) {
   2008 			cmn_err(CE_WARN, "Bad rx grant reference %d "
   2009 			    "from domain %d", ref,
   2010 			    xvdi_get_oeid(xnfp->xnf_devinfo));
   2011 
   2012 		} else if ((off + len) > PAGESIZE) {
   2013 			cmn_err(CE_WARN, "Rx packet overflows page "
   2014 			    "(offset %ld, length %ld) from domain %d",
   2015 			    off, len, xvdi_get_oeid(xnfp->xnf_devinfo));
   2016 		} else {
   2017 			xnf_buf_t *nbuf = NULL;
   2018 
   2019 			DTRACE_PROBE4(xnf_rx_packet, int, len,
   2020 			    char *, bdesc->buf, int, off,
   2021 			    char *, ((char *)bdesc->buf) + off);
   2022 
   2023 			ASSERT(off + len <= PAGEOFFSET);
   2024 
   2025 			if (rxpkt->flags & NETRXF_data_validated)
   2026 				hwcsum = B_TRUE;
   2027 
   2028 			/*
   2029 			 * If the packet is below a pre-determined
   2030 			 * size we will copy data out rather than
   2031 			 * replace it.
   2032 			 */
   2033 			if (len > xnf_rx_copy_limit)
   2034 				nbuf = xnf_buf_get(xnfp, KM_NOSLEEP, B_FALSE);
   2035 
   2036 			/*
   2037 			 * If we have a replacement buffer, attempt to
   2038 			 * wrap the existing one with an mblk_t in
   2039 			 * order that the upper layers of the stack
   2040 			 * might use it directly.
   2041 			 */
   2042 			if (nbuf != NULL) {
   2043 				mp = desballoc((unsigned char *)bdesc->buf,
   2044 				    bdesc->len, 0, &bdesc->free_rtn);
   2045 				if (mp == NULL) {
   2046 					xnfp->xnf_stat_rx_desballoc_fail++;
   2047 					xnfp->xnf_stat_norxbuf++;
   2048 
   2049 					xnf_buf_put(xnfp, nbuf, B_FALSE);
   2050 					nbuf = NULL;
   2051 				} else {
   2052 					mp->b_rptr = mp->b_rptr + off;
   2053 					mp->b_wptr = mp->b_rptr + len;
   2054 
   2055 					/*
   2056 					 * Release the grant reference
   2057 					 * associated with this buffer
   2058 					 * - they are scarce and the
   2059 					 * upper layers of the stack
   2060 					 * don't need it.
   2061 					 */
   2062 					(void) gnttab_end_foreign_access_ref(
   2063 					    bdesc->grant_ref, 0);
   2064 					gref_put(xnfp, bdesc->grant_ref);
   2065 					bdesc->grant_ref = INVALID_GRANT_REF;
   2066 
   2067 					bdesc = nbuf;
   2068 				}
   2069 			}
   2070 
   2071 			if (nbuf == NULL) {
   2072 				/*
   2073 				 * No replacement buffer allocated -
   2074 				 * attempt to copy the data out and
   2075 				 * re-hang the existing buffer.
   2076 				 */
   2077 
   2078 				/* 4. */
   2079 				mp = allocb(len, BPRI_MED);
   2080 				if (mp == NULL) {
   2081 					xnfp->xnf_stat_rx_allocb_fail++;
   2082 					xnfp->xnf_stat_norxbuf++;
   2083 				} else {
   2084 					/* 5. */
   2085 					bcopy(bdesc->buf + off, mp->b_wptr,
   2086 					    len);
   2087 					mp->b_wptr += len;
   2088 				}
   2089 			}
   2090 		}
   2091 
   2092 		/* Re-hang the buffer. */
   2093 		xnf_rxbuf_hang(xnfp, bdesc);
   2094 
   2095 		if (mp != NULL) {
   2096 			if (hwcsum) {
   2097 				/*
   2098 				 * If the peer says that the data has
   2099 				 * been validated then we declare that
   2100 				 * the full checksum has been
   2101 				 * verified.
   2102 				 *
   2103 				 * We don't look at the "checksum
   2104 				 * blank" flag, and hence could have a
   2105 				 * packet here that we are asserting
   2106 				 * is good with a blank checksum.
   2107 				 *
   2108 				 * The hardware checksum offload
   2109 				 * specification says that we must
   2110 				 * provide the actual checksum as well
   2111 				 * as an assertion that it is valid,
   2112 				 * but the protocol stack doesn't
   2113 				 * actually use it and some other
   2114 				 * drivers don't bother, so we don't.
   2115 				 * If it was necessary we could grovel
   2116 				 * in the packet to find it.
   2117 				 */
   2118 				(void) hcksum_assoc(mp, NULL,
   2119 				    NULL, 0, 0, 0, 0,
   2120 				    HCK_FULLCKSUM |
   2121 				    HCK_FULLCKSUM_OK, 0);
   2122 				xnfp->xnf_stat_rx_cksum_no_need++;
   2123 			}
   2124 			if (head == NULL) {
   2125 				ASSERT(tail == NULL);
   2126 
   2127 				head = mp;
   2128 			} else {
   2129 				ASSERT(tail != NULL);
   2130 
   2131 				tail->b_next = mp;
   2132 			}
   2133 			tail = mp;
   2134 
   2135 			ASSERT(mp->b_next == NULL);
   2136 
   2137 			xnfp->xnf_stat_ipackets++;
   2138 			xnfp->xnf_stat_rbytes += len;
   2139 		}
   2140 
   2141 		xnfp->xnf_rx_ring.rsp_cons++;
   2142 	}
   2143 
   2144 	/*
   2145 	 * Store the mblks we have collected.
   2146 	 */
   2147 	if (head != NULL) {
   2148 		ASSERT(tail != NULL);
   2149 
   2150 		if (xnfp->xnf_rx_head == NULL) {
   2151 			ASSERT(xnfp->xnf_rx_tail == NULL);
   2152 
   2153 			xnfp->xnf_rx_head = head;
   2154 		} else {
   2155 			ASSERT(xnfp->xnf_rx_tail != NULL);
   2156 
   2157 			xnfp->xnf_rx_tail->b_next = head;
   2158 		}
   2159 		xnfp->xnf_rx_tail = tail;
   2160 	}
   2161 }
   2162 
   2163 /*
   2164  *  xnf_alloc_dma_resources() -- initialize the drivers structures
   2165  */
   2166 static int
   2167 xnf_alloc_dma_resources(xnf_t *xnfp)
   2168 {
   2169 	dev_info_t 		*devinfo = xnfp->xnf_devinfo;
   2170 	size_t			len;
   2171 	ddi_dma_cookie_t	dma_cookie;
   2172 	uint_t			ncookies;
   2173 	int			rc;
   2174 	caddr_t			rptr;
   2175 
   2176 	/*
   2177 	 * The code below allocates all the DMA data structures that
   2178 	 * need to be released when the driver is detached.
   2179 	 *
   2180 	 * Allocate page for the transmit descriptor ring.
   2181 	 */
   2182 	if (ddi_dma_alloc_handle(devinfo, &ringbuf_dma_attr,
   2183 	    DDI_DMA_SLEEP, 0, &xnfp->xnf_tx_ring_dma_handle) != DDI_SUCCESS)
   2184 		goto alloc_error;
   2185 
   2186 	if (ddi_dma_mem_alloc(xnfp->xnf_tx_ring_dma_handle,
   2187 	    PAGESIZE, &accattr, DDI_DMA_CONSISTENT,
   2188 	    DDI_DMA_SLEEP, 0, &rptr, &len,
   2189 	    &xnfp->xnf_tx_ring_dma_acchandle) != DDI_SUCCESS) {
   2190 		ddi_dma_free_handle(&xnfp->xnf_tx_ring_dma_handle);
   2191 		xnfp->xnf_tx_ring_dma_handle = NULL;
   2192 		goto alloc_error;
   2193 	}
   2194 
   2195 	if ((rc = ddi_dma_addr_bind_handle(xnfp->xnf_tx_ring_dma_handle, NULL,
   2196 	    rptr, PAGESIZE, DDI_DMA_RDWR | DDI_DMA_CONSISTENT,
   2197 	    DDI_DMA_SLEEP, 0, &dma_cookie, &ncookies)) != DDI_DMA_MAPPED) {
   2198 		ddi_dma_mem_free(&xnfp->xnf_tx_ring_dma_acchandle);
   2199 		ddi_dma_free_handle(&xnfp->xnf_tx_ring_dma_handle);
   2200 		xnfp->xnf_tx_ring_dma_handle = NULL;
   2201 		xnfp->xnf_tx_ring_dma_acchandle = NULL;
   2202 		if (rc == DDI_DMA_NORESOURCES)
   2203 			goto alloc_error;
   2204 		else
   2205 			goto error;
   2206 	}
   2207 
   2208 	ASSERT(ncookies == 1);
   2209 	bzero(rptr, PAGESIZE);
   2210 	/* LINTED: constant in conditional context */
   2211 	SHARED_RING_INIT((netif_tx_sring_t *)rptr);
   2212 	/* LINTED: constant in conditional context */
   2213 	FRONT_RING_INIT(&xnfp->xnf_tx_ring, (netif_tx_sring_t *)rptr, PAGESIZE);
   2214 	xnfp->xnf_tx_ring_phys_addr = dma_cookie.dmac_laddress;
   2215 
   2216 	/*
   2217 	 * Allocate page for the receive descriptor ring.
   2218 	 */
   2219 	if (ddi_dma_alloc_handle(devinfo, &ringbuf_dma_attr,
   2220 	    DDI_DMA_SLEEP, 0, &xnfp->xnf_rx_ring_dma_handle) != DDI_SUCCESS)
   2221 		goto alloc_error;
   2222 
   2223 	if (ddi_dma_mem_alloc(xnfp->xnf_rx_ring_dma_handle,
   2224 	    PAGESIZE, &accattr, DDI_DMA_CONSISTENT,
   2225 	    DDI_DMA_SLEEP, 0, &rptr, &len,
   2226 	    &xnfp->xnf_rx_ring_dma_acchandle) != DDI_SUCCESS) {
   2227 		ddi_dma_free_handle(&xnfp->xnf_rx_ring_dma_handle);
   2228 		xnfp->xnf_rx_ring_dma_handle = NULL;
   2229 		goto alloc_error;
   2230 	}
   2231 
   2232 	if ((rc = ddi_dma_addr_bind_handle(xnfp->xnf_rx_ring_dma_handle, NULL,
   2233 	    rptr, PAGESIZE, DDI_DMA_RDWR | DDI_DMA_CONSISTENT,
   2234 	    DDI_DMA_SLEEP, 0, &dma_cookie, &ncookies)) != DDI_DMA_MAPPED) {
   2235 		ddi_dma_mem_free(&xnfp->xnf_rx_ring_dma_acchandle);
   2236 		ddi_dma_free_handle(&xnfp->xnf_rx_ring_dma_handle);
   2237 		xnfp->xnf_rx_ring_dma_handle = NULL;
   2238 		xnfp->xnf_rx_ring_dma_acchandle = NULL;
   2239 		if (rc == DDI_DMA_NORESOURCES)
   2240 			goto alloc_error;
   2241 		else
   2242 			goto error;
   2243 	}
   2244 
   2245 	ASSERT(ncookies == 1);
   2246 	bzero(rptr, PAGESIZE);
   2247 	/* LINTED: constant in conditional context */
   2248 	SHARED_RING_INIT((netif_rx_sring_t *)rptr);
   2249 	/* LINTED: constant in conditional context */
   2250 	FRONT_RING_INIT(&xnfp->xnf_rx_ring, (netif_rx_sring_t *)rptr, PAGESIZE);
   2251 	xnfp->xnf_rx_ring_phys_addr = dma_cookie.dmac_laddress;
   2252 
   2253 	return (DDI_SUCCESS);
   2254 
   2255 alloc_error:
   2256 	cmn_err(CE_WARN, "xnf%d: could not allocate enough DMA memory",
   2257 	    ddi_get_instance(xnfp->xnf_devinfo));
   2258 error:
   2259 	xnf_release_dma_resources(xnfp);
   2260 	return (DDI_FAILURE);
   2261 }
   2262 
   2263 /*
   2264  * Release all DMA resources in the opposite order from acquisition
   2265  */
   2266 static void
   2267 xnf_release_dma_resources(xnf_t *xnfp)
   2268 {
   2269 	int i;
   2270 
   2271 	/*
   2272 	 * Free receive buffers which are currently associated with
   2273 	 * descriptors.
   2274 	 */
   2275 	mutex_enter(&xnfp->xnf_rxlock);
   2276 	for (i = 0; i < NET_RX_RING_SIZE; i++) {
   2277 		xnf_buf_t *bp;
   2278 
   2279 		if ((bp = xnfp->xnf_rx_pkt_info[i]) == NULL)
   2280 			continue;
   2281 		xnfp->xnf_rx_pkt_info[i] = NULL;
   2282 		xnf_buf_put(xnfp, bp, B_FALSE);
   2283 	}
   2284 	mutex_exit(&xnfp->xnf_rxlock);
   2285 
   2286 	/* Free the receive ring buffer. */
   2287 	if (xnfp->xnf_rx_ring_dma_acchandle != NULL) {
   2288 		(void) ddi_dma_unbind_handle(xnfp->xnf_rx_ring_dma_handle);
   2289 		ddi_dma_mem_free(&xnfp->xnf_rx_ring_dma_acchandle);
   2290 		ddi_dma_free_handle(&xnfp->xnf_rx_ring_dma_handle);
   2291 		xnfp->xnf_rx_ring_dma_acchandle = NULL;
   2292 	}
   2293 	/* Free the transmit ring buffer. */
   2294 	if (xnfp->xnf_tx_ring_dma_acchandle != NULL) {
   2295 		(void) ddi_dma_unbind_handle(xnfp->xnf_tx_ring_dma_handle);
   2296 		ddi_dma_mem_free(&xnfp->xnf_tx_ring_dma_acchandle);
   2297 		ddi_dma_free_handle(&xnfp->xnf_tx_ring_dma_handle);
   2298 		xnfp->xnf_tx_ring_dma_acchandle = NULL;
   2299 	}
   2300 
   2301 }
   2302 
   2303 /*
   2304  * Release any packets and associated structures used by the TX ring.
   2305  */
   2306 static void
   2307 xnf_release_mblks(xnf_t *xnfp)
   2308 {
   2309 	RING_IDX i;
   2310 	xnf_txid_t *tidp;
   2311 
   2312 	for (i = 0, tidp = &xnfp->xnf_tx_pkt_id[0];
   2313 	    i < NET_TX_RING_SIZE;
   2314 	    i++, tidp++) {
   2315 		xnf_txbuf_t *txp = tidp->txbuf;
   2316 
   2317 		if (txp != NULL) {
   2318 			ASSERT(txp->tx_mp != NULL);
   2319 			freemsg(txp->tx_mp);
   2320 
   2321 			txid_put(xnfp, tidp);
   2322 			kmem_cache_free(xnfp->xnf_tx_buf_cache, txp);
   2323 		}
   2324 	}
   2325 }
   2326 
   2327 static int
   2328 xnf_buf_constructor(void *buf, void *arg, int kmflag)
   2329 {
   2330 	int (*ddiflags)(caddr_t) = DDI_DMA_SLEEP;
   2331 	xnf_buf_t *bdesc = buf;
   2332 	xnf_t *xnfp = arg;
   2333 	ddi_dma_cookie_t dma_cookie;
   2334 	uint_t ncookies;
   2335 	size_t len;
   2336 
   2337 	if (kmflag & KM_NOSLEEP)
   2338 		ddiflags = DDI_DMA_DONTWAIT;
   2339 
   2340 	/* Allocate a DMA access handle for the buffer. */
   2341 	if (ddi_dma_alloc_handle(xnfp->xnf_devinfo, &buf_dma_attr,
   2342 	    ddiflags, 0, &bdesc->dma_handle) != DDI_SUCCESS)
   2343 		goto failure;
   2344 
   2345 	/* Allocate DMA-able memory for buffer. */
   2346 	if (ddi_dma_mem_alloc(bdesc->dma_handle,
   2347 	    PAGESIZE, &data_accattr, DDI_DMA_STREAMING, ddiflags, 0,
   2348 	    &bdesc->buf, &len, &bdesc->acc_handle) != DDI_SUCCESS)
   2349 		goto failure_1;
   2350 
   2351 	/* Bind to virtual address of buffer to get physical address. */
   2352 	if (ddi_dma_addr_bind_handle(bdesc->dma_handle, NULL,
   2353 	    bdesc->buf, len, DDI_DMA_RDWR | DDI_DMA_STREAMING,
   2354 	    ddiflags, 0, &dma_cookie, &ncookies) != DDI_DMA_MAPPED)
   2355 		goto failure_2;
   2356 	ASSERT(ncookies == 1);
   2357 
   2358 	bdesc->free_rtn.free_func = xnf_buf_recycle;
   2359 	bdesc->free_rtn.free_arg = (caddr_t)bdesc;
   2360 	bdesc->xnfp = xnfp;
   2361 	bdesc->buf_phys = dma_cookie.dmac_laddress;
   2362 	bdesc->buf_mfn = pfn_to_mfn(xnf_btop(bdesc->buf_phys));
   2363 	bdesc->len = dma_cookie.dmac_size;
   2364 	bdesc->grant_ref = INVALID_GRANT_REF;
   2365 	bdesc->gen = xnfp->xnf_gen;
   2366 
   2367 	atomic_add_64(&xnfp->xnf_stat_buf_allocated, 1);
   2368 
   2369 	return (0);
   2370 
   2371 failure_2:
   2372 	ddi_dma_mem_free(&bdesc->acc_handle);
   2373 
   2374 failure_1:
   2375 	ddi_dma_free_handle(&bdesc->dma_handle);
   2376 
   2377 failure:
   2378 
   2379 	ASSERT(kmflag & KM_NOSLEEP); /* Cannot fail for KM_SLEEP. */
   2380 	return (-1);
   2381 }
   2382 
   2383 static void
   2384 xnf_buf_destructor(void *buf, void *arg)
   2385 {
   2386 	xnf_buf_t *bdesc = buf;
   2387 	xnf_t *xnfp = arg;
   2388 
   2389 	(void) ddi_dma_unbind_handle(bdesc->dma_handle);
   2390 	ddi_dma_mem_free(&bdesc->acc_handle);
   2391 	ddi_dma_free_handle(&bdesc->dma_handle);
   2392 
   2393 	atomic_add_64(&xnfp->xnf_stat_buf_allocated, -1);
   2394 }
   2395 
   2396 static xnf_buf_t *
   2397 xnf_buf_get(xnf_t *xnfp, int flags, boolean_t readonly)
   2398 {
   2399 	grant_ref_t gref;
   2400 	xnf_buf_t *bufp;
   2401 
   2402 	/*
   2403 	 * Usually grant references are more scarce than memory, so we
   2404 	 * attempt to acquire a grant reference first.
   2405 	 */
   2406 	gref = gref_get(xnfp);
   2407 	if (gref == INVALID_GRANT_REF)
   2408 		return (NULL);
   2409 
   2410 	bufp = kmem_cache_alloc(xnfp->xnf_buf_cache, flags);
   2411 	if (bufp == NULL) {
   2412 		gref_put(xnfp, gref);
   2413 		return (NULL);
   2414 	}
   2415 
   2416 	ASSERT(bufp->grant_ref == INVALID_GRANT_REF);
   2417 
   2418 	bufp->grant_ref = gref;
   2419 
   2420 	if (bufp->gen != xnfp->xnf_gen)
   2421 		xnf_buf_refresh(bufp);
   2422 
   2423 	gnttab_grant_foreign_access_ref(bufp->grant_ref,
   2424 	    xvdi_get_oeid(bufp->xnfp->xnf_devinfo),
   2425 	    bufp->buf_mfn, readonly ? 1 : 0);
   2426 
   2427 	atomic_add_64(&xnfp->xnf_stat_buf_outstanding, 1);
   2428 
   2429 	return (bufp);
   2430 }
   2431 
   2432 static void
   2433 xnf_buf_put(xnf_t *xnfp, xnf_buf_t *bufp, boolean_t readonly)
   2434 {
   2435 	if (bufp->grant_ref != INVALID_GRANT_REF) {
   2436 		(void) gnttab_end_foreign_access_ref(
   2437 		    bufp->grant_ref, readonly ? 1 : 0);
   2438 		gref_put(xnfp, bufp->grant_ref);
   2439 		bufp->grant_ref = INVALID_GRANT_REF;
   2440 	}
   2441 
   2442 	kmem_cache_free(xnfp->xnf_buf_cache, bufp);
   2443 
   2444 	atomic_add_64(&xnfp->xnf_stat_buf_outstanding, -1);
   2445 }
   2446 
   2447 /*
   2448  * Refresh any cached data about a buffer after resume.
   2449  */
   2450 static void
   2451 xnf_buf_refresh(xnf_buf_t *bdesc)
   2452 {
   2453 	bdesc->buf_mfn = pfn_to_mfn(xnf_btop(bdesc->buf_phys));
   2454 	bdesc->gen = bdesc->xnfp->xnf_gen;
   2455 }
   2456 
   2457 /*
   2458  * Streams `freeb' routine for `xnf_buf_t' when used as transmit
   2459  * look-aside buffers.
   2460  */
   2461 static void
   2462 xnf_buf_recycle(xnf_buf_t *bdesc)
   2463 {
   2464 	xnf_t *xnfp = bdesc->xnfp;
   2465 
   2466 	xnf_buf_put(xnfp, bdesc, B_TRUE);
   2467 }
   2468 
   2469 static int
   2470 xnf_tx_buf_constructor(void *buf, void *arg, int kmflag)
   2471 {
   2472 	int (*ddiflags)(caddr_t) = DDI_DMA_SLEEP;
   2473 	xnf_txbuf_t *txp = buf;
   2474 	xnf_t *xnfp = arg;
   2475 
   2476 	if (kmflag & KM_NOSLEEP)
   2477 		ddiflags = DDI_DMA_DONTWAIT;
   2478 
   2479 	if (ddi_dma_alloc_handle(xnfp->xnf_devinfo, &buf_dma_attr,
   2480 	    ddiflags, 0, &txp->tx_dma_handle) != DDI_SUCCESS) {
   2481 		ASSERT(kmflag & KM_NOSLEEP); /* Cannot fail for KM_SLEEP. */
   2482 		return (-1);
   2483 	}
   2484 
   2485 	return (0);
   2486 }
   2487 
   2488 static void
   2489 xnf_tx_buf_destructor(void *buf, void *arg)
   2490 {
   2491 	_NOTE(ARGUNUSED(arg));
   2492 	xnf_txbuf_t *txp = buf;
   2493 
   2494 	ddi_dma_free_handle(&txp->tx_dma_handle);
   2495 }
   2496 
/*
 * Statistics.
 *
 * Names of the driver's auxiliary named kstats.  xnf_kstat_init()
 * creates one kstat_named_t per entry, in this order, and
 * xnf_kstat_aux_update() fills in the values positionally, so the
 * order here must match the order of assignments in that function.
 */
static char *xnf_aux_statistics[] = {
	"tx_cksum_deferred",
	"rx_cksum_no_need",
	"interrupts",
	"unclaimed_interrupts",
	"tx_pullup",
	"tx_pagebndry",
	"tx_attempt",
	"buf_allocated",
	"buf_outstanding",
	"gref_outstanding",
	"gref_failure",
	"gref_peak",
	"rx_allocb_fail",
	"rx_desballoc_fail",
};
   2516 
   2517 static int
   2518 xnf_kstat_aux_update(kstat_t *ksp, int flag)
   2519 {
   2520 	xnf_t *xnfp;
   2521 	kstat_named_t *knp;
   2522 
   2523 	if (flag != KSTAT_READ)
   2524 		return (EACCES);
   2525 
   2526 	xnfp = ksp->ks_private;
   2527 	knp = ksp->ks_data;
   2528 
   2529 	/*
   2530 	 * Assignment order must match that of the names in
   2531 	 * xnf_aux_statistics.
   2532 	 */
   2533 	(knp++)->value.ui64 = xnfp->xnf_stat_tx_cksum_deferred;
   2534 	(knp++)->value.ui64 = xnfp->xnf_stat_rx_cksum_no_need;
   2535 
   2536 	(knp++)->value.ui64 = xnfp->xnf_stat_interrupts;
   2537 	(knp++)->value.ui64 = xnfp->xnf_stat_unclaimed_interrupts;
   2538 	(knp++)->value.ui64 = xnfp->xnf_stat_tx_pullup;
   2539 	(knp++)->value.ui64 = xnfp->xnf_stat_tx_pagebndry;
   2540 	(knp++)->value.ui64 = xnfp->xnf_stat_tx_attempt;
   2541 
   2542 	(knp++)->value.ui64 = xnfp->xnf_stat_buf_allocated;
   2543 	(knp++)->value.ui64 = xnfp->xnf_stat_buf_outstanding;
   2544 	(knp++)->value.ui64 = xnfp->xnf_stat_gref_outstanding;
   2545 	(knp++)->value.ui64 = xnfp->xnf_stat_gref_failure;
   2546 	(knp++)->value.ui64 = xnfp->xnf_stat_gref_peak;
   2547 	(knp++)->value.ui64 = xnfp->xnf_stat_rx_allocb_fail;
   2548 	(knp++)->value.ui64 = xnfp->xnf_stat_rx_desballoc_fail;
   2549 
   2550 	return (0);
   2551 }
   2552 
   2553 static boolean_t
   2554 xnf_kstat_init(xnf_t *xnfp)
   2555 {
   2556 	int nstat = sizeof (xnf_aux_statistics) /
   2557 	    sizeof (xnf_aux_statistics[0]);
   2558 	char **cp = xnf_aux_statistics;
   2559 	kstat_named_t *knp;
   2560 
   2561 	/*
   2562 	 * Create and initialise kstats.
   2563 	 */
   2564 	if ((xnfp->xnf_kstat_aux = kstat_create("xnf",
   2565 	    ddi_get_instance(xnfp->xnf_devinfo),
   2566 	    "aux_statistics", "net", KSTAT_TYPE_NAMED,
   2567 	    nstat, 0)) == NULL)
   2568 		return (B_FALSE);
   2569 
   2570 	xnfp->xnf_kstat_aux->ks_private = xnfp;
   2571 	xnfp->xnf_kstat_aux->ks_update = xnf_kstat_aux_update;
   2572 
   2573 	knp = xnfp->xnf_kstat_aux->ks_data;
   2574 	while (nstat > 0) {
   2575 		kstat_named_init(knp, *cp, KSTAT_DATA_UINT64);
   2576 
   2577 		knp++;
   2578 		cp++;
   2579 		nstat--;
   2580 	}
   2581 
   2582 	kstat_install(xnfp->xnf_kstat_aux);
   2583 
   2584 	return (B_TRUE);
   2585 }
   2586 
   2587 static int
   2588 xnf_stat(void *arg, uint_t stat, uint64_t *val)
   2589 {
   2590 	xnf_t *xnfp = arg;
   2591 
   2592 	mutex_enter(&xnfp->xnf_rxlock);
   2593 	mutex_enter(&xnfp->xnf_txlock);
   2594 
   2595 #define	mac_stat(q, r)				\
   2596 	case (MAC_STAT_##q):			\
   2597 		*val = xnfp->xnf_stat_##r;	\
   2598 		break
   2599 
   2600 #define	ether_stat(q, r)			\
   2601 	case (ETHER_STAT_##q):			\
   2602 		*val = xnfp->xnf_stat_##r;	\
   2603 		break
   2604 
   2605 	switch (stat) {
   2606 
   2607 	mac_stat(IPACKETS, ipackets);
   2608 	mac_stat(OPACKETS, opackets);
   2609 	mac_stat(RBYTES, rbytes);
   2610 	mac_stat(OBYTES, obytes);
   2611 	mac_stat(NORCVBUF, norxbuf);
   2612 	mac_stat(IERRORS, errrx);
   2613 	mac_stat(NOXMTBUF, tx_defer);
   2614 
   2615 	ether_stat(MACRCV_ERRORS, mac_rcv_error);
   2616 	ether_stat(TOOSHORT_ERRORS, runt);
   2617 
   2618 	/* always claim to be in full duplex mode */
   2619 	case ETHER_STAT_LINK_DUPLEX:
   2620 		*val = LINK_DUPLEX_FULL;
   2621 		break;
   2622 
   2623 	/* always claim to be at 1Gb/s link speed */
   2624 	case MAC_STAT_IFSPEED:
   2625 		*val = 1000000000ull;
   2626 		break;
   2627 
   2628 	default:
   2629 		mutex_exit(&xnfp->xnf_txlock);
   2630 		mutex_exit(&xnfp->xnf_rxlock);
   2631 
   2632 		return (ENOTSUP);
   2633 	}
   2634 
   2635 #undef mac_stat
   2636 #undef ether_stat
   2637 
   2638 	mutex_exit(&xnfp->xnf_txlock);
   2639 	mutex_exit(&xnfp->xnf_rxlock);
   2640 
   2641 	return (0);
   2642 }
   2643 
   2644 static boolean_t
   2645 xnf_getcapab(void *arg, mac_capab_t cap, void *cap_data)
   2646 {
   2647 	_NOTE(ARGUNUSED(arg));
   2648 
   2649 	switch (cap) {
   2650 	case MAC_CAPAB_HCKSUM: {
   2651 		uint32_t *capab = cap_data;
   2652 
   2653 		/*
   2654 		 * Whilst the flag used to communicate with the IO
   2655 		 * domain is called "NETTXF_csum_blank", the checksum
   2656 		 * in the packet must contain the pseudo-header
   2657 		 * checksum and not zero.
   2658 		 *
   2659 		 * To help out the IO domain, we might use
   2660 		 * HCKSUM_INET_PARTIAL. Unfortunately our stack will
   2661 		 * then use checksum offload for IPv6 packets, which
   2662 		 * the IO domain can't handle.
   2663 		 *
   2664 		 * As a result, we declare outselves capable of
   2665 		 * HCKSUM_INET_FULL_V4. This means that we receive
   2666 		 * IPv4 packets from the stack with a blank checksum
   2667 		 * field and must insert the pseudo-header checksum
   2668 		 * before passing the packet to the IO domain.
   2669 		 */
   2670 		*capab = HCKSUM_INET_FULL_V4;
   2671 		break;
   2672 	}
   2673 	default:
   2674 		return (B_FALSE);
   2675 	}
   2676 
   2677 	return (B_TRUE);
   2678 }
   2679 
   2680 /*
   2681  * The state of the peer has changed - react accordingly.
   2682  */
   2683 static void
   2684 oe_state_change(dev_info_t *dip, ddi_eventcookie_t id,
   2685     void *arg, void *impl_data)
   2686 {
   2687 	_NOTE(ARGUNUSED(id, arg));
   2688 	xnf_t *xnfp = ddi_get_driver_private(dip);
   2689 	XenbusState new_state = *(XenbusState *)impl_data;
   2690 
   2691 	ASSERT(xnfp != NULL);
   2692 
   2693 	switch (new_state) {
   2694 	case XenbusStateUnknown:
   2695 	case XenbusStateInitialising:
   2696 	case XenbusStateInitialised:
   2697 	case XenbusStateClosing:
   2698 	case XenbusStateClosed:
   2699 	case XenbusStateReconfiguring:
   2700 	case XenbusStateReconfigured:
   2701 		break;
   2702 
   2703 	case XenbusStateInitWait:
   2704 		xnf_read_config(xnfp);
   2705 
   2706 		if (!xnfp->xnf_be_rx_copy) {
   2707 			cmn_err(CE_WARN,
   2708 			    "The xnf driver requires a dom0 that "
   2709 			    "supports 'feature-rx-copy'.");
   2710 			(void) xvdi_switch_state(xnfp->xnf_devinfo,
   2711 			    XBT_NULL, XenbusStateClosed);
   2712 			break;
   2713 		}
   2714 
   2715 		/*
   2716 		 * Connect to the backend.
   2717 		 */
   2718 		xnf_be_connect(xnfp);
   2719 
   2720 		/*
   2721 		 * Our MAC address as discovered by xnf_read_config().
   2722 		 */
   2723 		mac_unicst_update(xnfp->xnf_mh, xnfp->xnf_mac_addr);
   2724 
   2725 		break;
   2726 
   2727 	case XenbusStateConnected:
   2728 		mutex_enter(&xnfp->xnf_rxlock);
   2729 		mutex_enter(&xnfp->xnf_txlock);
   2730 
   2731 		xnfp->xnf_connected = B_TRUE;
   2732 		/*
   2733 		 * Wake up any threads waiting to send data to
   2734 		 * backend.
   2735 		 */
   2736 		cv_broadcast(&xnfp->xnf_cv_state);
   2737 
   2738 		mutex_exit(&xnfp->xnf_txlock);
   2739 		mutex_exit(&xnfp->xnf_rxlock);
   2740 
   2741 		/*
   2742 		 * Kick the peer in case it missed any transmits
   2743 		 * request in the TX ring.
   2744 		 */
   2745 		ec_notify_via_evtchn(xnfp->xnf_evtchn);
   2746 
   2747 		/*
   2748 		 * There may already be completed receive requests in
   2749 		 * the ring sent by backend after it gets connected
   2750 		 * but before we see its state change here, so we call
   2751 		 * xnf_intr() to handle them, if any.
   2752 		 */
   2753 		(void) xnf_intr((caddr_t)xnfp);
   2754 
   2755 		/*
   2756 		 * Mark the link up now that we are connected.
   2757 		 */
   2758 		mac_link_update(xnfp->xnf_mh, LINK_STATE_UP);
   2759 
   2760 		/*
   2761 		 * Tell the backend about the multicast addresses in
   2762 		 * which we are interested.
   2763 		 */
   2764 		mac_multicast_refresh(xnfp->xnf_mh, NULL, xnfp, B_TRUE);
   2765 
   2766 		break;
   2767 
   2768 	default:
   2769 		break;
   2770 	}
   2771 }
   2772