Home | History | Annotate | Download | only in ixgbe
      1 /*
      2  * CDDL HEADER START
      3  *
      4  * Copyright(c) 2007-2009 Intel Corporation. All rights reserved.
      5  * The contents of this file are subject to the terms of the
      6  * Common Development and Distribution License (the "License").
      7  * You may not use this file except in compliance with the License.
      8  *
      9  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
     10  * or http://www.opensolaris.org/os/licensing.
     11  * See the License for the specific language governing permissions
     12  * and limitations under the License.
     13  *
     14  * When distributing Covered Code, include this CDDL HEADER in each
     15  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
     16  * If applicable, add the following below this CDDL HEADER, with the
     17  * fields enclosed by brackets "[]" replaced with your own identifying
     18  * information: Portions Copyright [yyyy] [name of copyright owner]
     19  *
     20  * CDDL HEADER END
     21  */
     22 
     23 /*
     24  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
     25  * Use is subject to license terms.
     26  */
     27 
     28 #include "ixgbe_sw.h"
     29 
     30 static int ixgbe_tx_copy(ixgbe_tx_ring_t *, tx_control_block_t *, mblk_t *,
     31     uint32_t, boolean_t);
     32 static int ixgbe_tx_bind(ixgbe_tx_ring_t *, tx_control_block_t *, mblk_t *,
     33     uint32_t);
     34 static int ixgbe_tx_fill_ring(ixgbe_tx_ring_t *, link_list_t *,
     35     ixgbe_tx_context_t *, size_t);
     36 static void ixgbe_save_desc(tx_control_block_t *, uint64_t, size_t);
     37 static tx_control_block_t *ixgbe_get_free_list(ixgbe_tx_ring_t *);
     38 
     39 static int ixgbe_get_context(mblk_t *, ixgbe_tx_context_t *);
     40 static boolean_t ixgbe_check_context(ixgbe_tx_ring_t *,
     41     ixgbe_tx_context_t *);
     42 static void ixgbe_fill_context(struct ixgbe_adv_tx_context_desc *,
     43     ixgbe_tx_context_t *);
     44 
     45 #ifndef IXGBE_DEBUG
     46 #pragma inline(ixgbe_save_desc)
     47 #pragma inline(ixgbe_get_context)
     48 #pragma inline(ixgbe_check_context)
     49 #pragma inline(ixgbe_fill_context)
     50 #endif
     51 
     52 /*
     53  * ixgbe_ring_tx
     54  *
     55  * To transmit one mblk through one specified ring.
     56  *
     57  * One mblk can consist of several fragments, each fragment
     58  * will be processed with different methods based on the size.
     59  * For the fragments with size less than the bcopy threshold,
     60  * they will be processed by using bcopy; otherwise, they will
     61  * be processed by using DMA binding.
     62  *
     63  * To process the mblk, a tx control block is got from the
     64  * free list. One tx control block contains one tx buffer, which
     65  * is used to copy mblk fragments' data; and one tx DMA handle,
     66  * which is used to bind a mblk fragment with DMA resource.
     67  *
     68  * Several small mblk fragments can be copied into one tx control
     69  * block's buffer, and then the buffer will be transmitted with
     70  * one tx descriptor.
     71  *
     72  * A large fragment only binds with one tx control block's DMA
     73  * handle, and it can span several tx descriptors for transmitting.
     74  *
     75  * So to transmit a packet (mblk), several tx control blocks can
     76  * be used. After the processing, those tx control blocks will
     77  * be put to the work list.
     78  */
     79 mblk_t *
     80 ixgbe_ring_tx(void *arg, mblk_t *mp)
     81 {
     82 	ixgbe_tx_ring_t *tx_ring = (ixgbe_tx_ring_t *)arg;
     83 	ixgbe_t *ixgbe = tx_ring->ixgbe;
     84 	tx_type_t current_flag, next_flag;
     85 	uint32_t current_len, next_len;
     86 	uint32_t desc_total;
     87 	size_t mbsize;
     88 	int desc_num;
     89 	boolean_t copy_done, eop;
     90 	mblk_t *current_mp, *next_mp, *nmp, *pull_mp = NULL;
     91 	tx_control_block_t *tcb;
     92 	ixgbe_tx_context_t tx_context, *ctx;
     93 	link_list_t pending_list;
     94 	uint32_t len, hdr_frag_len, hdr_len;
     95 	uint32_t copy_thresh;
     96 	mblk_t *hdr_new_mp = NULL;
     97 	mblk_t *hdr_pre_mp = NULL;
     98 	mblk_t *hdr_nmp = NULL;
     99 
    100 	ASSERT(mp->b_next == NULL);
    101 
    102 	copy_thresh = ixgbe->tx_copy_thresh;
    103 
    104 	/* Get the mblk size */
    105 	mbsize = 0;
    106 	for (nmp = mp; nmp != NULL; nmp = nmp->b_cont) {
    107 		mbsize += MBLKL(nmp);
    108 	}
    109 
    110 	if (ixgbe->tx_hcksum_enable) {
    111 		/*
    112 		 * Retrieve checksum context information from the mblk
    113 		 * that will be used to decide whether/how to fill the
    114 		 * context descriptor.
    115 		 */
    116 		ctx = &tx_context;
    117 		if (ixgbe_get_context(mp, ctx) < 0) {
    118 			freemsg(mp);
    119 			return (NULL);
    120 		}
    121 
    122 		/*
    123 		 * If the mblk size exceeds the max size ixgbe could
    124 		 * process, then discard this mblk, and return NULL.
    125 		 */
    126 		if ((ctx->lso_flag &&
    127 		    ((mbsize - ctx->mac_hdr_len) > IXGBE_LSO_MAXLEN)) ||
    128 		    (!ctx->lso_flag &&
    129 		    (mbsize > (ixgbe->max_frame_size - ETHERFCSL)))) {
    130 			freemsg(mp);
    131 			IXGBE_DEBUGLOG_0(ixgbe, "ixgbe_tx: packet oversize");
    132 			return (NULL);
    133 		}
    134 	} else {
    135 		ctx = NULL;
    136 	}
    137 
    138 	/*
    139 	 * Check and recycle tx descriptors.
    140 	 * The recycle threshold here should be selected carefully
    141 	 */
    142 	if (tx_ring->tbd_free < ixgbe->tx_recycle_thresh) {
    143 		tx_ring->tx_recycle(tx_ring);
    144 	}
    145 
    146 	/*
    147 	 * After the recycling, if the tbd_free is less than the
    148 	 * overload_threshold, assert overload, return mp;
    149 	 * and we need to re-schedule the tx again.
    150 	 */
    151 	if (tx_ring->tbd_free < ixgbe->tx_overload_thresh) {
    152 		tx_ring->reschedule = B_TRUE;
    153 		IXGBE_DEBUG_STAT(tx_ring->stat_overload);
    154 		return (mp);
    155 	}
    156 
    157 	/*
    158 	 * The pending_list is a linked list that is used to save
    159 	 * the tx control blocks that have packet data processed
    160 	 * but have not put the data to the tx descriptor ring.
    161 	 * It is used to reduce the lock contention of the tx_lock.
    162 	 */
    163 	LINK_LIST_INIT(&pending_list);
    164 	desc_num = 0;
    165 	desc_total = 0;
    166 
    167 	/*
    168 	 * The software should guarantee LSO packet header(MAC+IP+TCP)
    169 	 * to be within one descriptor. Here we reallocate and refill the
    170 	 * the header if it's physical memory non-contiguous.
    171 	 */
    172 	if ((ctx != NULL) && ctx->lso_flag) {
    173 		/* find the last fragment of the header */
    174 		len = MBLKL(mp);
    175 		ASSERT(len > 0);
    176 		hdr_nmp = mp;
    177 		hdr_len = ctx->ip_hdr_len + ctx->mac_hdr_len + ctx->l4_hdr_len;
    178 		while (len < hdr_len) {
    179 			hdr_pre_mp = hdr_nmp;
    180 			hdr_nmp = hdr_nmp->b_cont;
    181 			len += MBLKL(hdr_nmp);
    182 		}
    183 		/*
    184 		 * If the header and the payload are in different mblks,
    185 		 * we simply force the header to be copied into pre-allocated
    186 		 * page-aligned buffer.
    187 		 */
    188 		if (len == hdr_len)
    189 			goto adjust_threshold;
    190 
    191 		hdr_frag_len = hdr_len - (len - MBLKL(hdr_nmp));
    192 		/*
    193 		 * There are two cases we need to reallocate a mblk for the
    194 		 * last header fragment:
    195 		 * 1. the header is in multiple mblks and the last fragment
    196 		 * share the same mblk with the payload
    197 		 * 2. the header is in a single mblk shared with the payload
    198 		 * and the header is physical memory non-contiguous
    199 		 */
    200 		if ((hdr_nmp != mp) ||
    201 		    (P2NPHASE((uintptr_t)hdr_nmp->b_rptr, ixgbe->sys_page_size)
    202 		    < hdr_len)) {
    203 			IXGBE_DEBUG_STAT(tx_ring->stat_lso_header_fail);
    204 			/*
    205 			 * reallocate the mblk for the last header fragment,
    206 			 * expect to bcopy into pre-allocated page-aligned
    207 			 * buffer
    208 			 */
    209 			hdr_new_mp = allocb(hdr_frag_len, NULL);
    210 			if (!hdr_new_mp)
    211 				return (mp);
    212 			bcopy(hdr_nmp->b_rptr, hdr_new_mp->b_rptr,
    213 			    hdr_frag_len);
    214 			/* link the new header fragment with the other parts */
    215 			hdr_new_mp->b_wptr = hdr_new_mp->b_rptr + hdr_frag_len;
    216 			hdr_new_mp->b_cont = hdr_nmp;
    217 			if (hdr_pre_mp)
    218 				hdr_pre_mp->b_cont = hdr_new_mp;
    219 			else
    220 				mp = hdr_new_mp;
    221 			hdr_nmp->b_rptr += hdr_frag_len;
    222 		}
    223 adjust_threshold:
    224 		/*
    225 		 * adjust the bcopy threshhold to guarantee
    226 		 * the header to use bcopy way
    227 		 */
    228 		if (copy_thresh < hdr_len)
    229 			copy_thresh = hdr_len;
    230 	}
    231 
    232 	current_mp = mp;
    233 	current_len = MBLKL(current_mp);
    234 	/*
    235 	 * Decide which method to use for the first fragment
    236 	 */
    237 	current_flag = (current_len <= copy_thresh) ?
    238 	    USE_COPY : USE_DMA;
    239 	/*
    240 	 * If the mblk includes several contiguous small fragments,
    241 	 * they may be copied into one buffer. This flag is used to
    242 	 * indicate whether there are pending fragments that need to
    243 	 * be copied to the current tx buffer.
    244 	 *
    245 	 * If this flag is B_TRUE, it indicates that a new tx control
    246 	 * block is needed to process the next fragment using either
    247 	 * copy or DMA binding.
    248 	 *
    249 	 * Otherwise, it indicates that the next fragment will be
    250 	 * copied to the current tx buffer that is maintained by the
    251 	 * current tx control block. No new tx control block is needed.
    252 	 */
    253 	copy_done = B_TRUE;
    254 	while (current_mp) {
    255 		next_mp = current_mp->b_cont;
    256 		eop = (next_mp == NULL); /* Last fragment of the packet? */
    257 		next_len = eop ? 0: MBLKL(next_mp);
    258 
    259 		/*
    260 		 * When the current fragment is an empty fragment, if
    261 		 * the next fragment will still be copied to the current
    262 		 * tx buffer, we cannot skip this fragment here. Because
    263 		 * the copy processing is pending for completion. We have
    264 		 * to process this empty fragment in the tx_copy routine.
    265 		 *
    266 		 * If the copy processing is completed or a DMA binding
    267 		 * processing is just completed, we can just skip this
    268 		 * empty fragment.
    269 		 */
    270 		if ((current_len == 0) && (copy_done)) {
    271 			current_mp = next_mp;
    272 			current_len = next_len;
    273 			current_flag = (current_len <= copy_thresh) ?
    274 			    USE_COPY : USE_DMA;
    275 			continue;
    276 		}
    277 
    278 		if (copy_done) {
    279 			/*
    280 			 * Get a new tx control block from the free list
    281 			 */
    282 			tcb = ixgbe_get_free_list(tx_ring);
    283 
    284 			if (tcb == NULL) {
    285 				IXGBE_DEBUG_STAT(tx_ring->stat_fail_no_tcb);
    286 				goto tx_failure;
    287 			}
    288 
    289 			/*
    290 			 * Push the tx control block to the pending list
    291 			 * to avoid using lock too early
    292 			 */
    293 			LIST_PUSH_TAIL(&pending_list, &tcb->link);
    294 		}
    295 
    296 		if (current_flag == USE_COPY) {
    297 			/*
    298 			 * Check whether to use bcopy or DMA binding to process
    299 			 * the next fragment, and if using bcopy, whether we
    300 			 * need to continue copying the next fragment into the
    301 			 * current tx buffer.
    302 			 */
    303 			ASSERT((tcb->tx_buf.len + current_len) <=
    304 			    tcb->tx_buf.size);
    305 
    306 			if (eop) {
    307 				/*
    308 				 * This is the last fragment of the packet, so
    309 				 * the copy processing will be completed with
    310 				 * this fragment.
    311 				 */
    312 				next_flag = USE_NONE;
    313 				copy_done = B_TRUE;
    314 			} else if ((tcb->tx_buf.len + current_len + next_len) >
    315 			    tcb->tx_buf.size) {
    316 				/*
    317 				 * If the next fragment is too large to be
    318 				 * copied to the current tx buffer, we need
    319 				 * to complete the current copy processing.
    320 				 */
    321 				next_flag = (next_len > copy_thresh) ?
    322 				    USE_DMA: USE_COPY;
    323 				copy_done = B_TRUE;
    324 			} else if (next_len > copy_thresh) {
    325 				/*
    326 				 * The next fragment needs to be processed with
    327 				 * DMA binding. So the copy prcessing will be
    328 				 * completed with the current fragment.
    329 				 */
    330 				next_flag = USE_DMA;
    331 				copy_done = B_TRUE;
    332 			} else {
    333 				/*
    334 				 * Continue to copy the next fragment to the
    335 				 * current tx buffer.
    336 				 */
    337 				next_flag = USE_COPY;
    338 				copy_done = B_FALSE;
    339 			}
    340 
    341 			desc_num = ixgbe_tx_copy(tx_ring, tcb, current_mp,
    342 			    current_len, copy_done);
    343 		} else {
    344 			/*
    345 			 * Check whether to use bcopy or DMA binding to process
    346 			 * the next fragment.
    347 			 */
    348 			next_flag = (next_len > copy_thresh) ?
    349 			    USE_DMA: USE_COPY;
    350 			ASSERT(copy_done == B_TRUE);
    351 
    352 			desc_num = ixgbe_tx_bind(tx_ring, tcb, current_mp,
    353 			    current_len);
    354 		}
    355 
    356 		if (desc_num > 0)
    357 			desc_total += desc_num;
    358 		else if (desc_num < 0)
    359 			goto tx_failure;
    360 
    361 		current_mp = next_mp;
    362 		current_len = next_len;
    363 		current_flag = next_flag;
    364 	}
    365 
    366 	/*
    367 	 * Attach the mblk to the last tx control block
    368 	 */
    369 	ASSERT(tcb);
    370 	ASSERT(tcb->mp == NULL);
    371 	tcb->mp = mp;
    372 
    373 	/*
    374 	 * 82598/82599 chipset has a limitation that no more than 32 tx
    375 	 * descriptors can be transmited out at one time.
    376 	 *
    377 	 * Here is a workaround for it: pull up the mblk then send it
    378 	 * out with bind way. By doing so, no more than MAX_COOKIE (18)
    379 	 * descriptors is needed.
    380 	 */
    381 	if (desc_total + 1 > IXGBE_TX_DESC_LIMIT) {
    382 		IXGBE_DEBUG_STAT(tx_ring->stat_break_tbd_limit);
    383 
    384 		/*
    385 		 * Discard the mblk and free the used resources
    386 		 */
    387 		tcb = (tx_control_block_t *)LIST_GET_HEAD(&pending_list);
    388 		while (tcb) {
    389 			tcb->mp = NULL;
    390 			ixgbe_free_tcb(tcb);
    391 			tcb = (tx_control_block_t *)
    392 			    LIST_GET_NEXT(&pending_list, &tcb->link);
    393 		}
    394 
    395 		/*
    396 		 * Return the tx control blocks in the pending list to
    397 		 * the free list.
    398 		 */
    399 		ixgbe_put_free_list(tx_ring, &pending_list);
    400 
    401 		/*
    402 		 * pull up the mblk and send it out with bind way
    403 		 */
    404 		if ((pull_mp = msgpullup(mp, -1)) == NULL) {
    405 			tx_ring->reschedule = B_TRUE;
    406 
    407 			/*
    408 			 * If new mblk has been allocted for the last header
    409 			 * fragment of a LSO packet, we should restore the
    410 			 * modified mp.
    411 			 */
    412 			if (hdr_new_mp) {
    413 				hdr_new_mp->b_cont = NULL;
    414 				freeb(hdr_new_mp);
    415 				hdr_nmp->b_rptr -= hdr_frag_len;
    416 				if (hdr_pre_mp)
    417 					hdr_pre_mp->b_cont = hdr_nmp;
    418 				else
    419 					mp = hdr_nmp;
    420 			}
    421 			return (mp);
    422 		}
    423 
    424 		LINK_LIST_INIT(&pending_list);
    425 		desc_total = 0;
    426 
    427 		/*
    428 		 * if the packet is a LSO packet, we simply
    429 		 * transmit the header in one descriptor using the copy way
    430 		 */
    431 		if ((ctx != NULL) && ctx->lso_flag) {
    432 			hdr_len = ctx->ip_hdr_len + ctx->mac_hdr_len +
    433 			    ctx->l4_hdr_len;
    434 
    435 			tcb = ixgbe_get_free_list(tx_ring);
    436 			if (tcb == NULL) {
    437 				IXGBE_DEBUG_STAT(tx_ring->stat_fail_no_tcb);
    438 				goto tx_failure;
    439 			}
    440 			desc_num = ixgbe_tx_copy(tx_ring, tcb, pull_mp,
    441 			    hdr_len, B_TRUE);
    442 			LIST_PUSH_TAIL(&pending_list, &tcb->link);
    443 			desc_total  += desc_num;
    444 
    445 			pull_mp->b_rptr += hdr_len;
    446 		}
    447 
    448 		tcb = ixgbe_get_free_list(tx_ring);
    449 		if (tcb == NULL) {
    450 			IXGBE_DEBUG_STAT(tx_ring->stat_fail_no_tcb);
    451 			goto tx_failure;
    452 		}
    453 		if ((ctx != NULL) && ctx->lso_flag) {
    454 			desc_num = ixgbe_tx_bind(tx_ring, tcb, pull_mp,
    455 			    mbsize - hdr_len);
    456 		} else {
    457 			desc_num = ixgbe_tx_bind(tx_ring, tcb, pull_mp,
    458 			    mbsize);
    459 		}
    460 		if (desc_num < 0) {
    461 			goto tx_failure;
    462 		}
    463 		LIST_PUSH_TAIL(&pending_list, &tcb->link);
    464 
    465 		desc_total += desc_num;
    466 		tcb->mp = pull_mp;
    467 	}
    468 
    469 	/*
    470 	 * Before fill the tx descriptor ring with the data, we need to
    471 	 * ensure there are adequate free descriptors for transmit
    472 	 * (including one context descriptor).
    473 	 */
    474 	if (tx_ring->tbd_free < (desc_total + 1)) {
    475 		tx_ring->tx_recycle(tx_ring);
    476 	}
    477 
    478 	mutex_enter(&tx_ring->tx_lock);
    479 	/*
    480 	 * If the number of free tx descriptors is not enough for transmit
    481 	 * then return mp.
    482 	 *
    483 	 * Note: we must put this check under the mutex protection to
    484 	 * ensure the correctness when multiple threads access it in
    485 	 * parallel.
    486 	 */
    487 	if (tx_ring->tbd_free < (desc_total + 1)) {
    488 		IXGBE_DEBUG_STAT(tx_ring->stat_fail_no_tbd);
    489 		mutex_exit(&tx_ring->tx_lock);
    490 		goto tx_failure;
    491 	}
    492 
    493 	desc_num = ixgbe_tx_fill_ring(tx_ring, &pending_list, ctx,
    494 	    mbsize);
    495 
    496 	ASSERT((desc_num == desc_total) || (desc_num == (desc_total + 1)));
    497 
    498 	mutex_exit(&tx_ring->tx_lock);
    499 
    500 	/*
    501 	 * now that the transmission succeeds, need to free the original
    502 	 * mp if we used the pulling up mblk for transmission.
    503 	 */
    504 	if (pull_mp) {
    505 		freemsg(mp);
    506 	}
    507 
    508 	return (NULL);
    509 
    510 tx_failure:
    511 	/*
    512 	 * If transmission fails, need to free the pulling up mblk.
    513 	 */
    514 	if (pull_mp) {
    515 		freemsg(pull_mp);
    516 	}
    517 
    518 	/*
    519 	 * If new mblk has been allocted for the last header
    520 	 * fragment of a LSO packet, we should restore the
    521 	 * modified mp.
    522 	 */
    523 	if (hdr_new_mp) {
    524 		hdr_new_mp->b_cont = NULL;
    525 		freeb(hdr_new_mp);
    526 		hdr_nmp->b_rptr -= hdr_frag_len;
    527 		if (hdr_pre_mp)
    528 			hdr_pre_mp->b_cont = hdr_nmp;
    529 		else
    530 			mp = hdr_nmp;
    531 	}
    532 	/*
    533 	 * Discard the mblk and free the used resources
    534 	 */
    535 	tcb = (tx_control_block_t *)LIST_GET_HEAD(&pending_list);
    536 	while (tcb) {
    537 		tcb->mp = NULL;
    538 
    539 		ixgbe_free_tcb(tcb);
    540 
    541 		tcb = (tx_control_block_t *)
    542 		    LIST_GET_NEXT(&pending_list, &tcb->link);
    543 	}
    544 
    545 	/*
    546 	 * Return the tx control blocks in the pending list to the free list.
    547 	 */
    548 	ixgbe_put_free_list(tx_ring, &pending_list);
    549 
    550 	/* Transmit failed, do not drop the mblk, rechedule the transmit */
    551 	tx_ring->reschedule = B_TRUE;
    552 
    553 	return (mp);
    554 }
    555 
    556 /*
    557  * ixgbe_tx_copy
    558  *
    559  * Copy the mblk fragment to the pre-allocated tx buffer
    560  */
    561 static int
    562 ixgbe_tx_copy(ixgbe_tx_ring_t *tx_ring, tx_control_block_t *tcb, mblk_t *mp,
    563     uint32_t len, boolean_t copy_done)
    564 {
    565 	dma_buffer_t *tx_buf;
    566 	uint32_t desc_num;
    567 	_NOTE(ARGUNUSED(tx_ring));
    568 
    569 	tx_buf = &tcb->tx_buf;
    570 
    571 	/*
    572 	 * Copy the packet data of the mblk fragment into the
    573 	 * pre-allocated tx buffer, which is maintained by the
    574 	 * tx control block.
    575 	 *
    576 	 * Several mblk fragments can be copied into one tx buffer.
    577 	 * The destination address of the current copied fragment in
    578 	 * the tx buffer is next to the end of the previous copied
    579 	 * fragment.
    580 	 */
    581 	if (len > 0) {
    582 		bcopy(mp->b_rptr, tx_buf->address + tx_buf->len, len);
    583 
    584 		tx_buf->len += len;
    585 		tcb->frag_num++;
    586 	}
    587 
    588 	desc_num = 0;
    589 
    590 	/*
    591 	 * If it is the last fragment copied to the current tx buffer,
    592 	 * in other words, if there's no remaining fragment or the remaining
    593 	 * fragment requires a new tx control block to process, we need to
    594 	 * complete the current copy processing by syncing up the current
    595 	 * DMA buffer and saving the descriptor data.
    596 	 */
    597 	if (copy_done) {
    598 		/*
    599 		 * Sync the DMA buffer of the packet data
    600 		 */
    601 		DMA_SYNC(tx_buf, DDI_DMA_SYNC_FORDEV);
    602 
    603 		tcb->tx_type = USE_COPY;
    604 
    605 		/*
    606 		 * Save the address and length to the private data structure
    607 		 * of the tx control block, which will be used to fill the
    608 		 * tx descriptor ring after all the fragments are processed.
    609 		 */
    610 		ixgbe_save_desc(tcb, tx_buf->dma_address, tx_buf->len);
    611 		desc_num++;
    612 	}
    613 
    614 	return (desc_num);
    615 }
    616 
    617 /*
    618  * ixgbe_tx_bind
    619  *
    620  * Bind the mblk fragment with DMA
    621  */
    622 static int
    623 ixgbe_tx_bind(ixgbe_tx_ring_t *tx_ring, tx_control_block_t *tcb, mblk_t *mp,
    624     uint32_t len)
    625 {
    626 	int status, i;
    627 	ddi_dma_cookie_t dma_cookie;
    628 	uint_t ncookies;
    629 	int desc_num;
    630 
    631 	/*
    632 	 * Use DMA binding to process the mblk fragment
    633 	 */
    634 	status = ddi_dma_addr_bind_handle(tcb->tx_dma_handle, NULL,
    635 	    (caddr_t)mp->b_rptr, len,
    636 	    DDI_DMA_WRITE | DDI_DMA_STREAMING, DDI_DMA_DONTWAIT,
    637 	    0, &dma_cookie, &ncookies);
    638 
    639 	if (status != DDI_DMA_MAPPED) {
    640 		IXGBE_DEBUG_STAT(tx_ring->stat_fail_dma_bind);
    641 		return (-1);
    642 	}
    643 
    644 	tcb->frag_num++;
    645 	tcb->tx_type = USE_DMA;
    646 	/*
    647 	 * Each fragment can span several cookies. One cookie will have
    648 	 * one tx descriptor to transmit.
    649 	 */
    650 	desc_num = 0;
    651 	for (i = ncookies; i > 0; i--) {
    652 		/*
    653 		 * Save the address and length to the private data structure
    654 		 * of the tx control block, which will be used to fill the
    655 		 * tx descriptor ring after all the fragments are processed.
    656 		 */
    657 		ixgbe_save_desc(tcb,
    658 		    dma_cookie.dmac_laddress,
    659 		    dma_cookie.dmac_size);
    660 
    661 		desc_num++;
    662 
    663 		if (i > 1)
    664 			ddi_dma_nextcookie(tcb->tx_dma_handle, &dma_cookie);
    665 	}
    666 
    667 	return (desc_num);
    668 }
    669 
    670 /*
    671  * ixgbe_get_context
    672  *
    673  * Get the context information from the mblk
    674  */
    675 static int
    676 ixgbe_get_context(mblk_t *mp, ixgbe_tx_context_t *ctx)
    677 {
    678 	uint32_t start;
    679 	uint32_t hckflags;
    680 	uint32_t lsoflags;
    681 	uint32_t mss;
    682 	uint32_t len;
    683 	uint32_t size;
    684 	uint32_t offset;
    685 	unsigned char *pos;
    686 	ushort_t etype;
    687 	uint32_t mac_hdr_len;
    688 	uint32_t l4_proto;
    689 	uint32_t l4_hdr_len;
    690 
    691 	ASSERT(mp != NULL);
    692 
    693 	hcksum_retrieve(mp, NULL, NULL, &start, NULL, NULL, NULL, &hckflags);
    694 	bzero(ctx, sizeof (ixgbe_tx_context_t));
    695 
    696 	if (hckflags == 0) {
    697 		return (0);
    698 	}
    699 
    700 	ctx->hcksum_flags = hckflags;
    701 
    702 	lso_info_get(mp, &mss, &lsoflags);
    703 	ctx->mss = mss;
    704 	ctx->lso_flag = (lsoflags == HW_LSO);
    705 
    706 	/*
    707 	 * LSO relies on tx h/w checksum, so here will drop the package
    708 	 * if h/w checksum flag is not declared.
    709 	 */
    710 	if (ctx->lso_flag) {
    711 		if (!((ctx->hcksum_flags & HCK_PARTIALCKSUM) &&
    712 		    (ctx->hcksum_flags & HCK_IPV4_HDRCKSUM))) {
    713 			IXGBE_DEBUGLOG_0(NULL, "ixgbe_tx: h/w "
    714 			    "checksum flags are not specified when doing LSO");
    715 			return (-1);
    716 		}
    717 	}
    718 
    719 	etype = 0;
    720 	mac_hdr_len = 0;
    721 	l4_proto = 0;
    722 
    723 	/*
    724 	 * Firstly get the position of the ether_type/ether_tpid.
    725 	 * Here we don't assume the ether (VLAN) header is fully included
    726 	 * in one mblk fragment, so we go thourgh the fragments to parse
    727 	 * the ether type.
    728 	 */
    729 	size = len = MBLKL(mp);
    730 	offset = offsetof(struct ether_header, ether_type);
    731 	while (size <= offset) {
    732 		mp = mp->b_cont;
    733 		ASSERT(mp != NULL);
    734 		len = MBLKL(mp);
    735 		size += len;
    736 	}
    737 	pos = mp->b_rptr + offset + len - size;
    738 
    739 	etype = ntohs(*(ushort_t *)(uintptr_t)pos);
    740 	if (etype == ETHERTYPE_VLAN) {
    741 		/*
    742 		 * Get the position of the ether_type in VLAN header
    743 		 */
    744 		offset = offsetof(struct ether_vlan_header, ether_type);
    745 		while (size <= offset) {
    746 			mp = mp->b_cont;
    747 			ASSERT(mp != NULL);
    748 			len = MBLKL(mp);
    749 			size += len;
    750 		}
    751 		pos = mp->b_rptr + offset + len - size;
    752 
    753 		etype = ntohs(*(ushort_t *)(uintptr_t)pos);
    754 		mac_hdr_len = sizeof (struct ether_vlan_header);
    755 	} else {
    756 		mac_hdr_len = sizeof (struct ether_header);
    757 	}
    758 
    759 	/*
    760 	 * Here we don't assume the IP(V6) header is fully included in
    761 	 * one mblk fragment.
    762 	 */
    763 	switch (etype) {
    764 	case ETHERTYPE_IP:
    765 		if (ctx->lso_flag) {
    766 			offset = offsetof(ipha_t, ipha_length) + mac_hdr_len;
    767 			while (size <= offset) {
    768 				mp = mp->b_cont;
    769 				ASSERT(mp != NULL);
    770 				len = MBLKL(mp);
    771 				size += len;
    772 			}
    773 			pos = mp->b_rptr + offset + len - size;
    774 			*((uint16_t *)(uintptr_t)(pos)) = 0;
    775 
    776 			offset = offsetof(ipha_t, ipha_hdr_checksum) +
    777 			    mac_hdr_len;
    778 			while (size <= offset) {
    779 				mp = mp->b_cont;
    780 				ASSERT(mp != NULL);
    781 				len = MBLKL(mp);
    782 				size += len;
    783 			}
    784 			pos = mp->b_rptr + offset + len - size;
    785 			*((uint16_t *)(uintptr_t)(pos)) = 0;
    786 
    787 			/*
    788 			 * To perform ixgbe LSO, here also need to fill
    789 			 * the tcp checksum field of the packet with the
    790 			 * following pseudo-header checksum:
    791 			 * (ip_source_addr, ip_destination_addr, l4_proto)
    792 			 * Currently the tcp/ip stack has done it.
    793 			 */
    794 		}
    795 
    796 		offset = offsetof(ipha_t, ipha_protocol) + mac_hdr_len;
    797 		while (size <= offset) {
    798 			mp = mp->b_cont;
    799 			ASSERT(mp != NULL);
    800 			len = MBLKL(mp);
    801 			size += len;
    802 		}
    803 		pos = mp->b_rptr + offset + len - size;
    804 
    805 		l4_proto = *(uint8_t *)pos;
    806 		break;
    807 	case ETHERTYPE_IPV6:
    808 		offset = offsetof(ip6_t, ip6_nxt) + mac_hdr_len;
    809 		while (size <= offset) {
    810 			mp = mp->b_cont;
    811 			ASSERT(mp != NULL);
    812 			len = MBLKL(mp);
    813 			size += len;
    814 		}
    815 		pos = mp->b_rptr + offset + len - size;
    816 
    817 		l4_proto = *(uint8_t *)pos;
    818 		break;
    819 	default:
    820 		/* Unrecoverable error */
    821 		IXGBE_DEBUGLOG_0(NULL, "Ether type error with tx hcksum");
    822 		return (-2);
    823 	}
    824 
    825 	if (ctx->lso_flag) {
    826 		offset = mac_hdr_len + start;
    827 		while (size <= offset) {
    828 			mp = mp->b_cont;
    829 			ASSERT(mp != NULL);
    830 			len = MBLKL(mp);
    831 			size += len;
    832 		}
    833 		pos = mp->b_rptr + offset + len - size;
    834 
    835 		l4_hdr_len = TCP_HDR_LENGTH((tcph_t *)pos);
    836 	} else {
    837 		/*
    838 		 * l4 header length is only required for LSO
    839 		 */
    840 		l4_hdr_len = 0;
    841 	}
    842 
    843 	ctx->mac_hdr_len = mac_hdr_len;
    844 	ctx->ip_hdr_len = start;
    845 	ctx->l4_proto = l4_proto;
    846 	ctx->l4_hdr_len = l4_hdr_len;
    847 
    848 	return (0);
    849 }
    850 
    851 /*
    852  * ixgbe_check_context
    853  *
    854  * Check if a new context descriptor is needed
    855  */
    856 static boolean_t
    857 ixgbe_check_context(ixgbe_tx_ring_t *tx_ring, ixgbe_tx_context_t *ctx)
    858 {
    859 	ixgbe_tx_context_t *last;
    860 
    861 	if (ctx == NULL)
    862 		return (B_FALSE);
    863 
    864 	/*
    865 	 * Compare the context data retrieved from the mblk and the
    866 	 * stored data of the last context descriptor. The data need
    867 	 * to be checked are:
    868 	 *	hcksum_flags
    869 	 *	l4_proto
    870 	 *	mac_hdr_len
    871 	 *	ip_hdr_len
    872 	 *	lso_flag
    873 	 *	mss (only checked for LSO)
    874 	 *	l4_hr_len (only checked for LSO)
    875 	 * Either one of the above data is changed, a new context descriptor
    876 	 * will be needed.
    877 	 */
    878 	last = &tx_ring->tx_context;
    879 
    880 	if ((ctx->hcksum_flags != last->hcksum_flags) ||
    881 	    (ctx->l4_proto != last->l4_proto) ||
    882 	    (ctx->mac_hdr_len != last->mac_hdr_len) ||
    883 	    (ctx->ip_hdr_len != last->ip_hdr_len) ||
    884 	    (ctx->lso_flag != last->lso_flag) ||
    885 	    (ctx->lso_flag && ((ctx->mss != last->mss) ||
    886 	    (ctx->l4_hdr_len != last->l4_hdr_len)))) {
    887 		return (B_TRUE);
    888 	}
    889 
    890 	return (B_FALSE);
    891 }
    892 
    893 /*
    894  * ixgbe_fill_context
    895  *
    896  * Fill the context descriptor with hardware checksum informations
    897  */
    898 static void
    899 ixgbe_fill_context(struct ixgbe_adv_tx_context_desc *ctx_tbd,
    900     ixgbe_tx_context_t *ctx)
    901 {
    902 	/*
    903 	 * Fill the context descriptor with the checksum
    904 	 * context information we've got.
    905 	 */
    906 	ctx_tbd->vlan_macip_lens = ctx->ip_hdr_len;
    907 	ctx_tbd->vlan_macip_lens |= ctx->mac_hdr_len <<
    908 	    IXGBE_ADVTXD_MACLEN_SHIFT;
    909 
    910 	ctx_tbd->type_tucmd_mlhl =
    911 	    IXGBE_ADVTXD_DCMD_DEXT | IXGBE_ADVTXD_DTYP_CTXT;
    912 
    913 	if (ctx->hcksum_flags & HCK_IPV4_HDRCKSUM)
    914 		ctx_tbd->type_tucmd_mlhl |= IXGBE_ADVTXD_TUCMD_IPV4;
    915 
    916 	if (ctx->hcksum_flags & HCK_PARTIALCKSUM) {
    917 		switch (ctx->l4_proto) {
    918 		case IPPROTO_TCP:
    919 			ctx_tbd->type_tucmd_mlhl |= IXGBE_ADVTXD_TUCMD_L4T_TCP;
    920 			break;
    921 		case IPPROTO_UDP:
    922 			/*
    923 			 * We don't have to explicitly set:
    924 			 *	ctx_tbd->type_tucmd_mlhl |=
    925 			 *	    IXGBE_ADVTXD_TUCMD_L4T_UDP;
    926 			 * Because IXGBE_ADVTXD_TUCMD_L4T_UDP == 0b
    927 			 */
    928 			break;
    929 		default:
    930 			/* Unrecoverable error */
    931 			IXGBE_DEBUGLOG_0(NULL, "L4 type error with tx hcksum");
    932 			break;
    933 		}
    934 	}
    935 
    936 	ctx_tbd->seqnum_seed = 0;
    937 
    938 	if (ctx->lso_flag) {
    939 		ctx_tbd->mss_l4len_idx =
    940 		    (ctx->l4_hdr_len << IXGBE_ADVTXD_L4LEN_SHIFT) |
    941 		    (ctx->mss << IXGBE_ADVTXD_MSS_SHIFT);
    942 	} else {
    943 		ctx_tbd->mss_l4len_idx = 0;
    944 	}
    945 }
    946 
    947 /*
    948  * ixgbe_tx_fill_ring
    949  *
    950  * Fill the tx descriptor ring with the data
    951  */
    952 static int
    953 ixgbe_tx_fill_ring(ixgbe_tx_ring_t *tx_ring, link_list_t *pending_list,
    954     ixgbe_tx_context_t *ctx, size_t mbsize)
    955 {
    956 	struct ixgbe_hw *hw = &tx_ring->ixgbe->hw;
    957 	boolean_t load_context;
    958 	uint32_t index, tcb_index, desc_num;
    959 	union ixgbe_adv_tx_desc *tbd, *first_tbd;
    960 	tx_control_block_t *tcb, *first_tcb;
    961 	uint32_t hcksum_flags;
    962 	int i;
    963 
    964 	ASSERT(mutex_owned(&tx_ring->tx_lock));
    965 
    966 	tbd = NULL;
    967 	first_tbd = NULL;
    968 	first_tcb = NULL;
    969 	desc_num = 0;
    970 	hcksum_flags = 0;
    971 	load_context = B_FALSE;
    972 
    973 	/*
    974 	 * Get the index of the first tx descriptor that will be filled,
    975 	 * and the index of the first work list item that will be attached
    976 	 * with the first used tx control block in the pending list.
    977 	 * Note: the two indexes are the same.
    978 	 */
    979 	index = tx_ring->tbd_tail;
    980 	tcb_index = tx_ring->tbd_tail;
    981 
    982 	if (ctx != NULL) {
    983 		hcksum_flags = ctx->hcksum_flags;
    984 
    985 		/*
    986 		 * Check if a new context descriptor is needed for this packet
    987 		 */
    988 		load_context = ixgbe_check_context(tx_ring, ctx);
    989 
    990 		if (load_context) {
    991 			tbd = &tx_ring->tbd_ring[index];
    992 
    993 			/*
    994 			 * Fill the context descriptor with the
    995 			 * hardware checksum offload informations.
    996 			 */
    997 			ixgbe_fill_context(
    998 			    (struct ixgbe_adv_tx_context_desc *)tbd, ctx);
    999 
   1000 			index = NEXT_INDEX(index, 1, tx_ring->ring_size);
   1001 			desc_num++;
   1002 
   1003 			/*
   1004 			 * Store the checksum context data if
   1005 			 * a new context descriptor is added
   1006 			 */
   1007 			tx_ring->tx_context = *ctx;
   1008 		}
   1009 	}
   1010 
   1011 	first_tbd = &tx_ring->tbd_ring[index];
   1012 
   1013 	/*
   1014 	 * Fill tx data descriptors with the data saved in the pending list.
   1015 	 * The tx control blocks in the pending list are added to the work list
   1016 	 * at the same time.
   1017 	 *
   1018 	 * The work list is strictly 1:1 corresponding to the descriptor ring.
   1019 	 * One item of the work list corresponds to one tx descriptor. Because
   1020 	 * one tx control block can span multiple tx descriptors, the tx
   1021 	 * control block will be added to the first work list item that
   1022 	 * corresponds to the first tx descriptor generated from that tx
   1023 	 * control block.
   1024 	 */
   1025 	tcb = (tx_control_block_t *)LIST_POP_HEAD(pending_list);
   1026 	first_tcb = tcb;
   1027 	while (tcb != NULL) {
   1028 
   1029 		for (i = 0; i < tcb->desc_num; i++) {
   1030 			tbd = &tx_ring->tbd_ring[index];
   1031 
   1032 			tbd->read.buffer_addr = tcb->desc[i].address;
   1033 			tbd->read.cmd_type_len = tcb->desc[i].length;
   1034 
   1035 			tbd->read.cmd_type_len |= IXGBE_ADVTXD_DCMD_DEXT
   1036 			    | IXGBE_ADVTXD_DTYP_DATA;
   1037 
   1038 			tbd->read.olinfo_status = 0;
   1039 
   1040 			index = NEXT_INDEX(index, 1, tx_ring->ring_size);
   1041 			desc_num++;
   1042 		}
   1043 
   1044 		/*
   1045 		 * Add the tx control block to the work list
   1046 		 */
   1047 		ASSERT(tx_ring->work_list[tcb_index] == NULL);
   1048 		tx_ring->work_list[tcb_index] = tcb;
   1049 
   1050 		tcb_index = index;
   1051 		tcb = (tx_control_block_t *)LIST_POP_HEAD(pending_list);
   1052 	}
   1053 
   1054 	if (load_context) {
   1055 		/*
   1056 		 * Count the context descriptor for
   1057 		 * the first tx control block.
   1058 		 */
   1059 		first_tcb->desc_num++;
   1060 	}
   1061 	first_tcb->last_index = PREV_INDEX(index, 1, tx_ring->ring_size);
   1062 
   1063 	/*
   1064 	 * The Insert Ethernet CRC (IFCS) bit and the checksum fields are only
   1065 	 * valid in the first descriptor of the packet.
   1066 	 * Setting paylen in every first_tbd for all parts.
   1067 	 * 82599 requires the packet length in paylen field with or without
   1068 	 * LSO and 82598 will ignore it in non-LSO mode.
   1069 	 */
   1070 	ASSERT(first_tbd != NULL);
   1071 	first_tbd->read.cmd_type_len |= IXGBE_ADVTXD_DCMD_IFCS;
   1072 
   1073 	switch (hw->mac.type) {
   1074 	case ixgbe_mac_82599EB:
   1075 		if (ctx != NULL && ctx->lso_flag) {
   1076 			first_tbd->read.cmd_type_len |= IXGBE_ADVTXD_DCMD_TSE;
   1077 			first_tbd->read.olinfo_status |=
   1078 			    (mbsize - ctx->mac_hdr_len - ctx->ip_hdr_len
   1079 			    - ctx->l4_hdr_len) << IXGBE_ADVTXD_PAYLEN_SHIFT;
   1080 		} else {
   1081 			first_tbd->read.olinfo_status |=
   1082 			    (mbsize << IXGBE_ADVTXD_PAYLEN_SHIFT);
   1083 		}
   1084 		break;
   1085 	case ixgbe_mac_82598EB:
   1086 		if (ctx != NULL && ctx->lso_flag) {
   1087 			first_tbd->read.cmd_type_len |= IXGBE_ADVTXD_DCMD_TSE;
   1088 			first_tbd->read.olinfo_status |=
   1089 			    (mbsize - ctx->mac_hdr_len - ctx->ip_hdr_len
   1090 			    - ctx->l4_hdr_len) << IXGBE_ADVTXD_PAYLEN_SHIFT;
   1091 		}
   1092 		break;
   1093 	default:
   1094 		break;
   1095 	}
   1096 
   1097 	/* Set hardware checksum bits */
   1098 	if (hcksum_flags != 0) {
   1099 		if (hcksum_flags & HCK_IPV4_HDRCKSUM)
   1100 			first_tbd->read.olinfo_status |=
   1101 			    IXGBE_ADVTXD_POPTS_IXSM;
   1102 		if (hcksum_flags & HCK_PARTIALCKSUM)
   1103 			first_tbd->read.olinfo_status |=
   1104 			    IXGBE_ADVTXD_POPTS_TXSM;
   1105 	}
   1106 
   1107 	/*
   1108 	 * The last descriptor of packet needs End Of Packet (EOP),
   1109 	 * and Report Status (RS) bits set
   1110 	 */
   1111 	ASSERT(tbd != NULL);
   1112 	tbd->read.cmd_type_len |=
   1113 	    IXGBE_ADVTXD_DCMD_EOP | IXGBE_ADVTXD_DCMD_RS;
   1114 
   1115 	/*
   1116 	 * Sync the DMA buffer of the tx descriptor ring
   1117 	 */
   1118 	DMA_SYNC(&tx_ring->tbd_area, DDI_DMA_SYNC_FORDEV);
   1119 
   1120 	if (ixgbe_check_dma_handle(tx_ring->tbd_area.dma_handle) != DDI_FM_OK) {
   1121 		ddi_fm_service_impact(tx_ring->ixgbe->dip,
   1122 		    DDI_SERVICE_DEGRADED);
   1123 	}
   1124 
   1125 	/*
   1126 	 * Update the number of the free tx descriptors.
   1127 	 * The mutual exclusion between the transmission and the recycling
   1128 	 * (for the tx descriptor ring and the work list) is implemented
   1129 	 * with the atomic operation on the number of the free tx descriptors.
   1130 	 *
   1131 	 * Note: we should always decrement the counter tbd_free before
   1132 	 * advancing the hardware TDT pointer to avoid the race condition -
   1133 	 * before the counter tbd_free is decremented, the transmit of the
   1134 	 * tx descriptors has done and the counter tbd_free is increased by
   1135 	 * the tx recycling.
   1136 	 */
   1137 	i = ixgbe_atomic_reserve(&tx_ring->tbd_free, desc_num);
   1138 	ASSERT(i >= 0);
   1139 
   1140 	tx_ring->tbd_tail = index;
   1141 
   1142 	/*
   1143 	 * Advance the hardware TDT pointer of the tx descriptor ring
   1144 	 */
   1145 	IXGBE_WRITE_REG(hw, IXGBE_TDT(tx_ring->index), index);
   1146 
   1147 	if (ixgbe_check_acc_handle(tx_ring->ixgbe->osdep.reg_handle) !=
   1148 	    DDI_FM_OK) {
   1149 		ddi_fm_service_impact(tx_ring->ixgbe->dip,
   1150 		    DDI_SERVICE_DEGRADED);
   1151 	}
   1152 
   1153 	return (desc_num);
   1154 }
   1155 
   1156 /*
   1157  * ixgbe_save_desc
   1158  *
   1159  * Save the address/length pair to the private array
   1160  * of the tx control block. The address/length pairs
   1161  * will be filled into the tx descriptor ring later.
   1162  */
   1163 static void
   1164 ixgbe_save_desc(tx_control_block_t *tcb, uint64_t address, size_t length)
   1165 {
   1166 	sw_desc_t *desc;
   1167 
   1168 	desc = &tcb->desc[tcb->desc_num];
   1169 	desc->address = address;
   1170 	desc->length = length;
   1171 
   1172 	tcb->desc_num++;
   1173 }
   1174 
   1175 /*
   1176  * ixgbe_tx_recycle_legacy
   1177  *
   1178  * Recycle the tx descriptors and tx control blocks.
   1179  *
   1180  * The work list is traversed to check if the corresponding
   1181  * tx descriptors have been transmitted. If so, the resources
   1182  * bound to the tx control blocks will be freed, and those
   1183  * tx control blocks will be returned to the free list.
   1184  */
   1185 uint32_t
   1186 ixgbe_tx_recycle_legacy(ixgbe_tx_ring_t *tx_ring)
   1187 {
   1188 	uint32_t index, last_index, prev_index;
   1189 	int desc_num;
   1190 	boolean_t desc_done;
   1191 	tx_control_block_t *tcb;
   1192 	link_list_t pending_list;
   1193 	ixgbe_t *ixgbe = tx_ring->ixgbe;
   1194 
   1195 	mutex_enter(&tx_ring->recycle_lock);
   1196 
   1197 	ASSERT(tx_ring->tbd_free <= tx_ring->ring_size);
   1198 
   1199 	if (tx_ring->tbd_free == tx_ring->ring_size) {
   1200 		tx_ring->recycle_fail = 0;
   1201 		tx_ring->stall_watchdog = 0;
   1202 		if (tx_ring->reschedule) {
   1203 			tx_ring->reschedule = B_FALSE;
   1204 			mac_tx_ring_update(ixgbe->mac_hdl,
   1205 			    tx_ring->ring_handle);
   1206 		}
   1207 		mutex_exit(&tx_ring->recycle_lock);
   1208 		return (0);
   1209 	}
   1210 
   1211 	/*
   1212 	 * Sync the DMA buffer of the tx descriptor ring
   1213 	 */
   1214 	DMA_SYNC(&tx_ring->tbd_area, DDI_DMA_SYNC_FORKERNEL);
   1215 
   1216 	if (ixgbe_check_dma_handle(tx_ring->tbd_area.dma_handle) != DDI_FM_OK) {
   1217 		ddi_fm_service_impact(ixgbe->dip, DDI_SERVICE_DEGRADED);
   1218 	}
   1219 
   1220 	LINK_LIST_INIT(&pending_list);
   1221 	desc_num = 0;
   1222 	index = tx_ring->tbd_head;	/* Index of next tbd/tcb to recycle */
   1223 
   1224 	tcb = tx_ring->work_list[index];
   1225 	ASSERT(tcb != NULL);
   1226 
   1227 	while (tcb != NULL) {
   1228 		/*
   1229 		 * Get the last tx descriptor of this packet.
   1230 		 * If the last tx descriptor is done, then
   1231 		 * we can recycle all descriptors of a packet
   1232 		 * which usually includes several tx control blocks.
   1233 		 * For 82599, LSO descriptors can not be recycled
   1234 		 * unless the whole packet's transmission is done.
   1235 		 * That's why packet level recycling is used here.
   1236 		 * For 82598, there's not such limit.
   1237 		 */
   1238 		last_index = tcb->last_index;
   1239 		/*
   1240 		 * MAX_TX_RING_SIZE is used to judge whether
   1241 		 * the index is a valid value or not.
   1242 		 */
   1243 		if (last_index == MAX_TX_RING_SIZE)
   1244 			break;
   1245 
   1246 		/*
   1247 		 * Check if the Descriptor Done bit is set
   1248 		 */
   1249 		desc_done = tx_ring->tbd_ring[last_index].wb.status &
   1250 		    IXGBE_TXD_STAT_DD;
   1251 		if (desc_done) {
   1252 			/*
   1253 			 * recycle all descriptors of the packet
   1254 			 */
   1255 			while (tcb != NULL) {
   1256 				/*
   1257 				 * Strip off the tx control block from
   1258 				 * the work list, and add it to the
   1259 				 * pending list.
   1260 				 */
   1261 				tx_ring->work_list[index] = NULL;
   1262 				LIST_PUSH_TAIL(&pending_list, &tcb->link);
   1263 
   1264 				/*
   1265 				 * Count the total number of the tx
   1266 				 * descriptors recycled
   1267 				 */
   1268 				desc_num += tcb->desc_num;
   1269 
   1270 				index = NEXT_INDEX(index, tcb->desc_num,
   1271 				    tx_ring->ring_size);
   1272 
   1273 				tcb = tx_ring->work_list[index];
   1274 
   1275 				prev_index = PREV_INDEX(index, 1,
   1276 				    tx_ring->ring_size);
   1277 				if (prev_index == last_index)
   1278 					break;
   1279 			}
   1280 		} else {
   1281 			break;
   1282 		}
   1283 	}
   1284 
   1285 	/*
   1286 	 * If no tx descriptors are recycled, no need to do more processing
   1287 	 */
   1288 	if (desc_num == 0) {
   1289 		tx_ring->recycle_fail++;
   1290 		mutex_exit(&tx_ring->recycle_lock);
   1291 		return (0);
   1292 	}
   1293 
   1294 	tx_ring->recycle_fail = 0;
   1295 	tx_ring->stall_watchdog = 0;
   1296 
   1297 	/*
   1298 	 * Update the head index of the tx descriptor ring
   1299 	 */
   1300 	tx_ring->tbd_head = index;
   1301 
   1302 	/*
   1303 	 * Update the number of the free tx descriptors with atomic operations
   1304 	 */
   1305 	atomic_add_32(&tx_ring->tbd_free, desc_num);
   1306 
   1307 	if ((tx_ring->tbd_free >= ixgbe->tx_resched_thresh) &&
   1308 	    (tx_ring->reschedule)) {
   1309 		tx_ring->reschedule = B_FALSE;
   1310 		mac_tx_ring_update(ixgbe->mac_hdl,
   1311 		    tx_ring->ring_handle);
   1312 	}
   1313 	mutex_exit(&tx_ring->recycle_lock);
   1314 
   1315 	/*
   1316 	 * Free the resources used by the tx control blocks
   1317 	 * in the pending list
   1318 	 */
   1319 	tcb = (tx_control_block_t *)LIST_GET_HEAD(&pending_list);
   1320 	while (tcb != NULL) {
   1321 		/*
   1322 		 * Release the resources occupied by the tx control block
   1323 		 */
   1324 		ixgbe_free_tcb(tcb);
   1325 
   1326 		tcb = (tx_control_block_t *)
   1327 		    LIST_GET_NEXT(&pending_list, &tcb->link);
   1328 	}
   1329 
   1330 	/*
   1331 	 * Add the tx control blocks in the pending list to the free list.
   1332 	 */
   1333 	ixgbe_put_free_list(tx_ring, &pending_list);
   1334 
   1335 	return (desc_num);
   1336 }
   1337 
   1338 /*
   1339  * ixgbe_tx_recycle_head_wb
   1340  *
   1341  * Check the head write-back, and recycle all the transmitted
   1342  * tx descriptors and tx control blocks.
   1343  */
   1344 uint32_t
   1345 ixgbe_tx_recycle_head_wb(ixgbe_tx_ring_t *tx_ring)
   1346 {
   1347 	uint32_t index;
   1348 	uint32_t head_wb;
   1349 	int desc_num;
   1350 	tx_control_block_t *tcb;
   1351 	link_list_t pending_list;
   1352 	ixgbe_t *ixgbe = tx_ring->ixgbe;
   1353 
   1354 	mutex_enter(&tx_ring->recycle_lock);
   1355 
   1356 	ASSERT(tx_ring->tbd_free <= tx_ring->ring_size);
   1357 
   1358 	if (tx_ring->tbd_free == tx_ring->ring_size) {
   1359 		tx_ring->recycle_fail = 0;
   1360 		tx_ring->stall_watchdog = 0;
   1361 		if (tx_ring->reschedule) {
   1362 			tx_ring->reschedule = B_FALSE;
   1363 			mac_tx_ring_update(ixgbe->mac_hdl,
   1364 			    tx_ring->ring_handle);
   1365 		}
   1366 		mutex_exit(&tx_ring->recycle_lock);
   1367 		return (0);
   1368 	}
   1369 
   1370 	/*
   1371 	 * Sync the DMA buffer of the tx descriptor ring
   1372 	 *
   1373 	 * Note: For head write-back mode, the tx descriptors will not
   1374 	 * be written back, but the head write-back value is stored at
   1375 	 * the last extra tbd at the end of the DMA area, we still need
   1376 	 * to sync the head write-back value for kernel.
   1377 	 *
   1378 	 * DMA_SYNC(&tx_ring->tbd_area, DDI_DMA_SYNC_FORKERNEL);
   1379 	 */
   1380 	(void) ddi_dma_sync(tx_ring->tbd_area.dma_handle,
   1381 	    sizeof (union ixgbe_adv_tx_desc) * tx_ring->ring_size,
   1382 	    sizeof (uint32_t),
   1383 	    DDI_DMA_SYNC_FORKERNEL);
   1384 
   1385 	if (ixgbe_check_dma_handle(tx_ring->tbd_area.dma_handle) != DDI_FM_OK) {
   1386 		ddi_fm_service_impact(ixgbe->dip,
   1387 		    DDI_SERVICE_DEGRADED);
   1388 	}
   1389 
   1390 	LINK_LIST_INIT(&pending_list);
   1391 	desc_num = 0;
   1392 	index = tx_ring->tbd_head;	/* Next index to clean */
   1393 
   1394 	/*
   1395 	 * Get the value of head write-back
   1396 	 */
   1397 	head_wb = *tx_ring->tbd_head_wb;
   1398 	while (index != head_wb) {
   1399 		tcb = tx_ring->work_list[index];
   1400 		ASSERT(tcb != NULL);
   1401 
   1402 		if (OFFSET(index, head_wb, tx_ring->ring_size) <
   1403 		    tcb->desc_num) {
   1404 			/*
   1405 			 * The current tx control block is not
   1406 			 * completely transmitted, stop recycling
   1407 			 */
   1408 			break;
   1409 		}
   1410 
   1411 		/*
   1412 		 * Strip off the tx control block from the work list,
   1413 		 * and add it to the pending list.
   1414 		 */
   1415 		tx_ring->work_list[index] = NULL;
   1416 		LIST_PUSH_TAIL(&pending_list, &tcb->link);
   1417 
   1418 		/*
   1419 		 * Advance the index of the tx descriptor ring
   1420 		 */
   1421 		index = NEXT_INDEX(index, tcb->desc_num, tx_ring->ring_size);
   1422 
   1423 		/*
   1424 		 * Count the total number of the tx descriptors recycled
   1425 		 */
   1426 		desc_num += tcb->desc_num;
   1427 	}
   1428 
   1429 	/*
   1430 	 * If no tx descriptors are recycled, no need to do more processing
   1431 	 */
   1432 	if (desc_num == 0) {
   1433 		tx_ring->recycle_fail++;
   1434 		mutex_exit(&tx_ring->recycle_lock);
   1435 		return (0);
   1436 	}
   1437 
   1438 	tx_ring->recycle_fail = 0;
   1439 	tx_ring->stall_watchdog = 0;
   1440 
   1441 	/*
   1442 	 * Update the head index of the tx descriptor ring
   1443 	 */
   1444 	tx_ring->tbd_head = index;
   1445 
   1446 	/*
   1447 	 * Update the number of the free tx descriptors with atomic operations
   1448 	 */
   1449 	atomic_add_32(&tx_ring->tbd_free, desc_num);
   1450 
   1451 	if ((tx_ring->tbd_free >= ixgbe->tx_resched_thresh) &&
   1452 	    (tx_ring->reschedule)) {
   1453 		tx_ring->reschedule = B_FALSE;
   1454 		mac_tx_ring_update(ixgbe->mac_hdl,
   1455 		    tx_ring->ring_handle);
   1456 	}
   1457 	mutex_exit(&tx_ring->recycle_lock);
   1458 
   1459 	/*
   1460 	 * Free the resources used by the tx control blocks
   1461 	 * in the pending list
   1462 	 */
   1463 	tcb = (tx_control_block_t *)LIST_GET_HEAD(&pending_list);
   1464 	while (tcb) {
   1465 		/*
   1466 		 * Release the resources occupied by the tx control block
   1467 		 */
   1468 		ixgbe_free_tcb(tcb);
   1469 
   1470 		tcb = (tx_control_block_t *)
   1471 		    LIST_GET_NEXT(&pending_list, &tcb->link);
   1472 	}
   1473 
   1474 	/*
   1475 	 * Add the tx control blocks in the pending list to the free list.
   1476 	 */
   1477 	ixgbe_put_free_list(tx_ring, &pending_list);
   1478 
   1479 	return (desc_num);
   1480 }
   1481 
   1482 /*
   1483  * ixgbe_free_tcb - free up the tx control block
   1484  *
   1485  * Free the resources of the tx control block, including
   1486  * unbind the previously bound DMA handle, and reset other
   1487  * control fields.
   1488  */
   1489 void
   1490 ixgbe_free_tcb(tx_control_block_t *tcb)
   1491 {
   1492 	switch (tcb->tx_type) {
   1493 	case USE_COPY:
   1494 		/*
   1495 		 * Reset the buffer length that is used for copy
   1496 		 */
   1497 		tcb->tx_buf.len = 0;
   1498 		break;
   1499 	case USE_DMA:
   1500 		/*
   1501 		 * Release the DMA resource that is used for
   1502 		 * DMA binding.
   1503 		 */
   1504 		(void) ddi_dma_unbind_handle(tcb->tx_dma_handle);
   1505 		break;
   1506 	default:
   1507 		break;
   1508 	}
   1509 
   1510 	/*
   1511 	 * Free the mblk
   1512 	 */
   1513 	if (tcb->mp != NULL) {
   1514 		freemsg(tcb->mp);
   1515 		tcb->mp = NULL;
   1516 	}
   1517 
   1518 	tcb->tx_type = USE_NONE;
   1519 	tcb->last_index = MAX_TX_RING_SIZE;
   1520 	tcb->frag_num = 0;
   1521 	tcb->desc_num = 0;
   1522 }
   1523 
   1524 /*
   1525  * ixgbe_get_free_list - Get a free tx control block from the free list
   1526  *
   1527  * The atomic operation on the number of the available tx control block
   1528  * in the free list is used to keep this routine mutual exclusive with
   1529  * the routine ixgbe_put_check_list.
   1530  */
   1531 static tx_control_block_t *
   1532 ixgbe_get_free_list(ixgbe_tx_ring_t *tx_ring)
   1533 {
   1534 	tx_control_block_t *tcb;
   1535 
   1536 	/*
   1537 	 * Check and update the number of the free tx control block
   1538 	 * in the free list.
   1539 	 */
   1540 	if (ixgbe_atomic_reserve(&tx_ring->tcb_free, 1) < 0)
   1541 		return (NULL);
   1542 
   1543 	mutex_enter(&tx_ring->tcb_head_lock);
   1544 
   1545 	tcb = tx_ring->free_list[tx_ring->tcb_head];
   1546 	ASSERT(tcb != NULL);
   1547 	tx_ring->free_list[tx_ring->tcb_head] = NULL;
   1548 	tx_ring->tcb_head = NEXT_INDEX(tx_ring->tcb_head, 1,
   1549 	    tx_ring->free_list_size);
   1550 
   1551 	mutex_exit(&tx_ring->tcb_head_lock);
   1552 
   1553 	return (tcb);
   1554 }
   1555 
   1556 /*
   1557  * ixgbe_put_free_list
   1558  *
   1559  * Put a list of used tx control blocks back to the free list
   1560  *
   1561  * A mutex is used here to ensure the serialization. The mutual exclusion
   1562  * between ixgbe_get_free_list and ixgbe_put_free_list is implemented with
   1563  * the atomic operation on the counter tcb_free.
   1564  */
   1565 void
   1566 ixgbe_put_free_list(ixgbe_tx_ring_t *tx_ring, link_list_t *pending_list)
   1567 {
   1568 	uint32_t index;
   1569 	int tcb_num;
   1570 	tx_control_block_t *tcb;
   1571 
   1572 	mutex_enter(&tx_ring->tcb_tail_lock);
   1573 
   1574 	index = tx_ring->tcb_tail;
   1575 
   1576 	tcb_num = 0;
   1577 	tcb = (tx_control_block_t *)LIST_POP_HEAD(pending_list);
   1578 	while (tcb != NULL) {
   1579 		ASSERT(tx_ring->free_list[index] == NULL);
   1580 		tx_ring->free_list[index] = tcb;
   1581 
   1582 		tcb_num++;
   1583 
   1584 		index = NEXT_INDEX(index, 1, tx_ring->free_list_size);
   1585 
   1586 		tcb = (tx_control_block_t *)LIST_POP_HEAD(pending_list);
   1587 	}
   1588 
   1589 	tx_ring->tcb_tail = index;
   1590 
   1591 	/*
   1592 	 * Update the number of the free tx control block
   1593 	 * in the free list. This operation must be placed
   1594 	 * under the protection of the lock.
   1595 	 */
   1596 	atomic_add_32(&tx_ring->tcb_free, tcb_num);
   1597 
   1598 	mutex_exit(&tx_ring->tcb_tail_lock);
   1599 }
   1600