Home | History | Annotate | Download | only in cpu
      1 /*
      2  * CDDL HEADER START
      3  *
      4  * The contents of this file are subject to the terms of the
      5  * Common Development and Distribution License (the "License").
      6  * You may not use this file except in compliance with the License.
      7  *
      8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
      9  * or http://www.opensolaris.org/os/licensing.
     10  * See the License for the specific language governing permissions
     11  * and limitations under the License.
     12  *
     13  * When distributing Covered Code, include this CDDL HEADER in each
     14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
     15  * If applicable, add the following below this CDDL HEADER, with the
     16  * fields enclosed by brackets "[]" replaced with your own identifying
     17  * information: Portions Copyright [yyyy] [name of copyright owner]
     18  *
     19  * CDDL HEADER END
     20  */
     21 /*
     22  * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
     23  * Use is subject to license terms.
     24  */
     25 
     26 # ident	"%Z%%M%	%I%	%E% SMI"
     27 
     28 #include <sys/param.h>
     29 #include <sys/errno.h>
     30 #include <sys/asm_linkage.h>
     31 #include <sys/vtrace.h>
     32 #include <sys/machthread.h>
     33 #include <sys/clock.h>
     34 #include <sys/asi.h>
     35 #include <sys/fsr.h>
     36 #include <sys/privregs.h>
     37 #include <sys/machasi.h>
     38 #include <sys/niagaraasi.h>
     39 
     40 #if !defined(lint)
     41 #include "assym.h"
     42 #endif	/* lint */
     43 
     44 
     45 /*
     46  * Pseudo-code to aid in understanding the control flow of the
     47  * bcopy/kcopy routine.
     48  *
     49  *	! WARNING : <Register usage convention>
     50  *	! In kcopy(), %o5 holds the previous error handler together with
     51  *	! the LOFAULT_SET flag (in its low bits). In bcopy(), %o5 is null.
     52  *	! %o5 is not available for any other use.
     53  *
     54  * kcopy():
     55  *	%o5 = curthread->t_lofault;		! save existing handler in %o5
     56  *	%o5 |= LOFAULT_SET;			! ORed with LOFAULT_SET flag
     57  *	curthread->t_lofault = .copyerr;
     58  *	Call bcopy();
     59  *
     60  * bcopy():
     61  * 	if (length < 128)
     62  * 		goto regular_copy;
     63  *
     64  * 	if (!use_hw_bcopy)
     65  * 		goto regular_copy;
     66  *
     67  * 	blockcopy;
     68  *	restore t_lofault handler if came from kcopy();
     69  *
     70  *	regular_copy;
     71  *	restore t_lofault handler if came from kcopy();
     72  *
     73  * In lofault handler:
     74  *	curthread->t_lofault = (%o5 & ~LOFAULT_SET);	! restore old t_lofault
     75  *	return (errno)
     76  *
     77  */
     78 
     79 /*
     80  * Less than or equal to this number of bytes we will always copy byte-for-byte
     81  */
     82 #define	SMALL_LIMIT	7
     83 
     84 /*
     85  * LOFAULT_SET : Flag set by kzero and kcopy to indicate that t_lofault
     86  * handler was set
     87  */
     88 #define	LOFAULT_SET 2
     89 
     90 /*
     91  * This define is to align data for the unaligned source cases.
     92  * The data1, data2 and data3 is merged into data1 and data2.
     93  * The data3 is preserved for next merge.
         *
         * Expressed as C on 64-bit values, the expansion computes:
         *	data1 = (data1 << lshift) | (data2 >> rshift);
         *	data2 = (data2 << lshift) | (data3 >> rshift);
         * tmp is a scratch register and is overwritten.  The callers visible
         * in this file pass rshift = 64 - lshift.
     94  */
     95 #define	ALIGN_DATA(data1, data2, data3, lshift, rshift, tmp)	\
     96 	sllx	data1, lshift, data1				;\
     97 	srlx	data2, rshift, tmp				;\
     98 	or	data1, tmp, data1				;\
     99 	sllx	data2, lshift, data2				;\
    100 	srlx	data3, rshift, tmp				;\
    101 	or	data2, tmp, data2
    102 /*
    103  * This macro is to align the data. Basically it merges
    104  * data1 and data2 to form double word.
         *
         * Expressed as C on 64-bit values, the expansion computes:
         *	data1 = (data1 << lshift) | (data2 >> rshift);
         * data2 is left unmodified; tmp is a scratch register and is
         * overwritten.
    105  */
    106 #define	ALIGN_DATA_EW(data1, data2, lshift, rshift, tmp)	\
    107 	sllx	data1, lshift, data1				;\
    108 	srlx	data2, rshift, tmp				;\
    109 	or	data1, tmp, data1
    110 
    111 #if !defined(NIAGARA_IMPL)
    112 /*
    113  * Flags set in the lower bits of the t_lofault address:
    114  * FPUSED_FLAG: The FP registers were in use and must be restored
    115  * BCOPY_FLAG: Set for bcopy calls, cleared for kcopy calls
    116  * COPY_FLAGS: Both of the above
    117  *
    118  * Other flags:
    119  * KPREEMPT_FLAG: kpreempt needs to be called
    120  */
    121 #define	FPUSED_FLAG	1
    122 #define	BCOPY_FLAG	2
    123 #define	COPY_FLAGS	(FPUSED_FLAG | BCOPY_FLAG)
    124 #define	KPREEMPT_FLAG	4
    125 
        /*
         * ALIGN_OFF_1_7: eight faligndata steps over the adjacent register
         * pairs (%d0,%d2) through (%d14,%d16), writing results to %d48-%d62.
         * Expanded in the bcopy block loop that handles a source offset
         * between 1 and 7.
         */
    126 #define	ALIGN_OFF_1_7			\
    127 	faligndata %d0, %d2, %d48	;\
    128 	faligndata %d2, %d4, %d50	;\
    129 	faligndata %d4, %d6, %d52	;\
    130 	faligndata %d6, %d8, %d54	;\
    131 	faligndata %d8, %d10, %d56	;\
    132 	faligndata %d10, %d12, %d58	;\
    133 	faligndata %d12, %d14, %d60	;\
    134 	faligndata %d14, %d16, %d62
    135 
        /*
         * ALIGN_OFF_8_15: eight faligndata steps over the adjacent register
         * pairs (%d2,%d4) through (%d16,%d18), writing results to %d48-%d62.
         * Expanded in the bcopy block loop that handles a source offset
         * between 8 and 15.
         */
    136 #define	ALIGN_OFF_8_15			\
    137 	faligndata %d2, %d4, %d48	;\
    138 	faligndata %d4, %d6, %d50	;\
    139 	faligndata %d6, %d8, %d52	;\
    140 	faligndata %d8, %d10, %d54	;\
    141 	faligndata %d10, %d12, %d56	;\
    142 	faligndata %d12, %d14, %d58	;\
    143 	faligndata %d14, %d16, %d60	;\
    144 	faligndata %d16, %d18, %d62
    145 
        /*
         * ALIGN_OFF_16_23: eight faligndata steps over the adjacent register
         * pairs (%d4,%d6) through (%d18,%d20), writing results to %d48-%d62.
         * Expanded in the bcopy block loop that handles a source offset
         * between 16 and 23.
         */
    146 #define	ALIGN_OFF_16_23			\
    147 	faligndata %d4, %d6, %d48	;\
    148 	faligndata %d6, %d8, %d50	;\
    149 	faligndata %d8, %d10, %d52	;\
    150 	faligndata %d10, %d12, %d54	;\
    151 	faligndata %d12, %d14, %d56	;\
    152 	faligndata %d14, %d16, %d58	;\
    153 	faligndata %d16, %d18, %d60	;\
    154 	faligndata %d18, %d20, %d62
    155 
        /*
         * ALIGN_OFF_24_31: eight faligndata steps over the adjacent register
         * pairs (%d6,%d8) through (%d20,%d22), writing results to %d48-%d62.
         * Expanded in the bcopy block loop that handles a source offset
         * between 24 and 31.
         */
    156 #define	ALIGN_OFF_24_31			\
    157 	faligndata %d6, %d8, %d48	;\
    158 	faligndata %d8, %d10, %d50	;\
    159 	faligndata %d10, %d12, %d52	;\
    160 	faligndata %d12, %d14, %d54	;\
    161 	faligndata %d14, %d16, %d56	;\
    162 	faligndata %d16, %d18, %d58	;\
    163 	faligndata %d18, %d20, %d60	;\
    164 	faligndata %d20, %d22, %d62
    165 
        /*
         * ALIGN_OFF_32_39: eight faligndata steps over the adjacent register
         * pairs (%d8,%d10) through (%d22,%d24), writing results to %d48-%d62.
         * Expanded in the bcopy block loop that handles a source offset
         * between 32 and 39.
         */
    166 #define	ALIGN_OFF_32_39			\
    167 	faligndata %d8, %d10, %d48	;\
    168 	faligndata %d10, %d12, %d50	;\
    169 	faligndata %d12, %d14, %d52	;\
    170 	faligndata %d14, %d16, %d54	;\
    171 	faligndata %d16, %d18, %d56	;\
    172 	faligndata %d18, %d20, %d58	;\
    173 	faligndata %d20, %d22, %d60	;\
    174 	faligndata %d22, %d24, %d62
    175 
        /*
         * ALIGN_OFF_40_47: eight faligndata steps over the adjacent register
         * pairs (%d10,%d12) through (%d24,%d26), writing results to %d48-%d62.
         * Expanded in the bcopy block loop that handles a source offset
         * between 40 and 47.
         */
    176 #define	ALIGN_OFF_40_47			\
    177 	faligndata %d10, %d12, %d48	;\
    178 	faligndata %d12, %d14, %d50	;\
    179 	faligndata %d14, %d16, %d52	;\
    180 	faligndata %d16, %d18, %d54	;\
    181 	faligndata %d18, %d20, %d56	;\
    182 	faligndata %d20, %d22, %d58	;\
    183 	faligndata %d22, %d24, %d60	;\
    184 	faligndata %d24, %d26, %d62
    185 
        /*
         * ALIGN_OFF_48_55: eight faligndata steps over the adjacent register
         * pairs (%d12,%d14) through (%d26,%d28), writing results to %d48-%d62.
         * Expanded in the bcopy block loop that handles a source offset
         * between 48 and 55.
         */
    186 #define	ALIGN_OFF_48_55			\
    187 	faligndata %d12, %d14, %d48	;\
    188 	faligndata %d14, %d16, %d50	;\
    189 	faligndata %d16, %d18, %d52	;\
    190 	faligndata %d18, %d20, %d54	;\
    191 	faligndata %d20, %d22, %d56	;\
    192 	faligndata %d22, %d24, %d58	;\
    193 	faligndata %d24, %d26, %d60	;\
    194 	faligndata %d26, %d28, %d62
    195 
        /*
         * ALIGN_OFF_56_63: eight faligndata steps over the adjacent register
         * pairs (%d14,%d16) through (%d28,%d30), writing results to %d48-%d62.
         * Expanded in the bcopy block loop reached by falling through the
         * offset dispatch (the remaining, highest source-offset range).
         */
    196 #define	ALIGN_OFF_56_63			\
    197 	faligndata %d14, %d16, %d48	;\
    198 	faligndata %d16, %d18, %d50	;\
    199 	faligndata %d18, %d20, %d52	;\
    200 	faligndata %d20, %d22, %d54	;\
    201 	faligndata %d22, %d24, %d56	;\
    202 	faligndata %d24, %d26, %d58	;\
    203 	faligndata %d26, %d28, %d60	;\
    204 	faligndata %d28, %d30, %d62
    205 
    206 #define	VIS_BLOCKSIZE		64
    207 
    208 /*
    209  * Size of stack frame in order to accommodate a 64-byte aligned
    210  * floating-point register save area and 2 64-bit temp locations.
    211  * All copy functions use three quadrants of fp registers; to assure a
    212  * block-aligned three block buffer in which to save we must reserve
    213  * four blocks on stack.
    214  *
    215  *    _______________________________________ <-- %fp + STACK_BIAS
    216  *    | We may need to preserve 3 quadrants |
    217  *    | of fp regs, but since we do so with |
    218  *    | BST/BLD we need room in which to    |
    219  *    | align to VIS_BLOCKSIZE bytes.  So   |
    220  *    | this area is 4 * VIS_BLOCKSIZE.     | <--  - SAVED_FPREGS_OFFSET
    221  *    |-------------------------------------|
    222  *    | 8 bytes to save %fprs               | <--  - SAVED_FPRS_OFFSET
    223  *    |-------------------------------------|
    224  *    | 8 bytes to save %gsr                | <--  - SAVED_GSR_OFFSET
    225  *    ---------------------------------------
    226  */
    227 #define HWCOPYFRAMESIZE         ((VIS_BLOCKSIZE * (3 + 1)) + (2 * 8))
    228 #define SAVED_FPREGS_OFFSET     (VIS_BLOCKSIZE * 4)
    229 #define SAVED_FPREGS_ADJUST     ((VIS_BLOCKSIZE * 3) + 1)
    230 #define SAVED_FPRS_OFFSET       (SAVED_FPREGS_OFFSET + 8)
    231 #define SAVED_GSR_OFFSET        (SAVED_FPRS_OFFSET + 8)
    232 
    233 /*
    234  * In FP copies if we do not have preserved data to restore over
    235  * the fp regs we used then we must zero those regs to avoid
    236  * exposing portions of the data to later threads (data security).
         *
         * %f0 and %f2 are cleared with fzero; every other register is then
         * written with the sum or product of those two zeroed registers.
         * The registers written are the even-numbered %f0-%f30 and %f48-%f62.
    237  */
    238 #define	FZERO				\
    239 	fzero	%f0			;\
    240 	fzero	%f2			;\
    241 	faddd	%f0, %f2, %f4		;\
    242 	fmuld	%f0, %f2, %f6		;\
    243 	faddd	%f0, %f2, %f8		;\
    244 	fmuld	%f0, %f2, %f10		;\
    245 	faddd	%f0, %f2, %f12		;\
    246 	fmuld	%f0, %f2, %f14		;\
    247 	faddd	%f0, %f2, %f16		;\
    248 	fmuld	%f0, %f2, %f18		;\
    249 	faddd	%f0, %f2, %f20		;\
    250 	fmuld	%f0, %f2, %f22		;\
    251 	faddd	%f0, %f2, %f24		;\
    252 	fmuld	%f0, %f2, %f26		;\
    253 	faddd	%f0, %f2, %f28		;\
    254 	fmuld	%f0, %f2, %f30		;\
    255 	faddd	%f0, %f2, %f48		;\
    256 	fmuld	%f0, %f2, %f50		;\
    257 	faddd	%f0, %f2, %f52		;\
    258 	fmuld	%f0, %f2, %f54		;\
    259 	faddd	%f0, %f2, %f56		;\
    260 	fmuld	%f0, %f2, %f58		;\
    261 	faddd	%f0, %f2, %f60		;\
    262 	fmuld	%f0, %f2, %f62
    263 
    264 #if !defined(lint)
    265 
    266 /*
    267  * Macros to save and restore fp registers to/from the stack.
    268  * Used to save and restore in-use fp registers when we want to use FP.
         *
         * BST_FP_TOSTACK(tmp1): computes
         *	(%fp + STACK_BIAS - SAVED_FPREGS_ADJUST) & -VIS_BLOCKSIZE
         * in tmp1 and issues three block stores (ASI_BLK_P) at consecutive
         * VIS_BLOCKSIZE steps from that address, starting at %f0, %f16 and
         * %f48 respectively, followed by membar #Sync.  tmp1 is overwritten.
    269  */
    270 #define BST_FP_TOSTACK(tmp1)					\
    271 	/* membar #Sync	*/					;\
    272 	add	%fp, STACK_BIAS - SAVED_FPREGS_ADJUST, tmp1	;\
    273 	and	tmp1, -VIS_BLOCKSIZE, tmp1 /* block align */	;\
    274 	stda	%f0, [tmp1]ASI_BLK_P				;\
    275 	add	tmp1, VIS_BLOCKSIZE, tmp1			;\
    276 	stda	%f16, [tmp1]ASI_BLK_P				;\
    277 	add	tmp1, VIS_BLOCKSIZE, tmp1			;\
    278 	stda	%f48, [tmp1]ASI_BLK_P				;\
    279 	membar	#Sync
    280 
        /*
         * BLD_FP_FROMSTACK(tmp1): reloads fp registers from the stack.
         * Computes (%fp + STACK_BIAS - SAVED_FPREGS_ADJUST) & -VIS_BLOCKSIZE
         * in tmp1 and issues three block loads (ASI_BLK_P) at consecutive
         * VIS_BLOCKSIZE steps from that address into the register groups
         * starting at %f0, %f16 and %f48, followed by membar #Sync.
         * tmp1 is overwritten.
         */
    281 #define	BLD_FP_FROMSTACK(tmp1)					\
    282 	/* membar #Sync - provided at copy completion */	;\
    283 	add	%fp, STACK_BIAS - SAVED_FPREGS_ADJUST, tmp1	;\
    284 	and	tmp1, -VIS_BLOCKSIZE, tmp1 /* block align */	;\
    285 	ldda	[tmp1]ASI_BLK_P, %f0				;\
    286 	add	tmp1, VIS_BLOCKSIZE, tmp1			;\
    287 	ldda	[tmp1]ASI_BLK_P, %f16				;\
    288 	add	tmp1, VIS_BLOCKSIZE, tmp1			;\
    289 	ldda	[tmp1]ASI_BLK_P, %f48				;\
    290 	membar	#Sync
    291 #endif	/* lint */
    292 
    293 #endif	/* NIAGARA_IMPL */
    294 /*
    295  * Copy a block of storage, returning an error code if `from' or
    296  * `to' takes a kernel pagefault which cannot be resolved.
    297  * Returns errno value on pagefault error, 0 if all ok
    298  */
    299 
    300 #if defined(lint)
    301 
    302 /* ARGSUSED */
        /*
         * Stub compiled only when `lint' is defined; it supplies the C
         * signature of kcopy and simply returns 0.  The assembly
         * implementation lives in the non-lint branch of this conditional.
         */
    303 int
    304 kcopy(const void *from, void *to, size_t count)
    305 { return(0); }
    306 
    307 #else	/* lint */
    308 
    309 	.seg	".text"
    310 	.align	4
    311 
    312 	ENTRY(kcopy)
        /*
         * Entry sequence (both variants): open a register window, save the
         * thread's current t_lofault in %o5, install .copyerr as the new
         * t_lofault, and branch to the shared .do_copy code.  The store that
         * installs the handler sits in the branch delay slot.
         */
    313 
    314 #if !defined(NIAGARA_IMPL)
    315 	save	%sp, -SA(MINFRAME + HWCOPYFRAMESIZE), %sp
    316 	sethi	%hi(.copyerr), %l7		! copyerr is lofault value
    317 	or	%l7, %lo(.copyerr), %l7
    318 	ldn	[THREAD_REG + T_LOFAULT], %o5	! save existing handler
    319 	! Note that we carefully do *not* flag the setting of
    320 	! t_lofault.
    321 	membar	#Sync				! sync error barrier
    322 	b	.do_copy			! common code
    323 	stn	%l7, [THREAD_REG + T_LOFAULT]	! set t_lofault
    324 
    325 /*
    326  * We got here because of a fault during kcopy or bcopy if a fault
    327  * handler existed when bcopy was called.
    328  * Errno value is in %g1.
    329  */
    330 .copyerr:
        /*
         * Point t_lofault at .copyerr2 while recovering, so a fault taken
         * inside this handler ends in the panic below.  %l1 receives the
         * BCOPY_FLAG bit of %o5 (delay slot of the FPUSED_FLAG test) and
         * later also carries KPREEMPT_FLAG.
         */
    331 	sethi	%hi(.copyerr2), %l1
    332 	or	%l1, %lo(.copyerr2), %l1
    333 	membar	#Sync				! sync error barrier
    334 	stn	%l1, [THREAD_REG + T_LOFAULT]	! set t_lofault
    335 	btst	FPUSED_FLAG, %o5
    336 	bz,pt	%xcc, 1f
    337 	and	%o5, BCOPY_FLAG, %l1	! copy flag to %l1
    338 
        /*
         * FPUSED_FLAG was set: restore %gsr, then either reload the saved
         * fp registers (saved %fprs had FPRS_FEF set) or zero them (label 4),
         * and finally restore %fprs.
         */
    339 	membar	#Sync				! sync error barrier
    340 	ldx	[%fp + STACK_BIAS - SAVED_GSR_OFFSET], %o2      ! restore gsr
    341 	wr	%o2, 0, %gsr
    342 
    343 	ld	[%fp + STACK_BIAS - SAVED_FPRS_OFFSET], %o3
    344 	btst	FPRS_FEF, %o3
    345 	bz,pt	%icc, 4f
    346 	  nop
    347 
    348 	! restore fpregs from stack
    349 	BLD_FP_FROMSTACK(%o2)
    350 
    351 	ba,pt	%ncc, 2f
    352 	  wr	%o3, 0, %fprs		! restore fprs
    353 
    354 4:
    355 	FZERO
    356 	wr	%o3, 0, %fprs		! restore fprs
    357 
    358 2:
        /*
         * When the thread's t_lwp is zero, decrement t_preempt (the store is
         * in the delay slot, so it happens on both branch outcomes).  If the
         * count reached zero, consult the CPU's cpu_kprunrun and record
         * KPREEMPT_FLAG in %l1 when it is non-zero.
         */
    359 	ldn	[THREAD_REG + T_LWP], %o2
    360 	brnz,pt	%o2, 1f
    361 	  nop
    362 
    363 	ldsb	[THREAD_REG + T_PREEMPT], %l0
    364 	deccc	%l0
    365 	bnz,pn	%ncc, 1f
    366 	  stb	%l0, [THREAD_REG + T_PREEMPT]
    367 
    368 	! Check for a kernel preemption request
    369 	ldn	[THREAD_REG + T_CPU], %l0
    370 	ldub	[%l0 + CPU_KPRUNRUN], %l0
    371 	brnz,a,pt	%l0, 1f	! Need to call kpreempt?
    372 	  or	%l1, KPREEMPT_FLAG, %l1	! If so, set the flag
    373 
    374 	! The kcopy will always set a t_lofault handler. If it fires,
    375 	! we're expected to just return the error code and not to
    376 	! invoke any existing error handler. As far as bcopy is concerned,
    377 	! we only set t_lofault if there was an existing lofault handler.
    378 	! In that case we're expected to invoke the previously existing
    379 	! handler after resetting the t_lofault value.
    380 1:
    381 	andn	%o5, COPY_FLAGS, %o5	! remove flags from lofault address
    382 	membar	#Sync				! sync error barrier
    383 	stn	%o5, [THREAD_REG + T_LOFAULT]	! restore old t_lofault
    384 
    385 	! call kpreempt if necessary
    386 	btst	KPREEMPT_FLAG, %l1
    387 	bz,pt	%icc, 2f
    388 	  nop
    389 	call	kpreempt
    390 	  rdpr	%pil, %o0	! pass %pil
    391 2:
        /*
         * BCOPY_FLAG clear: return to the caller with %g1 as the result.
         * BCOPY_FLAG set: jump to the previously installed handler in %o5.
         */
    392 	btst	BCOPY_FLAG, %l1
    393 	bnz,pn	%ncc, 3f
    394 	nop
    395 	ret
    396 	restore	%g1, 0, %o0
    397 
    398 3:
    399 	! We're here via bcopy. There must have been an error handler
    400 	! in place otherwise we would have died a nasty death already.
    401 	jmp	%o5				! goto real handler
    402 	restore	%g0, 0, %o0			! dispose of copy window
    403 
    404 /*
    405  * We got here because of a fault in .copyerr.  We can't safely restore fp
    406  * state, so we panic.
    407  */
    408 fp_panic_msg:
    409 	.asciz	"Unable to restore fp state after copy operation"
    410 
    411 	.align	4
    412 .copyerr2:
    413 	set	fp_panic_msg, %o0
    414 	call	panic
    415 	  nop
    416 #else	/* NIAGARA_IMPL */
        /*
         * NIAGARA_IMPL variant: a MINFRAME-sized frame only, and the saved
         * handler in %o5 is tagged with LOFAULT_SET before joining .do_copy.
         */
    417 	save	%sp, -SA(MINFRAME), %sp
    418 	set	.copyerr, %l7			! copyerr is lofault value
    419 	ldn	[THREAD_REG + T_LOFAULT], %o5	! save existing handler
    420 	or	%o5, LOFAULT_SET, %o5
    421 	membar	#Sync				! sync error barrier
    422 	b	.do_copy			! common code
    423 	stn	%l7, [THREAD_REG + T_LOFAULT]	! set t_lofault
    424 
    425 /*
    426  * We got here because of a fault during kcopy.
    427  * Errno value is in %g1.
    428  */
    429 .copyerr:
    430 	! The kcopy() *always* sets a t_lofault handler and it ORs LOFAULT_SET
    431 	! into %o5 to indicate it has set t_lofault handler. Need to clear
    432 	! LOFAULT_SET flag before restoring the error handler.
    433 	andn	%o5, LOFAULT_SET, %o5
    434 	membar	#Sync				! sync error barrier
    435 	stn	%o5, [THREAD_REG + T_LOFAULT]	! restore old t_lofault
    436 	ret
    437 	restore	%g1, 0, %o0
    438 #endif	/* NIAGARA_IMPL */
    439 
    440 	SET_SIZE(kcopy)
    441 #endif	/* lint */
    442 
    443 
    444 /*
    445  * Copy a block of storage - must not overlap (from + len <= to).
    446  */
    447 #if defined(lint)
    448 
    449 /* ARGSUSED */
        /*
         * Stub compiled only when `lint' is defined; it supplies the C
         * signature of bcopy and has an empty body.  The assembly
         * implementation lives in the non-lint branch of this conditional.
         */
    450 void
    451 bcopy(const void *from, void *to, size_t count)
    452 {}
    453 
    454 #else	/* lint */
    455 
    456 	ENTRY(bcopy)
    457 
    458 #if !defined(NIAGARA_IMPL)
    459 	save	%sp, -SA(MINFRAME + HWCOPYFRAMESIZE), %sp
    460 	ldn	[THREAD_REG + T_LOFAULT], %o5	! save existing handler
    461 	brz,pt	%o5, .do_copy
    462 	  nop
    463 	sethi	%hi(.copyerr), %l7		! copyerr is lofault value
    464 	or	%l7, %lo(.copyerr), %l7
    465 	membar	#Sync				! sync error barrier
    466 	stn	%l7, [THREAD_REG + T_LOFAULT]	! set t_lofault
    467 	! We've already captured whether t_lofault was zero on entry.
    468 	! We need to mark ourselves as being from bcopy since both
    469 	! kcopy and bcopy use the same code path. If BCOPY_FLAG is
    470 	! set and the saved lofault was zero, we won't reset lofault on
    471 	! returning.
    472 	or	%o5, BCOPY_FLAG, %o5
    473 #else	/* NIAGARA_IMPL */
    474 	save	%sp, -SA(MINFRAME), %sp
    475 	clr	%o5			! flag LOFAULT_SET is not set for bcopy
    476 #endif	/* NIAGARA_IMPL */
    477 
    478 .do_copy:
    479 	cmp	%i2, 12			! for small counts
    480 	blu	%ncc, .bytecp		! just copy bytes
    481 	  .empty
    482 
    483 	cmp	%i2, 128		! for less than 128 bytes
    484 	blu,pn	%ncc, .bcb_punt		! no block st/quad ld
    485 	  nop
    486 
    487 	set	use_hw_bcopy, %o2
    488 	ld	[%o2], %o2
    489 	brz,pn	%o2, .bcb_punt
    490 	  nop
    491 
    492 	subcc	%i1, %i0, %i3
    493 	bneg,a,pn %ncc, 1f
    494 	neg	%i3
    495 1:
    496 	/*
    497 	 * Compare against 256 since we should be checking block addresses
    498 	 * and (dest & ~63) - (src & ~63) can be 3 blocks even if
    499 	 * src = dest + (64 * 3) + 63.
    500 	 */
    501 	cmp	%i3, 256
    502 	blu,pn	%ncc, .bcb_punt
    503 	  nop
    504 
    505 	/*
    506 	 * Copy that reach here have at least 2 blocks of data to copy.
    507 	 */
    508 #if !defined(NIAGARA_IMPL)
    509 	ldn	[THREAD_REG + T_LWP], %o3
    510 	brnz,pt	%o3, 1f
    511 	  nop
    512 
    513 	! kpreempt_disable();
    514 	ldsb	[THREAD_REG + T_PREEMPT], %o2
    515 	inc	%o2
    516 	stb	%o2, [THREAD_REG + T_PREEMPT]
    517 
    518 1:
    519 	rd	%fprs, %o2              ! check for unused fp
    520 	st	%o2, [%fp + STACK_BIAS - SAVED_FPRS_OFFSET] ! save orig %fprs
    521 	btst	FPRS_FEF, %o2
    522 	bz,a,pt	%icc, .do_blockcopy
    523 	wr	%g0, FPRS_FEF, %fprs
    524 
    525 	! save in-use fpregs on stack
    526 	BST_FP_TOSTACK(%o2)
    527 #endif	/* NIAGARA_IMPL */
    528 
    529 .do_blockcopy:
    530 
    531 #if !defined(NIAGARA_IMPL)
    532 	rd	%gsr, %o2
    533 	stx	%o2, [%fp + STACK_BIAS - SAVED_GSR_OFFSET]      ! save gsr
    534 	or	%o5, FPUSED_FLAG, %o5		! fp regs are in use
    535 #endif	/* NIAGARA_IMPL */
    536 
    537 	! Swap src/dst since the code below is memcpy code
    538 	! and memcpy/bcopy have different calling sequences
    539 	mov	%i1, %i5
    540 	mov	%i0, %i1
    541 	mov	%i5, %i0
    542 
    543 	! Block (64 bytes) align the destination.
    544 	andcc	%i0, 0x3f, %i3		! is dst aligned on a 64 bytes
    545 	bz	%xcc, .chksrc		! dst is already double aligned
    546 	sub	%i3, 0x40, %i3
    547 	neg	%i3			! bytes till dst 64 bytes aligned
    548 	sub	%i2, %i3, %i2		! update i2 with new count
    549 
    550 	! Based on source and destination alignment do
    551 	! either 8 bytes, 4 bytes, 2 bytes or byte copy.
    552 
    553 	! Is dst & src 8B aligned
    554 	or	%i0, %i1, %o2
    555 	andcc	%o2, 0x7, %g0
    556 	bz	%ncc, .alewdcp
    557 	nop
    558 
    559 	! Is dst & src 4B aligned
    560 	andcc	%o2, 0x3, %g0
    561 	bz	%ncc, .alwdcp
    562 	nop
    563 
    564 	! Is dst & src 2B aligned
    565 	andcc	%o2, 0x1, %g0
    566 	bz	%ncc, .alhlfwdcp
    567 	nop
    568 
    569 	! 1B aligned
    570 1:	ldub	[%i1], %o2
    571 	stb	%o2, [%i0]
    572 	inc	%i1
    573 	deccc	%i3
    574 	bgu,pt	%ncc, 1b
    575 	inc	%i0
    576 
    577 	ba	.chksrc
    578 	nop
    579 
    580 	! dst & src 4B aligned
    581 .alwdcp:
    582 	ld	[%i1], %o2
    583 	st	%o2, [%i0]
    584 	add	%i1, 0x4, %i1
    585 	subcc	%i3, 0x4, %i3
    586 	bgu,pt	%ncc, .alwdcp
    587 	add	%i0, 0x4, %i0
    588 
    589 	ba	.chksrc
    590 	nop
    591 
    592 	! dst & src 2B aligned
    593 .alhlfwdcp:
    594 	lduh	[%i1], %o2
    595 	stuh	%o2, [%i0]
    596 	add	%i1, 0x2, %i1
    597 	subcc	%i3, 0x2, %i3
    598 	bgu,pt	%ncc, .alhlfwdcp
    599 	add	%i0, 0x2, %i0
    600 
    601 	ba	.chksrc
    602 	nop
    603 
    604 	! dst & src 8B aligned
    605 .alewdcp:
    606 	ldx	[%i1], %o2
    607 	stx	%o2, [%i0]
    608 	add	%i1, 0x8, %i1
    609 	subcc	%i3, 0x8, %i3
    610 	bgu,pt	%ncc, .alewdcp
    611 	add	%i0, 0x8, %i0
    612 
    613 	! Now Destination is block (64 bytes) aligned
    614 .chksrc:
    615 	andn	%i2, 0x3f, %i3		! %i3 count is multiple of block size
    616 	sub	%i2, %i3, %i2		! Residue bytes in %i2
    617 
    618 	mov	ASI_BLK_INIT_ST_QUAD_LDD_P, %asi
    619 
    620 #if !defined(NIAGARA_IMPL)
    621 	andn	%i1, 0x3f, %l0		! %l0 has block aligned src address
    622 	prefetch [%l0+0x0], #one_read
    623 	andcc	%i1, 0x3f, %g0		! is src 64B aligned
    624 	bz,pn	%ncc, .blkcpy
    625 	nop
    626 
    627 	! handle misaligned source cases
    628 	alignaddr %i1, %g0, %g0		! generate %gsr
    629 
    630 	srl	%i1, 0x3, %l1		! src add bits 3, 4, 5 are now least
    631 					! significant in %l1
    632 	andcc	%l1, 0x7, %l2		! mask everything except bits 1, 2, 3
    633 	add	%i1, %i3, %i1
    634 
    635 	! switch statement to get to right 8 byte block within
    636 	! 64 byte block
    637 	cmp	 %l2, 0x4
    638 	bgeu,a	 hlf
    639 	cmp	 %l2, 0x6
    640 	cmp	 %l2, 0x2
    641 	bgeu,a	 sqtr
    642 	nop
    643 	cmp	 %l2, 0x1
    644 	be,a	 off15
    645 	nop
    646 	ba	 off7
    647 	nop
    648 sqtr:
    649 	be,a	 off23
    650 	nop
    651 	ba,a	 off31
    652 	nop
    653 
    654 hlf:
    655 	bgeu,a	 fqtr
    656 	nop
    657 	cmp	 %l2, 0x5
    658 	be,a	 off47
    659 	nop
    660 	ba	 off39
    661 	nop
    662 fqtr:
    663 	be,a	 off55
    664 	nop
    665 
    666 	! Falls through when the source offset is greater than 56
    667 	ldd	[%l0+0x38], %d14
    668 	prefetch [%l0+0x40], #one_read
    669 	prefetch [%l0+0x80], #one_read
    670 7:
    671 	add	%l0, 0x40, %l0
    672 	stxa	%g0, [%i0]%asi		! initialize the cache line
    673 
    674 	ldda	[%l0]ASI_BLK_P, %d16
    675 	ALIGN_OFF_56_63
    676 	fsrc1	%d30, %d14
    677 
    678 	stda	%d48, [%i0]ASI_BLK_P
    679 	subcc	%i3, 0x40, %i3
    680 	add	%i0, 0x40, %i0
    681 	bgu,pt	%ncc, 7b
    682 	prefetch [%l0+0x80], #one_read
    683 	ba	.blkdone
    684 	membar	#Sync
    685 
    686 	! This copy case for source offset between 1 and 7
    687 off7:
    688 	ldda	[%l0]ASI_BLK_P, %d0
    689 	prefetch [%l0+0x40], #one_read
    690 	prefetch [%l0+0x80], #one_read
    691 0:
    692 	add	%l0, 0x40, %l0
    693 	stxa	%g0, [%i0]%asi		! initialize the cache line
    694 
    695 	ldda	[%l0]ASI_BLK_P, %d16
    696 	ALIGN_OFF_1_7
    697 	fsrc1	%d16, %d0
    698 	fsrc1	%d18, %d2
    699 	fsrc1	%d20, %d4
    700 	fsrc1	%d22, %d6
    701 	fsrc1	%d24, %d8
    702 	fsrc1	%d26, %d10
    703 	fsrc1	%d28, %d12
    704 	fsrc1	%d30, %d14
    705 
    706 	stda	%d48, [%i0]ASI_BLK_P
    707 	subcc	%i3, 0x40, %i3
    708 	add	%i0, 0x40, %i0
    709 	bgu,pt	%ncc, 0b
    710 	prefetch [%l0+0x80], #one_read
    711 	ba	.blkdone
    712 	membar	#Sync
    713 
    714 	! This copy case for source offset between 8 and 15
    715 off15:
    716 	ldd	[%l0+0x8], %d2
    717 	ldd	[%l0+0x10], %d4
    718 	ldd	[%l0+0x18], %d6
    719 	ldd	[%l0+0x20], %d8
    720 	ldd	[%l0+0x28], %d10
    721 	ldd	[%l0+0x30], %d12
    722 	ldd	[%l0+0x38], %d14
    723 	prefetch [%l0+0x40], #one_read
    724 	prefetch [%l0+0x80], #one_read
    725 1:
    726 	add	%l0, 0x40, %l0
    727 	stxa	%g0, [%i0]%asi		! initialize the cache line
    728 
    729 	ldda	[%l0]ASI_BLK_P, %d16
    730 	ALIGN_OFF_8_15
    731 	fsrc1	%d18, %d2
    732 	fsrc1	%d20, %d4
    733 	fsrc1	%d22, %d6
    734 	fsrc1	%d24, %d8
    735 	fsrc1	%d26, %d10
    736 	fsrc1	%d28, %d12
    737 	fsrc1	%d30, %d14
    738 
    739 	stda	%d48, [%i0]ASI_BLK_P
    740 	subcc	%i3, 0x40, %i3
    741 	add	%i0, 0x40, %i0
    742 	bgu,pt	%ncc, 1b
    743 	prefetch [%l0+0x80], #one_read
    744 	ba	.blkdone
    745 	membar	#Sync
    746 
    747 	! This copy case for source offset between 16 and 23
    748 off23:
    749 	ldd	[%l0+0x10], %d4
    750 	ldd	[%l0+0x18], %d6
    751 	ldd	[%l0+0x20], %d8
    752 	ldd	[%l0+0x28], %d10
    753 	ldd	[%l0+0x30], %d12
    754 	ldd	[%l0+0x38], %d14
    755 	prefetch [%l0+0x40], #one_read
    756 	prefetch [%l0+0x80], #one_read
    757 2:
    758 	add	%l0, 0x40, %l0
    759 	stxa	%g0, [%i0]%asi		! initialize the cache line
    760 
    761 	ldda	[%l0]ASI_BLK_P, %d16
    762 	ALIGN_OFF_16_23
    763 	fsrc1	%d20, %d4
    764 	fsrc1	%d22, %d6
    765 	fsrc1	%d24, %d8
    766 	fsrc1	%d26, %d10
    767 	fsrc1	%d28, %d12
    768 	fsrc1	%d30, %d14
    769 
    770 	stda	%d48, [%i0]ASI_BLK_P
    771 	subcc	%i3, 0x40, %i3
    772 	add	%i0, 0x40, %i0
    773 	bgu,pt	%ncc, 2b
    774 	prefetch [%l0+0x80], #one_read
    775 	ba	.blkdone
    776 	membar	#Sync
    777 
    778 	! This copy case for source offset between 24 and 31
    779 off31:
    780 	ldd	[%l0+0x18], %d6
    781 	ldd	[%l0+0x20], %d8
    782 	ldd	[%l0+0x28], %d10
    783 	ldd	[%l0+0x30], %d12
    784 	ldd	[%l0+0x38], %d14
    785 	prefetch [%l0+0x40], #one_read
    786 	prefetch [%l0+0x80], #one_read
    787 3:
    788 	add	%l0, 0x40, %l0
    789 	stxa	%g0, [%i0]%asi		! initialize the cache line
    790 
    791 	ldda	[%l0]ASI_BLK_P, %d16
    792 	ALIGN_OFF_24_31
    793 	fsrc1	%d22, %d6
    794 	fsrc1	%d24, %d8
    795 	fsrc1	%d26, %d10
    796 	fsrc1	%d28, %d12
    797 	fsrc1	%d30, %d14
    798 
    799 	stda	%d48, [%i0]ASI_BLK_P
    800 	subcc	%i3, 0x40, %i3
    801 	add	%i0, 0x40, %i0
    802 	bgu,pt	%ncc, 3b
    803 	prefetch [%l0+0x80], #one_read
    804 	ba	.blkdone
    805 	membar	#Sync
    806 
    807 	! This copy case for source offset between 32 and 39
    808 off39:
    809 	ldd	[%l0+0x20], %d8
    810 	ldd	[%l0+0x28], %d10
    811 	ldd	[%l0+0x30], %d12
    812 	ldd	[%l0+0x38], %d14
    813 	prefetch [%l0+0x40], #one_read
    814 	prefetch [%l0+0x80], #one_read
    815 4:
    816 	add	%l0, 0x40, %l0
    817 	stxa	%g0, [%i0]%asi		! initialize the cache line
    818 
    819 	ldda	[%l0]ASI_BLK_P, %d16
    820 	ALIGN_OFF_32_39
    821 	fsrc1	%d24, %d8
    822 	fsrc1	%d26, %d10
    823 	fsrc1	%d28, %d12
    824 	fsrc1	%d30, %d14
    825 
    826 	stda	%d48, [%i0]ASI_BLK_P
    827 	subcc	%i3, 0x40, %i3
    828 	add	%i0, 0x40, %i0
    829 	bgu,pt	%ncc, 4b
    830 	prefetch [%l0+0x80], #one_read
    831 	ba	.blkdone
    832 	membar	#Sync
    833 
    834 	! This copy case for source offset between 40 and 47
    835 off47:
    836 	ldd	[%l0+0x28], %d10
    837 	ldd	[%l0+0x30], %d12
    838 	ldd	[%l0+0x38], %d14
    839 	prefetch [%l0+0x40], #one_read
    840 	prefetch [%l0+0x80], #one_read
    841 5:
    842 	add	%l0, 0x40, %l0
    843 	stxa	%g0, [%i0]%asi		! initialize the cache line
    844 
    845 	ldda	[%l0]ASI_BLK_P, %d16
    846 	ALIGN_OFF_40_47
    847 	fsrc1	%d26, %d10
    848 	fsrc1	%d28, %d12
    849 	fsrc1	%d30, %d14
    850 
    851 	stda	%d48, [%i0]ASI_BLK_P
    852 	subcc	%i3, 0x40, %i3
    853 	add	%i0, 0x40, %i0
    854 	bgu,pt	%ncc, 5b
    855 	prefetch [%l0+0x80], #one_read
    856 	ba	.blkdone
    857 	membar	#Sync
    858 
    859 	! This copy case for source offset between 48 and 55
    860 off55:
    861 	ldd	[%l0+0x30], %d12
    862 	ldd	[%l0+0x38], %d14
    863 	prefetch [%l0+0x40], #one_read
    864 	prefetch [%l0+0x80], #one_read
    865 6:
    866 	add	%l0, 0x40, %l0
    867 	stxa	%g0, [%i0]%asi		! initialize the cache line
    868 
    869 	ldda	[%l0]ASI_BLK_P, %d16
    870 	ALIGN_OFF_48_55
    871 	fsrc1	%d28, %d12
    872 	fsrc1	%d30, %d14
    873 
    874 	stda	%d48, [%i0]ASI_BLK_P
    875 	subcc	%i3, 0x40, %i3
    876 	add	%i0, 0x40, %i0
    877 	bgu,pt	%ncc, 6b
    878 	prefetch [%l0+0x80], #one_read
    879 	ba	.blkdone
    880 	membar	#Sync
    881 
    882 	! Both source and destination are block aligned.
    883 .blkcpy:
    884 	prefetch [%i1+0x40], #one_read
    885 	prefetch [%i1+0x80], #one_read
    886 8:
    887 	stxa	%g0, [%i0]%asi		! initialize the cache line
    888 	ldda	[%i1]ASI_BLK_P, %d0
    889 	stda	%d0, [%i0]ASI_BLK_P
    890 
    891 	add	%i1, 0x40, %i1
    892 	subcc	%i3, 0x40, %i3
    893 	add	%i0, 0x40, %i0
    894 	bgu,pt	%ncc, 8b
    895 	prefetch [%i1+0x80], #one_read
    896 	membar	#Sync
    897 
    898 .blkdone:
    899 #else	/* NIAGARA_IMPL */
    900 	andcc	%i1, 0xf, %o2		! is src quadword aligned
    901 	bz,pn	%xcc, .blkcpy		! src offset in %o2
    902 	nop
    903 	cmp	%o2, 0x8
    904 	bg	.cpy_upper_double
    905 	nop
    906 	bl	.cpy_lower_double
    907 	nop
    908 
    909 	! Falls through when source offset is equal to 8 i.e.
    910 	! source is double word aligned.
    911 	! In this case no shift/merge of data is required
    912 	sub	%i1, %o2, %i1		! align the src at 16 bytes.
    913 	andn	%i1, 0x3f, %l0		! %l0 has block aligned source
    914 	prefetch [%l0+0x0], #one_read
    915 	ldda	[%i1+0x0]%asi, %l2
    916 loop0:
    917 	ldda	[%i1+0x10]%asi, %l4
    918 	prefetch [%l0+0x40], #one_read
    919 
    920 	stxa	%l3, [%i0+0x0]%asi
    921 	stxa	%l4, [%i0+0x8]%asi
    922 
    923 	ldda	[%i1+0x20]%asi, %l2
    924 	stxa	%l5, [%i0+0x10]%asi
    925 	stxa	%l2, [%i0+0x18]%asi
    926 
    927 	ldda	[%i1+0x30]%asi, %l4
    928 	stxa	%l3, [%i0+0x20]%asi
    929 	stxa	%l4, [%i0+0x28]%asi
    930 
    931 	ldda	[%i1+0x40]%asi, %l2
    932 	stxa	%l5, [%i0+0x30]%asi
    933 	stxa	%l2, [%i0+0x38]%asi
    934 
    935 	add	%l0, 0x40, %l0
    936 	add	%i1, 0x40, %i1
    937 	subcc	%i3, 0x40, %i3
    938 	bgu,pt	%xcc, loop0
    939 	add	%i0, 0x40, %i0
    940 	ba	.blkdone
    941 	add	%i1, %o2, %i1		! increment the source by src offset
    942 					! the src offset was stored in %o2
    943 
    944 .cpy_lower_double:
    945 	sub	%i1, %o2, %i1		! align the src at 16 bytes.
    946 	sll	%o2, 3, %o0		! %o0 left shift
    947 	mov	0x40, %o1
    948 	sub	%o1, %o0, %o1		! %o1 right shift = (64 - left shift)
    949 	andn	%i1, 0x3f, %l0		! %l0 has block aligned source
    950 	prefetch [%l0+0x0], #one_read
    951 	ldda	[%i1+0x0]%asi, %l2	! partial data in %l2 and %l3 has
    952 					! complete data
    953 loop1:
    954 	ldda	[%i1+0x10]%asi, %l4	! %l4 has partial data for this read.
    955 	ALIGN_DATA(%l2, %l3, %l4, %o0, %o1, %l6)	! merge %l2, %l3 and %l4
    956 							! into %l2 and %l3
    957 	prefetch [%l0+0x40], #one_read
    958 	stxa	%l2, [%i0+0x0]%asi
    959 	stxa	%l3, [%i0+0x8]%asi
    960 
    961 	ldda	[%i1+0x20]%asi, %l2
    962 	ALIGN_DATA(%l4, %l5, %l2, %o0, %o1, %l6)	! merge %l2 with %l5 and
    963 	stxa	%l4, [%i0+0x10]%asi			! %l4 from previous read
    964 	stxa	%l5, [%i0+0x18]%asi			! into %l4 and %l5
    965 
    966 	! Repeat the same for next 32 bytes.
    967 
    968 	ldda	[%i1+0x30]%asi, %l4
    969 	ALIGN_DATA(%l2, %l3, %l4, %o0, %o1, %l6)
    970 	stxa	%l2, [%i0+0x20]%asi
    971 	stxa	%l3, [%i0+0x28]%asi
    972 
    973 	ldda	[%i1+0x40]%asi, %l2
    974 	ALIGN_DATA(%l4, %l5, %l2, %o0, %o1, %l6)
    975 	stxa	%l4, [%i0+0x30]%asi
    976 	stxa	%l5, [%i0+0x38]%asi
    977 
    978 	add	%l0, 0x40, %l0
    979 	add	%i1, 0x40, %i1
    980 	subcc	%i3, 0x40, %i3
    981 	bgu,pt	%xcc, loop1
    982 	add	%i0, 0x40, %i0
    983 	ba	.blkdone
    984 	add	%i1, %o2, %i1		! increment the source by src offset
    985 					! the src offset was stored in %o2
    986 
    987 .cpy_upper_double:
    988 	sub	%i1, %o2, %i1		! align the src at 16 bytes.
    989 	mov	0x8, %o0
    990 	sub	%o2, %o0, %o0
    991 	sll	%o0, 3, %o0		! %o0 left shift
    992 	mov	0x40, %o1
    993 	sub	%o1, %o0, %o1		! %o1 right shift = (64 - left shift)
    994 	andn	%i1, 0x3f, %l0		! %l0 has block aligned source
    995 	prefetch [%l0+0x0], #one_read
    996 	ldda	[%i1+0x0]%asi, %l2	! partial data in %l3 for this read and
    997 					! no data in %l2
    998 loop2:
    999 	ldda	[%i1+0x10]%asi, %l4	! %l4 has complete data and %l5 has
   1000 					! partial
   1001 	ALIGN_DATA(%l3, %l4, %l5, %o0, %o1, %l6)	! merge %l3, %l4 and %l5
   1002 							! into %l3 and %l4
   1003 	prefetch [%l0+0x40], #one_read
   1004 	stxa	%l3, [%i0+0x0]%asi
   1005 	stxa	%l4, [%i0+0x8]%asi
   1006 
   1007 	ldda	[%i1+0x20]%asi, %l2
   1008 	ALIGN_DATA(%l5, %l2, %l3, %o0, %o1, %l6)	! merge %l2 and %l3 with
   1009 	stxa	%l5, [%i0+0x10]%asi			! %l5 from previous read
   1010 	stxa	%l2, [%i0+0x18]%asi			! into %l5 and %l2
   1011 
   1012 	! Repeat the same for next 32 bytes.
   1013 
   1014 	ldda	[%i1+0x30]%asi, %l4
   1015 	ALIGN_DATA(%l3, %l4, %l5, %o0, %o1, %l6)
   1016 	stxa	%l3, [%i0+0x20]%asi
   1017 	stxa	%l4, [%i0+0x28]%asi
   1018 
   1019 	ldda	[%i1+0x40]%asi, %l2
   1020 	ALIGN_DATA(%l5, %l2, %l3, %o0, %o1, %l6)
   1021 	stxa	%l5, [%i0+0x30]%asi
   1022 	stxa	%l2, [%i0+0x38]%asi
   1023 
   1024 	add	%l0, 0x40, %l0
   1025 	add	%i1, 0x40, %i1
   1026 	subcc	%i3, 0x40, %i3
   1027 	bgu,pt	%xcc, loop2
   1028 	add	%i0, 0x40, %i0
   1029 	ba	.blkdone
   1030 	add	%i1, %o2, %i1		! increment the source by src offset
   1031 					! the src offset was stored in %o2
   1032 
   1033 
   1034 	! Both Source and Destination are block aligned.
   1035 	! Do fast copy using ASI_BLK_INIT_ST_QUAD_LDD_P
   1036 .blkcpy:
   1037 	prefetch [%i1+0x0], #one_read
   1038 1:
   1039 	ldda	[%i1+0x0]%asi, %l0
   1040 	ldda	[%i1+0x10]%asi, %l2
   1041 	prefetch [%i1+0x40], #one_read
   1042 
   1043 	stxa	%l0, [%i0+0x0]%asi
   1044 	ldda	[%i1+0x20]%asi, %l4
   1045 	ldda	[%i1+0x30]%asi, %l6
   1046 
   1047 	stxa	%l1, [%i0+0x8]%asi
   1048 	stxa	%l2, [%i0+0x10]%asi
   1049 	stxa	%l3, [%i0+0x18]%asi
   1050 	stxa	%l4, [%i0+0x20]%asi
   1051 	stxa	%l5, [%i0+0x28]%asi
   1052 	stxa	%l6, [%i0+0x30]%asi
   1053 	stxa	%l7, [%i0+0x38]%asi
   1054 
   1055 	add	%i1, 0x40, %i1
   1056 	subcc	%i3, 0x40, %i3
   1057 	bgu,pt	%xcc, 1b
   1058 	add	%i0, 0x40, %i0
   1059 
   1060 .blkdone:
   1061 	membar	#Sync
   1062 #endif	/* NIAGARA_IMPL */
   1063 
   1064 	brz,pt	%i2, .blkexit
   1065 	nop
   1066 
   1067 	! Handle trailing bytes
   1068 	cmp	%i2, 0x8
   1069 	blu,pt	%ncc, .residue
   1070 	nop
   1071 
   1072 	! Can we do some 8B ops
   1073 	or	%i1, %i0, %o2
   1074 	andcc	%o2, 0x7, %g0
   1075 	bnz	%ncc, .last4
   1076 	nop
   1077 
   1078 	! Do 8byte ops as long as possible
   1079 .last8:
   1080 	ldx	[%i1], %o2
   1081 	stx	%o2, [%i0]
   1082 	add	%i1, 0x8, %i1
   1083 	sub	%i2, 0x8, %i2
   1084 	cmp	%i2, 0x8
   1085 	bgu,pt	%ncc, .last8
   1086 	add	%i0, 0x8, %i0
   1087 
   1088 	brz,pt	%i2, .blkexit
   1089 	nop
   1090 
   1091 	ba	.residue
   1092 	nop
   1093 
   1094 .last4:
   1095 	! Can we do 4B ops
   1096 	andcc	%o2, 0x3, %g0
   1097 	bnz	%ncc, .last2
   1098 	nop
   1099 1:
   1100 	ld	[%i1], %o2
   1101 	st	%o2, [%i0]
   1102 	add	%i1, 0x4, %i1
   1103 	sub	%i2, 0x4, %i2
   1104 	cmp	%i2, 0x4
   1105 	bgu,pt	%ncc, 1b
   1106 	add	%i0, 0x4, %i0
   1107 
   1108 	brz,pt	%i2, .blkexit
   1109 	nop
   1110 
   1111 	ba	.residue
   1112 	nop
   1113 
   1114 .last2:
   1115 	! Can we do 2B ops
   1116 	andcc	%o2, 0x1, %g0
   1117 	bnz	%ncc, .residue
   1118 	nop
   1119 
   1120 1:
   1121 	lduh	[%i1], %o2
   1122 	stuh	%o2, [%i0]
   1123 	add	%i1, 0x2, %i1
   1124 	sub	%i2, 0x2, %i2
   1125 	cmp	%i2, 0x2
   1126 	bgu,pt	%ncc, 1b
   1127 	add	%i0, 0x2, %i0
   1128 
   1129 	brz,pt	%i2, .blkexit
   1130 	nop
   1131 
   1132 .residue:
   1133 	ldub	[%i1], %o2
   1134 	stb	%o2, [%i0]
   1135 	inc	%i1
   1136 	deccc	%i2
   1137 	bgu,pt	%ncc, .residue
   1138 	inc	%i0
   1139 
   1140 .blkexit:
   1141 #if !defined(NIAGARA_IMPL)
   1142 	btst	FPUSED_FLAG, %o5
   1143 	bz	%icc, 1f
   1144 	  and	%o5,  COPY_FLAGS, %l1	! Store flags in %l1
   1145 					! We can't clear the flags from %o5 yet
   1146 					! If there's an error, .copyerr will
   1147 					! need them
   1148 
   1149 	ldx	[%fp + STACK_BIAS - SAVED_GSR_OFFSET], %o2      ! restore gsr
   1150 	wr	%o2, 0, %gsr
   1151 
   1152 	ld	[%fp + STACK_BIAS - SAVED_FPRS_OFFSET], %o3
   1153 	btst	FPRS_FEF, %o3
   1154 	bz,pt	%icc, 4f
   1155 	  nop
   1156 
   1157 	! restore fpregs from stack
   1158 	BLD_FP_FROMSTACK(%o2)
   1159 
   1160 	ba,pt	%ncc, 2f
   1161 	  wr	%o3, 0, %fprs		! restore fprs
   1162 
   1163 4:
   1164 	FZERO
   1165 	wr	%o3, 0, %fprs		! restore fprs
   1166 
   1167 2:
   1168 	ldn	[THREAD_REG + T_LWP], %o2
   1169 	brnz,pt	%o2, 1f
   1170 	  nop
   1171 
   1172 	ldsb	[THREAD_REG + T_PREEMPT], %l0
   1173 	deccc	%l0
   1174 	bnz,pn	%ncc, 1f
   1175 	  stb	%l0, [THREAD_REG + T_PREEMPT]
   1176 
   1177 	! Check for a kernel preemption request
   1178 	ldn	[THREAD_REG + T_CPU], %l0
   1179 	ldub	[%l0 + CPU_KPRUNRUN], %l0
   1180 	brnz,a,pt	%l0, 1f	! Need to call kpreempt?
   1181 	  or	%l1, KPREEMPT_FLAG, %l1	! If so, set the flag
   1182 
   1183 1:
   1184 	btst	BCOPY_FLAG, %l1
   1185 	bz,pn	%icc, 3f
   1186 	andncc	%o5, COPY_FLAGS, %o5
   1187 
   1188 	! Here via bcopy. Check to see if the handler was NULL.
   1189 	! If so, just return quietly. Otherwise, reset the
   1190 	! handler and go home.
   1191 	bnz,pn	%ncc, 3f
   1192 	nop
   1193 
   1194 	! Null handler.
   1195 	btst	KPREEMPT_FLAG, %l1
   1196 	bz,pt	%icc, 2f
   1197 	  nop
   1198 	call	kpreempt
   1199 	  rdpr	%pil, %o0	! pass %pil
   1200 2:
   1201 
   1202 	ret
   1203 	restore	%g0, 0, %o0
   1204 
   1205 	! Here via kcopy or bcopy with a handler.
   1206 	! Reset the fault handler.
   1207 3:
   1208 	membar	#Sync
   1209 	stn	%o5, [THREAD_REG + T_LOFAULT]	! restore old t_lofault
   1210 
   1211 	! call kpreempt if necessary
   1212 	btst	KPREEMPT_FLAG, %l1
   1213 	bz,pt	%icc, 4f
   1214 	  nop
   1215 	call	kpreempt
   1216 	  rdpr	%pil, %o0
   1217 4:
   1218 #else	/* NIAGARA_IMPL */
   1219 	membar	#Sync				! sync error barrier
   1220 	! Restore t_lofault handler, if came here from kcopy().
   1221 	tst	%o5
   1222 	bz	%ncc, 1f
   1223 	andn	%o5, LOFAULT_SET, %o5
   1224 	stn	%o5, [THREAD_REG + T_LOFAULT]	! restore old t_lofault
   1225 1:
   1226 #endif	/* NIAGARA_IMPL */
   1227 	ret
   1228 	restore	%g0, 0, %o0
   1229 
   1230 .bcb_punt:
   1231 	!
   1232 	! use aligned transfers where possible
   1233 	!
   1234 	xor	%i0, %i1, %o4		! xor from and to address
   1235 	btst	7, %o4			! if lower three bits zero
   1236 	bz	.aldoubcp		! can align on double boundary
   1237 	.empty	! assembler complaints about label
   1238 
   1239 	xor	%i0, %i1, %o4		! xor from and to address
   1240 	btst	3, %o4			! if lower two bits zero
   1241 	bz	.alwordcp		! can align on word boundary
   1242 	btst	3, %i0			! delay slot, from address unaligned?
   1243 	!
   1244 	! use aligned reads and writes where possible
   1245 	! this differs from wordcp in that it copes
   1246 	! with odd alignment between source and destination
   1247 	! using word reads and writes with the proper shifts
   1248 	! in between to align transfers to and from memory
   1249 	! i0 - src address, i1 - dest address, i2 - count
   1250 	! i3, i4 - tmps used for generating complete word
   1251 	! i5 (word to write)
   1252 	! l0 size in bits of upper part of source word (US)
   1253 	! l1 size in bits of lower part of source word (LS = 32 - US)
   1254 	! l2 size in bits of upper part of destination word (UD)
   1255 	! l3 size in bits of lower part of destination word (LD = 32 - UD)
   1256 	! l4 number of bytes leftover after aligned transfers complete
   1257 	! l5 the number 32
   1258 	!
   1259 	mov	32, %l5			! load an oft-needed constant
   1260 	bz	.align_dst_only
   1261 	btst	3, %i1			! is destnation address aligned?
   1262 	clr	%i4			! clear registers used in either case
   1263 	bz	.align_src_only
   1264 	clr	%l0
   1265 	!
   1266 	! both source and destination addresses are unaligned
   1267 	!
   1268 1:					! align source
   1269 	ldub	[%i0], %i3		! read a byte from source address
   1270 	add	%i0, 1, %i0		! increment source address
   1271 	or	%i4, %i3, %i4		! or in with previous bytes (if any)
   1272 	btst	3, %i0			! is source aligned?
   1273 	add	%l0, 8, %l0		! increment size of upper source (US)
   1274 	bnz,a	1b
   1275 	sll	%i4, 8, %i4		! make room for next byte
   1276 
   1277 	sub	%l5, %l0, %l1		! generate shift left count (LS)
   1278 	sll	%i4, %l1, %i4		! prepare to get rest
   1279 	ld	[%i0], %i3		! read a word
   1280 	add	%i0, 4, %i0		! increment source address
   1281 	srl	%i3, %l0, %i5		! upper src bits into lower dst bits
   1282 	or	%i4, %i5, %i5		! merge
   1283 	mov	24, %l3			! align destination
   1284 1:
   1285 	srl	%i5, %l3, %i4		! prepare to write a single byte
   1286 	stb	%i4, [%i1]		! write a byte
   1287 	add	%i1, 1, %i1		! increment destination address
   1288 	sub	%i2, 1, %i2		! decrement count
   1289 	btst	3, %i1			! is destination aligned?
   1290 	bnz,a	1b
   1291 	sub	%l3, 8, %l3		! delay slot, decrement shift count (LD)
   1292 	sub	%l5, %l3, %l2		! generate shift left count (UD)
   1293 	sll	%i5, %l2, %i5		! move leftover into upper bytes
   1294 	cmp	%l2, %l0		! cmp # reqd to fill dst w old src left
   1295 	bgu	%ncc, .more_needed	! need more to fill than we have
   1296 	nop
   1297 
   1298 	sll	%i3, %l1, %i3		! clear upper used byte(s)
   1299 	srl	%i3, %l1, %i3
   1300 	! get the odd bytes between alignments
   1301 	sub	%l0, %l2, %l0		! regenerate shift count
   1302 	sub	%l5, %l0, %l1		! generate new shift left count (LS)
   1303 	and	%i2, 3, %l4		! must do remaining bytes if count%4 > 0
   1304 	andn	%i2, 3, %i2		! # of aligned bytes that can be moved
   1305 	srl	%i3, %l0, %i4
   1306 	or	%i5, %i4, %i5
   1307 	st	%i5, [%i1]		! write a word
   1308 	subcc	%i2, 4, %i2		! decrement count
   1309 	bz	%ncc, .unalign_out
   1310 	add	%i1, 4, %i1		! increment destination address
   1311 
   1312 	b	2f
   1313 	sll	%i3, %l1, %i5		! get leftover into upper bits
   1314 .more_needed:
   1315 	sll	%i3, %l0, %i3		! save remaining byte(s)
   1316 	srl	%i3, %l0, %i3
   1317 	sub	%l2, %l0, %l1		! regenerate shift count
   1318 	sub	%l5, %l1, %l0		! generate new shift left count
   1319 	sll	%i3, %l1, %i4		! move to fill empty space
   1320 	b	3f
   1321 	or	%i5, %i4, %i5		! merge to complete word
   1322 	!
   1323 	! the source address is aligned and destination is not
   1324 	!
   1325 .align_dst_only:
   1326 	ld	[%i0], %i4		! read a word
   1327 	add	%i0, 4, %i0		! increment source address
   1328 	mov	24, %l0			! initial shift alignment count
   1329 1:
   1330 	srl	%i4, %l0, %i3		! prepare to write a single byte
   1331 	stb	%i3, [%i1]		! write a byte
   1332 	add	%i1, 1, %i1		! increment destination address
   1333 	sub	%i2, 1, %i2		! decrement count
   1334 	btst	3, %i1			! is destination aligned?
   1335 	bnz,a	1b
   1336 	sub	%l0, 8, %l0		! delay slot, decrement shift count
   1337 .xfer:
   1338 	sub	%l5, %l0, %l1		! generate shift left count
   1339 	sll	%i4, %l1, %i5		! get leftover
   1340 3:
   1341 	and	%i2, 3, %l4		! must do remaining bytes if count%4 > 0
   1342 	andn	%i2, 3, %i2		! # of aligned bytes that can be moved
   1343 2:
   1344 	ld	[%i0], %i3		! read a source word
   1345 	add	%i0, 4, %i0		! increment source address
   1346 	srl	%i3, %l0, %i4		! upper src bits into lower dst bits
   1347 	or	%i5, %i4, %i5		! merge with upper dest bits (leftover)
   1348 	st	%i5, [%i1]		! write a destination word
   1349 	subcc	%i2, 4, %i2		! decrement count
   1350 	bz	%ncc, .unalign_out	! check if done
   1351 	add	%i1, 4, %i1		! increment destination address
   1352 	b	2b			! loop
   1353 	sll	%i3, %l1, %i5		! get leftover
   1354 .unalign_out:
   1355 	tst	%l4			! any bytes leftover?
   1356 	bz	%ncc, .cpdone
   1357 	.empty				! allow next instruction in delay slot
   1358 1:
   1359 	sub	%l0, 8, %l0		! decrement shift
   1360 	srl	%i3, %l0, %i4		! upper src byte into lower dst byte
   1361 	stb	%i4, [%i1]		! write a byte
   1362 	subcc	%l4, 1, %l4		! decrement count
   1363 	bz	%ncc, .cpdone		! done?
   1364 	add	%i1, 1, %i1		! increment destination
   1365 	tst	%l0			! any more previously read bytes
   1366 	bnz	%ncc, 1b		! we have leftover bytes
   1367 	mov	%l4, %i2		! delay slot, mv cnt where dbytecp wants
   1368 	b	.dbytecp		! let dbytecp do the rest
   1369 	sub	%i0, %i1, %i0		! i0 gets the difference of src and dst
   1370 	!
   1371 	! the destination address is aligned and the source is not
   1372 	!
   1373 .align_src_only:
   1374 	ldub	[%i0], %i3		! read a byte from source address
   1375 	add	%i0, 1, %i0		! increment source address
   1376 	or	%i4, %i3, %i4		! or in with previous bytes (if any)
   1377 	btst	3, %i0			! is source aligned?
   1378 	add	%l0, 8, %l0		! increment shift count (US)
   1379 	bnz,a	.align_src_only
   1380 	sll	%i4, 8, %i4		! make room for next byte
   1381 	b,a	.xfer
   1382 	!
   1383 	! if from address unaligned for double-word moves,
   1384 	! move bytes till it is, if count is < 56 it could take
   1385 	! longer to align the thing than to do the transfer
   1386 	! in word size chunks right away
   1387 	!
   1388 .aldoubcp:
   1389 	cmp	%i2, 56			! if count < 56, use wordcp, it takes
   1390 	blu,a	%ncc, .alwordcp		! longer to align doubles than words
   1391 	mov	3, %o0			! mask for word alignment
   1392 	call	.alignit		! copy bytes until aligned
   1393 	mov	7, %o0			! mask for double alignment
   1394 	!
   1395 	! source and destination are now double-word aligned
   1396 	! i3 has aligned count returned by alignit
   1397 	!
   1398 	and	%i2, 7, %i2		! unaligned leftover count
   1399 	sub	%i0, %i1, %i0		! i0 gets the difference of src and dst
   1400 5:
   1401 	ldx	[%i0+%i1], %o4		! read from address
   1402 	stx	%o4, [%i1]		! write at destination address
   1403 	subcc	%i3, 8, %i3		! dec count
   1404 	bgu	%ncc, 5b
   1405 	add	%i1, 8, %i1		! delay slot, inc to address
   1406 	cmp	%i2, 4			! see if we can copy a word
   1407 	blu	%ncc, .dbytecp		! if 3 or less bytes use bytecp
   1408 	.empty
   1409 	!
   1410 	! for leftover bytes we fall into wordcp, if needed
   1411 	!
   1412 .wordcp:
   1413 	and	%i2, 3, %i2		! unaligned leftover count
   1414 5:
   1415 	ld	[%i0+%i1], %o4		! read from address
   1416 	st	%o4, [%i1]		! write at destination address
   1417 	subcc	%i3, 4, %i3		! dec count
   1418 	bgu	%ncc, 5b
   1419 	add	%i1, 4, %i1		! delay slot, inc to address
   1420 	b,a	.dbytecp
   1421 
   1422 	! we come here to align copies on word boundaries
   1423 .alwordcp:
   1424 	call	.alignit		! go word-align it
   1425 	mov	3, %o0			! bits that must be zero to be aligned
   1426 	b	.wordcp
   1427 	sub	%i0, %i1, %i0		! i0 gets the difference of src and dst
   1428 
   1429 	!
   1430 	! byte copy, works with any alignment
   1431 	!
   1432 .bytecp:
   1433 	b	.dbytecp
   1434 	sub	%i0, %i1, %i0		! i0 gets difference of src and dst
   1435 
   1436 	!
   1437 	! differenced byte copy, works with any alignment
   1438 	! assumes dest in %i1 and (source - dest) in %i0
   1439 	!
   1440 1:
   1441 	stb	%o4, [%i1]		! write to address
   1442 	inc	%i1			! inc to address
   1443 .dbytecp:
   1444 	deccc	%i2			! dec count
   1445 	bgeu,a	%ncc, 1b		! loop till done
   1446 	ldub	[%i0+%i1], %o4		! read from address
   1447 .cpdone:
   1448 #if !defined(NIAGARA_IMPL)
   1449 	! FPUSED_FLAG will not have been set in any path leading to
   1450 	! this point. No need to deal with it.
   1451 	btst	BCOPY_FLAG, %o5
   1452 	bz,pn	%icc, 2f
   1453 	andncc	%o5, BCOPY_FLAG, %o5
   1454 	! Here via bcopy. Check to see if the handler was NULL.
   1455 	! If so, just return quietly. Otherwise, reset the
   1456 	! handler and go home.
   1457 	bnz,pn	%ncc, 2f
   1458 	nop
   1459 	!
   1460 	! Null handler.
   1461 	!
   1462 	ret
   1463 	restore %g0, 0, %o0
   1464 	! Here via kcopy or bcopy with a handler.
   1465 	! Reset the fault handler.
   1466 2:
   1467 	membar	#Sync
   1468 	stn	%o5, [THREAD_REG + T_LOFAULT]	! restore old t_lofault
   1469 #else	/* NIAGARA_IMPL */
   1470 	membar	#Sync				! sync error barrier
   1471 	! Restore t_lofault handler, if came here from kcopy().
   1472 	tst	%o5
   1473 	bz	%ncc, 1f
   1474 	andn	%o5, LOFAULT_SET, %o5
   1475 	stn	%o5, [THREAD_REG + T_LOFAULT]	! restore old t_lofault
   1476 1:
   1477 #endif	/* NIAGARA_IMPL */
   1478 	ret
   1479 	restore %g0, 0, %o0		! return (0)
   1480 
   1481 /*
   1482  * Common code used to align transfers on word and doubleword
   1483  * boundaries.  Aligns source and destination and returns a count
   1484  * of aligned bytes to transfer in %i3
   1485  */
   1486 1:
   1487 	inc	%i0			! inc from
   1488 	stb	%o4, [%i1]		! write a byte
   1489 	inc	%i1			! inc to
   1490 	dec	%i2			! dec count
   1491 .alignit:
   1492 	btst	%o0, %i0		! %o0 is bit mask to check for alignment
   1493 	bnz,a	1b
   1494 	ldub	[%i0], %o4		! read next byte
   1495 
   1496 	retl
   1497 	andn	%i2, %o0, %i3		! return size of aligned bytes
   1498 	SET_SIZE(bcopy)
   1499 
   1500 #endif	/* lint */
   1501 
   1502 /*
   1503  * Block copy with possibly overlapped operands.
   1504  */
   1505 
   1506 #if defined(lint)
   1507 
        /*
         * Lint-only stub giving the C prototype of ovbcopy(); the routine that is
         * actually assembled is under the #else branch below.
         */
   1508 /*ARGSUSED*/
   1509 void
   1510 ovbcopy(const void *from, void *to, size_t count)
   1511 {}
   1512 
   1513 #else	/* lint */
   1514 
        	!
        	! ovbcopy runs as a leaf routine (no save/restore), so it works
        	! directly on the caller-visible %o registers:
        	!   %o0 = from, %o1 = to, %o2 = count
        	!   %o3 = scratch: abs(from - to) first, then the byte being moved
        	! If count <= abs(from - to) the request is handed to bcopy.
        	! Otherwise the bytes are moved one at a time: backwards when
        	! from < to, forwards in every other case.
        	!
   1515 	ENTRY(ovbcopy)
   1516 	tst	%o2			! check count
   1517 	bgu,a	%ncc, 1f		! count != 0: go check for overlap
   1518 	subcc	%o0, %o1, %o3		! difference of from and to address
        					! (annulled delay slot: only runs when
        					! the branch above is taken)
   1519 
   1520 	retl				! count == 0: nothing to do, return
   1521 	nop
   1522 1:
   1523 	bneg,a	%ncc, 2f		! tests the sign of (from - to) from subcc
   1524 	neg	%o3			! if < 0, make it positive
   1525 2:	cmp	%o2, %o3		! cmp size and abs(from - to)
   1526 	bleu	%ncc, bcopy		! if size <= abs(diff): use bcopy,
   1527 	.empty				!   no overlap
   1528 	cmp	%o0, %o1		! compare from and to addresses
        					! (sits in the delay slot of the bleu
        					! above, so it executes on both paths)
   1529 	blu	%ncc, .ov_bkwd		! if from < to, copy backwards
   1530 	nop
   1531 	!
   1532 	! Copy forwards.
   1533 	!
   1534 .ov_fwd:
   1535 	ldub	[%o0], %o3		! read from address
   1536 	inc	%o0			! inc from address
   1537 	stb	%o3, [%o1]		! write to address
   1538 	deccc	%o2			! dec count
   1539 	bgu	%ncc, .ov_fwd		! loop till done
   1540 	inc	%o1			! inc to address
   1541 
   1542 	retl				! return
   1543 	nop
   1544 	!
   1545 	! Copy backwards.
   1546 	!
        	! %o2 doubles as the byte index: it is decremented first, so the
        	! bytes are moved from offset count-1 down to offset 0.
        	!
   1547 .ov_bkwd:
   1548 	deccc	%o2			! dec count
   1549 	ldub	[%o0 + %o2], %o3	! get byte at end of src
   1550 	bgu	%ncc, .ov_bkwd		! loop till done
   1551 	stb	%o3, [%o1 + %o2]	! delay slot, store at end of dst
   1552 
   1553 	retl				! return
   1554 	nop
   1555 	SET_SIZE(ovbcopy)
   1556 
   1557 #endif	/* lint */
   1558 
   1559 /*
   1560  * hwblkpagecopy()
   1561  *
   1562  * Copies exactly one page.  This routine assumes the caller (ppcopy)
   1563  * has already disabled kernel preemption and has checked
   1564  * use_hw_bcopy.
   1565  */
   1566 #ifdef lint
        /*
         * Lint-only stub giving the C prototype of hwblkpagecopy(); the routine
         * that is actually assembled is under the #else branch below.
         */
   1567 /*ARGSUSED*/
   1568 void
   1569 hwblkpagecopy(const void *src, void *dst)
   1570 { }
   1571 #else /* lint */
   1572 	ENTRY(hwblkpagecopy)
   1573 	save	%sp, -SA(MINFRAME), %sp	! take a window: %l0-%l7 are the copy buffer
   1574 
   1575 	! %i0 - source address (arg)
   1576 	! %i1 - destination address (arg)
   1577 	! %i2 - length of region (not arg)
   1578 
   1579 	set	PAGESIZE, %i2		! bytes still to copy
   1580 
   1581 	/*
   1582 	 * Copying exactly one page and PAGESIZE is a multiple of 0x80.
        	 * Each pass of the loop below moves 0x80 bytes as two 64-byte
        	 * halves: four 16-byte ldda loads fill %l0-%l7, then eight
        	 * 8-byte stxa stores write them out.  All of these accesses go
        	 * through the ASI selected in %asi just below.
   1583 	 */
   1584 	mov	ASI_BLK_INIT_ST_QUAD_LDD_P, %asi
   1585 	prefetch [%i0+0x0], #one_read	! prime the first 0x80 bytes of source
   1586 	prefetch [%i0+0x40], #one_read
   1587 1:
   1588 	prefetch [%i0+0x80], #one_read	! fetch ahead for the next pass
   1589 	prefetch [%i0+0xc0], #one_read
   1590 	ldda	[%i0+0x0]%asi, %l0	! first 64 bytes: load into %l0-%l7
   1591 	ldda	[%i0+0x10]%asi, %l2
   1592 	ldda	[%i0+0x20]%asi, %l4
   1593 	ldda	[%i0+0x30]%asi, %l6
   1594 	stxa	%l0, [%i1+0x0]%asi	! first 64 bytes: store to destination
   1595 	stxa	%l1, [%i1+0x8]%asi
   1596 	stxa	%l2, [%i1+0x10]%asi
   1597 	stxa	%l3, [%i1+0x18]%asi
   1598 	stxa	%l4, [%i1+0x20]%asi
   1599 	stxa	%l5, [%i1+0x28]%asi
   1600 	stxa	%l6, [%i1+0x30]%asi
   1601 	stxa	%l7, [%i1+0x38]%asi
   1602 	ldda	[%i0+0x40]%asi, %l0	! second 64 bytes: load into %l0-%l7
   1603 	ldda	[%i0+0x50]%asi, %l2
   1604 	ldda	[%i0+0x60]%asi, %l4
   1605 	ldda	[%i0+0x70]%asi, %l6
   1606 	stxa	%l0, [%i1+0x40]%asi	! second 64 bytes: store to destination
   1607 	stxa	%l1, [%i1+0x48]%asi
   1608 	stxa	%l2, [%i1+0x50]%asi
   1609 	stxa	%l3, [%i1+0x58]%asi
   1610 	stxa	%l4, [%i1+0x60]%asi
   1611 	stxa	%l5, [%i1+0x68]%asi
   1612 	stxa	%l6, [%i1+0x70]%asi
   1613 	stxa	%l7, [%i1+0x78]%asi
   1614 
   1615 	add	%i0, 0x80, %i0		! advance source
   1616 	subcc	%i2, 0x80, %i2		! 0x80 fewer bytes remaining
   1617 	bgu,pt	%xcc, 1b		! loop while bytes remain
   1618 	add	%i1, 0x80, %i1		! delay slot: advance destination
   1619 
   1620 	membar #Sync			! same barrier the bcopy block path issues at .blkdone
   1621 	ret
   1622 	restore	%g0, 0, %o0		! delay slot: drop the window, caller's %o0 = 0
   1623 	SET_SIZE(hwblkpagecopy)
   1624 #endif	/* lint */
   1625 
   1626 
   1627 /*
   1628  * Transfer data to and from user space -
   1629  * Note that these routines can cause faults
   1630  * It is assumed that the kernel has nothing at
   1631  * less than KERNELBASE in the virtual address space.
   1632  *
   1633  * Note that copyin(9F) and copyout(9F) are part of the
   1634  * DDI/DKI which specifies that they return '-1' on "errors."
   1635  *
   1636  * Sigh.
   1637  *
   1638  * So there's two extremely similar routines - xcopyin() and xcopyout()
   1639  * which return the errno that we've faithfully computed.  This
   1640  * allows other callers (e.g. uiomove(9F)) to work correctly.
   1641  * Given that these are used pretty heavily, we expand the calling
   1642  * sequences inline for all flavours (rather than making wrappers).
   1643  *
   1644  * There are also stub routines for xcopyout_little and xcopyin_little,
   1645  * which currently are intended to handle requests of <= 16 bytes from
   1646  * do_unaligned. Future enhancement to make them handle 8k pages efficiently
   1647  * is left as an exercise...
   1648  */
   1649 
   1650 /*
   1651  * Copy user data to kernel space (copyOP/xcopyOP/copyOP_noerr)
   1652  *
   1653  * General theory of operation:
   1654  *
   1655  * None of the copyops routines grab a window until it's decided that
   1656  * we need to do a HW block copy operation. This saves a window
   1657  * spill/fill when we're called during socket ops. The typical IO
   1658  * path won't cause spill/fill traps.
   1659  *
   1660  * This code uses a set of 4 limits for the maximum size that will
   1661  * be copied given a particular input/output address alignment.
   1662  * the default limits are:
   1663  *
   1664  * single byte aligned - 256 (hw_copy_limit_1)
   1665  * two byte aligned - 512 (hw_copy_limit_2)
   1666  * four byte aligned - 1024 (hw_copy_limit_4)
   1667  * eight byte aligned - 1024 (hw_copy_limit_8)
   1668  *
   1669  * If the value for a particular limit is zero, the copy will be done
   1670  * via the copy loops rather than block store/quad load instructions.
   1671  *
   1672  * Flow:
   1673  *
   1674  * If count == zero return zero.
   1675  *
   1676  * Store the previous lo_fault handler into %g6.
   1677  * Place our secondary lofault handler into %g5.
   1678  * Place the address of our nowindow fault handler into %o3.
   1679  * Place the address of the windowed fault handler into %o4.
   1680  * --> We'll use this handler if we end up grabbing a window
   1681  * --> before we use block initializing store and quad load ASIs
   1682  *
   1683  * If count is less than or equal to SMALL_LIMIT (7) we
   1684  * always do a byte for byte copy.
   1685  *
   1686  * If count is > SMALL_LIMIT, we check the alignment of the input
   1687  * and output pointers. Based on the alignment we check count
   1688  * against a limit based on detected alignment.  If we exceed the
   1689  * alignment value we copy via block initializing store and quad
   1690  * load instructions.
   1691  *
   1692  * If we don't exceed one of the limits, we store -count in %o3,
   1693  * we store the number of chunks (8, 4, 2 or 1 byte) operated
   1694  * on in our basic copy loop in %o2. Following this we branch
   1695  * to the appropriate copy loop and copy that many chunks.
   1696  * Since we've been adding the chunk size to %o3 each time through
   1697  * as well as decrementing %o2, we can tell if any data is
   1698  * left to be copied by examining %o3. If that is zero, we're
   1699  * done and can go home. If not, we figure out what the largest
   1700  * chunk size left to be copied is and branch to that copy loop
   1701  * unless there's only one byte left. We load that as we're
   1702  * branching to code that stores it just before we return.
   1703  *
   1704  * Fault handlers are invoked if we reference memory that has no
   1705  * current mapping.  All forms share the same copyio_fault handler.
   1706  * This routine handles fixing up the stack and general housecleaning.
   1707  * Each copy operation has a simple fault handler that is then called
   1708  * to do the work specific to the individual operation.  The handlers
   1709  * for copyOP and xcopyOP are found at the end of each individual function.
   1710  * The handlers for xcopyOP_little are found at the end of xcopyin_little.
   1711  * The handlers for copyOP_noerr are found at the end of copyin_noerr.
   1712  */
   1713 
   1714 /*
   1715  * Copy kernel data to user space (copyout/xcopyout/xcopyout_little).
   1716  */
   1717 
   1718 #if defined(lint)
   1719 
   1720 /*ARGSUSED*/
   1721 int
   1722 copyout(const void *kaddr, void *uaddr, size_t count)
   1723 { return (0); }	/* lint-only stub; the real routine is the assembly ENTRY(copyout) */
   1724 
   1725 #else	/* lint */
   1726 
   1727 /*
   1728  * We save the arguments in the following registers in case of a fault:
   1729  * 	kaddr - %g2
   1730  * 	uaddr - %g3
   1731  * 	count - %g4
   1732  */
   1733 #define	SAVE_SRC	%g2
   1734 #define	SAVE_DST	%g3
   1735 #define	SAVE_COUNT	%g4
   1736 
   1737 #define	REAL_LOFAULT		%g5
   1738 #define	SAVED_LOFAULT		%g6
   1739 
   1740 /*
   1741  * Generic copyio fault handler.  This is the first line of defense when a
   1742  * fault occurs in (x)copyin/(x)copyout.  In order for this to function
   1743  * properly, the value of the 'real' lofault handler should be in REAL_LOFAULT.
   1744  * This allows us to share common code for all the flavors of the copy
   1745  * operations, including the _noerr versions.
   1746  *
   1747  * Note that this function will restore the original input parameters before
   1748  * calling REAL_LOFAULT.  So the real handler can vector to the appropriate
   1749  * member of the t_copyop structure, if needed.
         *
         * Register contract on entry (see the #defines above):
         *	SAVE_SRC/SAVE_DST/SAVE_COUNT - the caller's original arguments
         *	REAL_LOFAULT  - address of the operation-specific handler
         *	SAVED_LOFAULT - previous t_lofault value; when NIAGARA_IMPL is not
         *			defined its low bits may also carry FPUSED_FLAG
         *
         * copyio_fault executes a restore before dispatching, so it is the entry
         * to use once a register window has been taken.
   1750  */
   1751 	ENTRY(copyio_fault)
   1752 #if !defined(NIAGARA_IMPL)
   1753 	btst	FPUSED_FLAG, SAVED_LOFAULT	! was the FP unit in use for this copy?
   1754 	bz	1f				! no: nothing to put back
   1755 	andn	SAVED_LOFAULT, FPUSED_FLAG, SAVED_LOFAULT	! delay slot: strip the flag
   1756 
        	! The saved %gsr is read back with a 64-bit ldx, matching the
        	! restore at .blkexit in bcopy.  A 32-bit ld would fetch only the
        	! upper word of the slot on this big-endian machine.
        	! NOTE(review): assumes the save side stores %gsr with stx -- confirm.
   1757 	ldx	[%fp + STACK_BIAS - SAVED_GSR_OFFSET], %o2
   1758 	wr	%o2, 0, %gsr		! restore gsr
   1759 
   1760 	ld	[%fp + STACK_BIAS - SAVED_FPRS_OFFSET], %o3
   1761 	btst	FPRS_FEF, %o3		! was FEF set in the saved %fprs?
   1762 	bz	%icc, 4f		! no: just zero the fp registers
   1763 	  nop
   1764 
   1765 	! restore fpregs from stack
   1766 	BLD_FP_FROMSTACK(%o2)
   1767 
   1768 	ba,pt	%ncc, 1f
   1769 	  wr	%o3, 0, %fprs		! restore fprs
   1770 
   1771 4:
   1772 	FZERO				! zero all of the fpregs
   1773 	wr	%o3, 0, %fprs		! restore fprs
   1774 
   1775 1:
        	! NOTE(review): unlike the NIAGARA_IMPL branch below, this branch
        	! does not write SAVED_LOFAULT back to t_lofault -- confirm that
        	! the restore is done elsewhere for this configuration.
   1776 #else	/* NIAGARA_IMPL */
   1777 	membar	#Sync
   1778 	stn	SAVED_LOFAULT, [THREAD_REG + T_LOFAULT]	! restore old t_lofault
   1779 #endif	/* NIAGARA_IMPL */
   1780 
   1781 	restore				! drop the window; the %o regs below
        					! are the original caller's again
   1782 
   1783 	mov	SAVE_SRC, %o0		! hand the original arguments ...
   1784 	mov	SAVE_DST, %o1
   1785 	jmp	REAL_LOFAULT		! ... to the operation-specific handler
   1786 	  mov	SAVE_COUNT, %o2		! delay slot
   1787 	SET_SIZE(copyio_fault)
   1788 
   1789 	ENTRY(copyio_fault_nowindow)
        	!
        	! Fault entry for copy paths that have not taken a register window:
        	! no restore is executed here.  It puts the previous t_lofault back,
        	! reloads the original arguments from the SAVE_* globals and jumps
        	! to the operation-specific handler held in REAL_LOFAULT.
        	!
   1790 	membar	#Sync
   1791 	stn	SAVED_LOFAULT, [THREAD_REG + T_LOFAULT]	! restore old t_lofault
   1792 
   1793 	mov	SAVE_SRC, %o0		! original first argument
   1794 	mov	SAVE_DST, %o1		! original second argument
   1795 	jmp	REAL_LOFAULT		! dispatch to the real handler
   1796 	  mov	SAVE_COUNT, %o2		! delay slot: original count
   1797 	SET_SIZE(copyio_fault_nowindow)
   1798 
   1799 	ENTRY(copyout)
   1800 	sethi	%hi(.copyout_err), REAL_LOFAULT
   1801 	or	REAL_LOFAULT, %lo(.copyout_err), REAL_LOFAULT
   1802 
   1803 .do_copyout:
   1804 	!
   1805 	! Check the length and bail if zero.
   1806 	!
   1807 	tst	%o2
   1808 	bnz,pt	%ncc, 1f
   1809 	  nop
   1810 	retl
   1811 	  clr	%o0
   1812 1:
   1813 	sethi	%hi(copyio_fault), %o4
   1814 	or	%o4, %lo(copyio_fault), %o4
   1815 	sethi	%hi(copyio_fault_nowindow), %o3
   1816 	ldn	[THREAD_REG + T_LOFAULT], SAVED_LOFAULT
   1817 	or	%o3, %lo(copyio_fault_nowindow), %o3
   1818 	membar	#Sync
   1819 	stn	%o3, [THREAD_REG + T_LOFAULT]
   1820 
   1821 	mov	%o0, SAVE_SRC
   1822 	mov	%o1, SAVE_DST
   1823 	mov	%o2, SAVE_COUNT
   1824 
   1825 	!
   1826 	! Check to see if we're more than SMALL_LIMIT (7 bytes).
   1827 	! Run in leaf mode, using the %o regs as our input regs.
   1828 	!
   1829 	subcc	%o2, SMALL_LIMIT, %o3
   1830 	bgu,a,pt %ncc, .dco_ns
   1831 	or	%o0, %o1, %o3
   1832 	!
   1833 	! What was previously ".small_copyout"
   1834 	! Do full differenced copy.
   1835 	!
   1836 .dcobcp:
   1837 	sub	%g0, %o2, %o3		! negate count
   1838 	add	%o0, %o2, %o0		! make %o0 point at the end
   1839 	add	%o1, %o2, %o1		! make %o1 point at the end
   1840 	ba,pt	%ncc, .dcocl
   1841 	ldub	[%o0 + %o3], %o4	! load first byte
   1842 	!
   1843 	! %o0 and %o1 point at the end and remain pointing at the end
   1844 	! of their buffers. We pull things out by adding %o3 (which is
   1845 	! the negation of the length) to the buffer end which gives us
   1846 	! the current location in the buffers. By incrementing %o3 we walk
   1847 	! through both buffers without having to bump each buffer's
   1848 	! pointer. A very fast 4 instruction loop.
   1849 	!
   1850 	.align 16
   1851 .dcocl:
   1852 	stba	%o4, [%o1 + %o3]ASI_USER
   1853 	inccc	%o3
   1854 	bl,a,pt	%ncc, .dcocl
   1855 	ldub	[%o0 + %o3], %o4
   1856 	!
   1857 	! We're done. Go home.
   1858 	!
   1859 	membar	#Sync
   1860 	stn	SAVED_LOFAULT, [THREAD_REG + T_LOFAULT]
   1861 	retl
   1862 	clr	%o0
   1863 	!
   1864 	! Try aligned copies from here.
   1865 	!
   1866 .dco_ns:
   1867 	! %o0 = kernel addr (to be copied from)
   1868 	! %o1 = user addr (to be copied to)
   1869 	! %o2 = length
   1870 	! %o3 = %o0 | %o1 (used for alignment checking)
   1871 	! %o4 is alternate lo_fault
   1872 	! %o5 is original lo_fault
   1873 	!
   1874 	! See if we're single byte aligned. If we are, check the
   1875 	! limit for single byte copies. If we're smaller or equal,
   1876 	! bounce to the byte for byte copy loop. Otherwise do it in
   1877 	! HW (if enabled).
   1878 	!
   1879 	btst	1, %o3
   1880 	bz,pt	%icc, .dcoh8
   1881 	btst	7, %o3
   1882 	!
   1883 	! Single byte aligned. Do we do it via HW or via
   1884 	! byte for byte? Do a quick no memory reference
   1885 	! check to pick up small copies.
   1886 	!
   1887 	sethi	%hi(hw_copy_limit_1), %o3
   1888 	!
   1889 	! Big enough that we need to check the HW limit for
   1890 	! this size copy.
   1891 	!
   1892 	ld	[%o3 + %lo(hw_copy_limit_1)], %o3
   1893 	!
   1894 	! Is HW copy on? If not, do everything byte for byte.
   1895 	!
   1896 	tst	%o3
   1897 	bz,pn	%icc, .dcobcp
   1898 	subcc	%o3, %o2, %o3
   1899 	!
   1900 	! If we're less than or equal to the single byte copy limit,
   1901 	! bop to the copy loop.
   1902 	!
   1903 	bge,pt	%ncc, .dcobcp
   1904 	nop
   1905 	!
   1906 	! We're big enough and copy is on. Do it with HW.
   1907 	!
   1908 	ba,pt	%ncc, .big_copyout
   1909 	nop
   1910 .dcoh8:
   1911 	!
   1912 	! 8 byte aligned?
   1913 	!
   1914 	bnz,a	%ncc, .dcoh4
   1915 	btst	3, %o3
   1916 	!
   1917 	! See if we're in the "small range".
   1918 	! If so, go off and do the copy.
   1919 	! If not, load the hard limit. %o3 is
   1920 	! available for reuse.
   1921 	!
   1922 	sethi	%hi(hw_copy_limit_8), %o3
   1923 	ld	[%o3 + %lo(hw_copy_limit_8)], %o3
   1924 	!
   1925 	! If it's zero, there's no HW bcopy.
   1926 	! Bop off to the aligned copy.
   1927 	!
   1928 	tst	%o3
   1929 	bz,pn	%icc, .dcos8
   1930 	subcc	%o3, %o2, %o3
   1931 	!
   1932 	! We're negative if our size is larger than hw_copy_limit_8.
   1933 	!
   1934 	bge,pt	%ncc, .dcos8
   1935 	nop
   1936 	!
   1937 	! HW assist is on and we're large enough. Do it.
   1938 	!
   1939 	ba,pt	%ncc, .big_copyout
   1940 	nop
   1941 .dcos8:
   1942 	!
   1943 	! Housekeeping for copy loops. Uses same idea as in the byte for
   1944 	! byte copy loop above.
   1945 	!
   1946 	add	%o0, %o2, %o0
   1947 	add	%o1, %o2, %o1
   1948 	sub	%g0, %o2, %o3
   1949 	ba,pt	%ncc, .dodebc
   1950 	srl	%o2, 3, %o2		! Number of 8 byte chunks to copy
   1951 	!
   1952 	! 4 byte aligned?
   1953 	!
   1954 .dcoh4:
   1955 	bnz,pn	%ncc, .dcoh2
   1956 	!
   1957 	! See if we're in the "small range".
   1958 	! If so, go off and do the copy.
   1959 	! If not, load the hard limit. %o3 is
   1960 	! available for reuse.
   1961 	!
   1962 	sethi	%hi(hw_copy_limit_4), %o3
   1963 	ld	[%o3 + %lo(hw_copy_limit_4)], %o3
   1964 	!
   1965 	! If it's zero, there's no HW bcopy.
   1966 	! Bop off to the aligned copy.
   1967 	!
   1968 	tst	%o3
   1969 	bz,pn	%icc, .dcos4
   1970 	subcc	%o3, %o2, %o3
   1971 	!
   1972 	! We're negative if our size is larger than hw_copy_limit_4.
   1973 	!
   1974 	bge,pt	%ncc, .dcos4
   1975 	nop
   1976 	!
   1977 	! HW assist is on and we're large enough. Do it.
   1978 	!
   1979 	ba,pt	%ncc, .big_copyout
   1980 	nop
   1981 .dcos4:
   1982 	add	%o0, %o2, %o0
   1983 	add	%o1, %o2, %o1
   1984 	sub	%g0, %o2, %o3
   1985 	ba,pt	%ncc, .dodfbc
   1986 	srl	%o2, 2, %o2		! Number of 4 byte chunks to copy
   1987 	!
   1988 	! We must be 2 byte aligned. Off we go.
   1989 	! The check for small copies was done in the
   1990 	! delay at .dcoh4
   1991 	!
   1992 .dcoh2:
   1993 	ble	%ncc, .dcos2
   1994 	sethi	%hi(hw_copy_limit_2), %o3
   1995 	ld	[%o3 + %lo(hw_copy_limit_2)], %o3
   1996 	tst	%o3
   1997 	bz,pn	%icc, .dcos2
   1998 	subcc	%o3, %o2, %o3
   1999 	bge,pt	%ncc, .dcos2
   2000 	nop
   2001 	!
   2002 	! HW is on and we're big enough. Do it.
   2003 	!
   2004 	ba,pt	%ncc, .big_copyout
   2005 	nop
   2006 .dcos2:
   2007 	add	%o0, %o2, %o0
   2008 	add	%o1, %o2, %o1
   2009 	sub	%g0, %o2, %o3
   2010 	ba,pt	%ncc, .dodtbc
   2011 	srl	%o2, 1, %o2		! Number of 2 byte chunks to copy
   2012 .small_copyout:
   2013 	!
   2014 	! Why are we doing this AGAIN? There are certain conditions in
   2015 	! big_copyout that will cause us to forego the HW assisted copies
   2016 	! and bounce back to a non-HW assisted copy. This dispatches those
   2017 	! copies. Note that we branch around this in the main line code.
   2018 	!
   2019 	! We make no check for limits or HW enablement here. We've
   2020 	! already been told that we're a poster child so just go off
   2021 	! and do it.
   2022 	!
   2023 	or	%o0, %o1, %o3
   2024 	btst	1, %o3
   2025 	bnz	%icc, .dcobcp		! Most likely
   2026 	btst	7, %o3
   2027 	bz	%icc, .dcos8
   2028 	btst	3, %o3
   2029 	bz	%icc, .dcos4
   2030 	nop
   2031 	ba,pt	%ncc, .dcos2
   2032 	nop
   2033 	.align 32
   2034 .dodebc:
   2035 	ldx	[%o0 + %o3], %o4
   2036 	deccc	%o2
   2037 	stxa	%o4, [%o1 + %o3]ASI_USER
   2038 	bg,pt	%ncc, .dodebc
   2039 	addcc	%o3, 8, %o3
   2040 	!
   2041 	! End of copy loop. Check to see if we're done. Most
   2042 	! eight byte aligned copies end here.
   2043 	!
   2044 	bz,pt	%ncc, .dcofh
   2045 	nop
   2046 	!
   2047 	! Something is left - do it byte for byte.
   2048 	!
   2049 	ba,pt	%ncc, .dcocl
   2050 	ldub	[%o0 + %o3], %o4	! load next byte
   2051 	!
   2052 	! Four byte copy loop. %o2 is the number of 4 byte chunks to copy.
   2053 	!
   2054 	.align 32
   2055 .dodfbc:
   2056 	lduw	[%o0 + %o3], %o4
   2057 	deccc	%o2
   2058 	sta	%o4, [%o1 + %o3]ASI_USER
   2059 	bg,pt	%ncc, .dodfbc
   2060 	addcc	%o3, 4, %o3
   2061 	!
   2062 	! End of copy loop. Check to see if we're done. Most
   2063 	! four byte aligned copies end here.
   2064 	!
   2065 	bz,pt	%ncc, .dcofh
   2066 	nop
   2067 	!
   2068 	! Something is left. Do it byte for byte.
   2069 	!
   2070 	ba,pt	%ncc, .dcocl
   2071 	ldub	[%o0 + %o3], %o4	! load next byte
   2072 	!
   2073 	! two byte aligned copy loop. %o2 is the number of 2 byte chunks to
   2074 	! copy.
   2075 	!
   2076 	.align 32
   2077 .dodtbc:
   2078 	lduh	[%o0 + %o3], %o4
   2079 	deccc	%o2
   2080 	stha	%o4, [%o1 + %o3]ASI_USER
   2081 	bg,pt	%ncc, .dodtbc
   2082 	addcc	%o3, 2, %o3
   2083 	!
   2084 	! End of copy loop. Anything left?
   2085 	!
   2086 	bz,pt	%ncc, .dcofh
   2087 	nop
   2088 	!
   2089 	! Deal with the last byte
   2090 	!
   2091 	ldub	[%o0 + %o3], %o4
   2092 	stba	%o4, [%o1 + %o3]ASI_USER
   2093 .dcofh:
   2094 	membar	#Sync
   2095 	stn	SAVED_LOFAULT, [THREAD_REG + T_LOFAULT]	! restore old t_lofault
   2096 	retl
   2097 	clr	%o0
   2098 
   2099 .big_copyout:
   2100 	! We're going to go off and do a block copy.
   2101 	! Switch fault handlers and grab a window. We
   2102 	! don't do a membar #Sync since we've done only
   2103 	! kernel data to this point.
   2104 	stn	%o4, [THREAD_REG + T_LOFAULT]
   2105 
   2106 	! Copyouts that reach here are larger than 256 bytes. The
   2107 	! hw_copy_limit_1 is set to 256. Never set this limit to less
   2108 	! than 128 bytes.
   2109 #if !defined(NIAGARA_IMPL)
   2110 	save	%sp, -SA(MINFRAME + HWCOPYFRAMESIZE), %sp
   2111 
   2112 	rd	%fprs, %o2			! check for unused fp
   2113 	st	%o2, [%fp + STACK_BIAS - SAVED_FPRS_OFFSET]	! save %fprs
   2114 	btst	FPRS_FEF, %o2
   2115 	bz,a,pt	%icc, .do_block_copyout
   2116 	wr	%g0, FPRS_FEF, %fprs
   2117 
   2118 	! save in-use fpregs on stack
   2119 	BST_FP_TOSTACK(%o2)
   2120 #else	/* NIAGARA_IMPL */
   2121 	save	%sp, -SA(MINFRAME), %sp
   2122 #endif	/* NIAGARA_IMPL */
   2123 
   2124 .do_block_copyout:
   2125 
   2126 #if !defined(NIAGARA_IMPL)
   2127 	rd	%gsr, %o2
   2128 	stx	%o2, [%fp + STACK_BIAS - SAVED_GSR_OFFSET]	! save gsr
   2129 	! Set the lower bit in the saved t_lofault to indicate that we need
   2130 	! to clear the %fprs register on the way out.
   2131 	or	SAVED_LOFAULT, FPUSED_FLAG, SAVED_LOFAULT
   2132 #endif	/* NIAGARA_IMPL */
   2133 
   2134 	! Swap src/dst since the code below is memcpy code
   2135 	! and memcpy/bcopy have different calling sequences
   2136 	mov	%i1, %i5
   2137 	mov	%i0, %i1
   2138 	mov	%i5, %i0
   2139 
   2140 	! Block (64 bytes) align the destination.
   2141 	andcc	%i0, 0x3f, %i3		! is dst block aligned
   2142 	bz	%ncc, copyout_blalign	! dst already block aligned
   2143 	sub	%i3, 0x40, %i3
   2144 	neg	%i3			! bytes till dst 64 bytes aligned
   2145 	sub	%i2, %i3, %i2		! update i2 with new count
   2146 
   2147 	! Based on source and destination alignment do
   2148 	! either 8 bytes, 4 bytes, 2 bytes or byte copy.
   2149 
   2150 	! Is dst & src 8B aligned
   2151 	or	%i0, %i1, %o2
   2152 	andcc	%o2, 0x7, %g0
   2153 	bz	%ncc, .co_alewdcp
   2154 	nop
   2155 
   2156 	! Is dst & src 4B aligned
   2157 	andcc	%o2, 0x3, %g0
   2158 	bz	%ncc, .co_alwdcp
   2159 	nop
   2160 
   2161 	! Is dst & src 2B aligned
   2162 	andcc	%o2, 0x1, %g0
   2163 	bz	%ncc, .co_alhlfwdcp
   2164 	nop
   2165 
   2166 	! 1B aligned
   2167 1:	ldub	[%i1], %o2
   2168 	stba	%o2, [%i0]ASI_USER
   2169 	inc	%i1
   2170 	deccc	%i3
   2171 	bgu,pt	%ncc, 1b
   2172 	inc	%i0
   2173 
   2174 	ba	copyout_blalign
   2175 	nop
   2176 
   2177 	! dst & src 4B aligned
   2178 .co_alwdcp:
   2179 	ld	[%i1], %o2
   2180 	sta	%o2, [%i0]ASI_USER
   2181 	add	%i1, 0x4, %i1
   2182 	subcc	%i3, 0x4, %i3
   2183 	bgu,pt	%ncc, .co_alwdcp
   2184 	add	%i0, 0x4, %i0
   2185 
   2186 	ba	copyout_blalign
   2187 	nop
   2188 
   2189 	! dst & src 2B aligned
   2190 .co_alhlfwdcp:
   2191 	lduh	[%i1], %o2
   2192 	stuha	%o2, [%i0]ASI_USER
   2193 	add	%i1, 0x2, %i1
   2194 	subcc	%i3, 0x2, %i3
   2195 	bgu,pt	%ncc, .co_alhlfwdcp
   2196 	add	%i0, 0x2, %i0
   2197 
   2198 	ba	copyout_blalign
   2199 	nop
   2200 
   2201 	! dst & src 8B aligned
   2202 .co_alewdcp:
   2203 	ldx	[%i1], %o2
   2204 	stxa	%o2, [%i0]ASI_USER
   2205 	add	%i1, 0x8, %i1
   2206 	subcc	%i3, 0x8, %i3
   2207 	bgu,pt	%ncc, .co_alewdcp
   2208 	add	%i0, 0x8, %i0
   2209 
   2210 	! Now Destination is block (64 bytes) aligned
   2211 copyout_blalign:
   2212 	andn	%i2, 0x3f, %i3		! %i3 count is multiple of block size
   2213 	sub	%i2, %i3, %i2		! Residue bytes in %i2
   2214 
   2215 	mov	ASI_BLK_INIT_QUAD_LDD_AIUS, %asi
   2216 
   2217 #if !defined(NIAGARA_IMPL)
   2218 	andn	%i1, 0x3f, %l0		! %l0 has block aligned src address
   2219 	prefetch [%l0+0x0], #one_read
   2220 	andcc	%i1, 0x3f, %g0		! is src 64B aligned
   2221 	bz,pn	%ncc, .co_blkcpy
   2222 	nop
   2223 
   2224 	! handle misaligned source cases
   2225 	alignaddr %i1, %g0, %g0		! generate %gsr
   2226 
   2227 	srl	%i1, 0x3, %l1		! src add bits 3, 4, 5 are now least
   2228 					! significant in %l1
   2229 	andcc	%l1, 0x7, %l2		! mask everything except bits 1, 2, 3
   2230 	add	%i1, %i3, %i1
   2231 
   2232 	! switch statement to get to right 8 byte block within
   2233 	! 64 byte block
   2234 	cmp	 %l2, 0x4
   2235 	bgeu,a	 co_hlf
   2236 	cmp	 %l2, 0x6
   2237 	cmp	 %l2, 0x2
   2238 	bgeu,a	 co_sqtr
   2239 	nop
   2240 	cmp	 %l2, 0x1
   2241 	be,a	 co_off15
   2242 	nop
   2243 	ba	 co_off7
   2244 	nop
   2245 co_sqtr:
   2246 	be,a	 co_off23
   2247 	nop
   2248 	ba,a	 co_off31
   2249 	nop
   2250 
   2251 co_hlf:
   2252 	bgeu,a	 co_fqtr
   2253 	nop
   2254 	cmp	 %l2, 0x5
   2255 	be,a	 co_off47
   2256 	nop
   2257 	ba	 co_off39
   2258 	nop
   2259 co_fqtr:
   2260 	be,a	 co_off55
   2261 	nop
   2262 
   2263 	ldd	[%l0+0x38], %d14
   2264 	prefetch [%l0+0x40], #one_read
   2265 	prefetch [%l0+0x80], #one_read
   2266 7:
   2267 	add	%l0, 0x40, %l0
   2268 	stxa	%g0, [%i0]%asi		! initialize the cache line
   2269 
   2270 	ldda	[%l0]ASI_BLK_P, %d16
   2271 	ALIGN_OFF_56_63
   2272 	fsrc1	%d30, %d14
   2273 
   2274 	stda	%d48, [%i0]ASI_BLK_AIUS
   2275 	subcc	%i3, 0x40, %i3
   2276 	add	%i0, 0x40, %i0
   2277 	bgu,pt	%ncc, 7b
   2278 	prefetch [%l0+0x80], #one_read
   2279 	ba	.co_blkdone
   2280 	membar	#Sync
   2281 
   2282 co_off7:
   2283 	ldda	[%l0]ASI_BLK_P, %d0
   2284 	prefetch [%l0+0x40], #one_read
   2285 	prefetch [%l0+0x80], #one_read
   2286 0:
   2287 	add	%l0, 0x40, %l0
   2288 	stxa	%g0, [%i0]%asi		! initialize the cache line
   2289 
   2290 	ldda	[%l0]ASI_BLK_P, %d16
   2291 	ALIGN_OFF_1_7
   2292 	fsrc1	%d16, %d0
   2293 	fsrc1	%d18, %d2
   2294 	fsrc1	%d20, %d4
   2295 	fsrc1	%d22, %d6
   2296 	fsrc1	%d24, %d8
   2297 	fsrc1	%d26, %d10
   2298 	fsrc1	%d28, %d12
   2299 	fsrc1	%d30, %d14
   2300 
   2301 	stda	%d48, [%i0]ASI_BLK_AIUS
   2302 	subcc	%i3, 0x40, %i3
   2303 	add	%i0, 0x40, %i0
   2304 	bgu,pt	%ncc, 0b
   2305 	prefetch [%l0+0x80], #one_read
   2306 	ba	.co_blkdone
   2307 	membar	#Sync
   2308 
   2309 co_off15:
   2310 	ldd	[%l0+0x8], %d2
   2311 	ldd	[%l0+0x10], %d4
   2312 	ldd	[%l0+0x18], %d6
   2313 	ldd	[%l0+0x20], %d8
   2314 	ldd	[%l0+0x28], %d10
   2315 	ldd	[%l0+0x30], %d12
   2316 	ldd	[%l0+0x38], %d14
   2317 	prefetch [%l0+0x40], #one_read
   2318 	prefetch [%l0+0x80], #one_read
   2319 1:
   2320 	add	%l0, 0x40, %l0
   2321 	stxa	%g0, [%i0]%asi		! initialize the cache line
   2322 
   2323 	ldda	[%l0]ASI_BLK_P, %d16
   2324 	ALIGN_OFF_8_15
   2325 	fsrc1	%d18, %d2
   2326 	fsrc1	%d20, %d4
   2327 	fsrc1	%d22, %d6
   2328 	fsrc1	%d24, %d8
   2329 	fsrc1	%d26, %d10
   2330 	fsrc1	%d28, %d12
   2331 	fsrc1	%d30, %d14
   2332 
   2333 	stda	%d48, [%i0]ASI_BLK_AIUS
   2334 	subcc	%i3, 0x40, %i3
   2335 	add	%i0, 0x40, %i0
   2336 	bgu,pt	%ncc, 1b
   2337 	prefetch [%l0+0x80], #one_read
   2338 	ba	.co_blkdone
   2339 	membar	#Sync
   2340 
   2341 co_off23:
   2342 	ldd	[%l0+0x10], %d4
   2343 	ldd	[%l0+0x18], %d6
   2344 	ldd	[%l0+0x20], %d8
   2345 	ldd	[%l0+0x28], %d10
   2346 	ldd	[%l0+0x30], %d12
   2347 	ldd	[%l0+0x38], %d14
   2348 	prefetch [%l0+0x40], #one_read
   2349 	prefetch [%l0+0x80], #one_read
   2350 2:
   2351 	add	%l0, 0x40, %l0
   2352 	stxa	%g0, [%i0]%asi		! initialize the cache line
   2353 
   2354 	ldda	[%l0]ASI_BLK_P, %d16
   2355 	ALIGN_OFF_16_23
   2356 	fsrc1	%d20, %d4
   2357 	fsrc1	%d22, %d6
   2358 	fsrc1	%d24, %d8
   2359 	fsrc1	%d26, %d10
   2360 	fsrc1	%d28, %d12
   2361 	fsrc1	%d30, %d14
   2362 
   2363 	stda	%d48, [%i0]ASI_BLK_AIUS
   2364 	subcc	%i3, 0x40, %i3
   2365 	add	%i0, 0x40, %i0
   2366 	bgu,pt	%ncc, 2b
   2367 	prefetch [%l0+0x80], #one_read
   2368 	ba	.co_blkdone
   2369 	membar	#Sync
   2370 
   2371 co_off31:
   2372 	ldd	[%l0+0x18], %d6
   2373 	ldd	[%l0+0x20], %d8
   2374 	ldd	[%l0+0x28], %d10
   2375 	ldd	[%l0+0x30], %d12
   2376 	ldd	[%l0+0x38], %d14
   2377 	prefetch [%l0+0x40], #one_read
   2378 	prefetch [%l0+0x80], #one_read
   2379 3:
   2380 	add	%l0, 0x40, %l0
   2381 	stxa	%g0, [%i0]%asi		! initialize the cache line
   2382 
   2383 	ldda	[%l0]ASI_BLK_P, %d16
   2384 	ALIGN_OFF_24_31
   2385 	fsrc1	%d22, %d6
   2386 	fsrc1	%d24, %d8
   2387 	fsrc1	%d26, %d10
   2388 	fsrc1	%d28, %d12
   2389 	fsrc1	%d30, %d14
   2390 
   2391 	stda	%d48, [%i0]ASI_BLK_AIUS
   2392 	subcc	%i3, 0x40, %i3
   2393 	add	%i0, 0x40, %i0
   2394 	bgu,pt	%ncc, 3b
   2395 	prefetch [%l0+0x80], #one_read
   2396 	ba	.co_blkdone
   2397 	membar	#Sync
   2398 
   2399 co_off39:
   2400 	ldd	[%l0+0x20], %d8
   2401 	ldd	[%l0+0x28], %d10
   2402 	ldd	[%l0+0x30], %d12
   2403 	ldd	[%l0+0x38], %d14
   2404 	prefetch [%l0+0x40], #one_read
   2405 	prefetch [%l0+0x80], #one_read
   2406 4:
   2407 	add	%l0, 0x40, %l0
   2408 	stxa	%g0, [%i0]%asi		! initialize the cache line
   2409 
   2410 	ldda	[%l0]ASI_BLK_P, %d16
   2411 	ALIGN_OFF_32_39
   2412 	fsrc1	%d24, %d8
   2413 	fsrc1	%d26, %d10
   2414 	fsrc1	%d28, %d12
   2415 	fsrc1	%d30, %d14
   2416 
   2417 	stda	%d48, [%i0]ASI_BLK_AIUS
   2418 	subcc	%i3, 0x40, %i3
   2419 	add	%i0, 0x40, %i0
   2420 	bgu,pt	%ncc, 4b
   2421 	prefetch [%l0+0x80], #one_read
   2422 	ba	.co_blkdone
   2423 	membar	#Sync
   2424 
   2425 co_off47:
   2426 	ldd	[%l0+0x28], %d10
   2427 	ldd	[%l0+0x30], %d12
   2428 	ldd	[%l0+0x38], %d14
   2429 	prefetch [%l0+0x40], #one_read
   2430 	prefetch [%l0+0x80], #one_read
   2431 5:
   2432 	add	%l0, 0x40, %l0
   2433 	stxa	%g0, [%i0]%asi		! initialize the cache line
   2434 
   2435 	ldda	[%l0]ASI_BLK_P, %d16
   2436 	ALIGN_OFF_40_47
   2437 	fsrc1	%d26, %d10
   2438 	fsrc1	%d28, %d12
   2439 	fsrc1	%d30, %d14
   2440 
   2441 	stda	%d48, [%i0]ASI_BLK_AIUS
   2442 	subcc	%i3, 0x40, %i3
   2443 	add	%i0, 0x40, %i0
   2444 	bgu,pt	%ncc, 5b
   2445 	prefetch [%l0+0x80], #one_read
   2446 	ba	.co_blkdone
   2447 	membar	#Sync
   2448 
   2449 co_off55:
   2450 	ldd	[%l0+0x30], %d12
   2451 	ldd	[%l0+0x38], %d14
   2452 	prefetch [%l0+0x40], #one_read
   2453 	prefetch [%l0+0x80], #one_read
   2454 6:
   2455 	add	%l0, 0x40, %l0
   2456 	stxa	%g0, [%i0]%asi		! initialize the cache line
   2457 
   2458 	ldda	[%l0]ASI_BLK_P, %d16
   2459 	ALIGN_OFF_48_55
   2460 	fsrc1	%d28, %d12
   2461 	fsrc1	%d30, %d14
   2462 
   2463 	stda	%d48, [%i0]ASI_BLK_AIUS
   2464 	subcc	%i3, 0x40, %i3
   2465 	add	%i0, 0x40, %i0
   2466 	bgu,pt	%ncc, 6b
   2467 	prefetch [%l0+0x80], #one_read
   2468 	ba	.co_blkdone
   2469 	membar	#Sync
   2470 
   2471 .co_blkcpy:
   2472 	prefetch [%i1+0x40], #one_read
   2473 	prefetch [%i1+0x80], #one_read
   2474 8:
   2475 	stxa	%g0, [%i0]%asi		! initialize the cache line
   2476 	ldda	[%i1]ASI_BLK_P, %d0
   2477 	stda	%d0, [%i0]ASI_BLK_AIUS
   2478 
   2479 	add	%i1, 0x40, %i1
   2480 	subcc	%i3, 0x40, %i3
   2481 	add	%i0, 0x40, %i0
   2482 	bgu,pt	%ncc, 8b
   2483 	prefetch [%i1+0x80], #one_read
   2484 	membar	#Sync
   2485 
   2486 .co_blkdone:
   2487 #else	/* NIAGARA_IMPL */
   2488 	andcc	%i1, 0xf, %o2		! is src quadword aligned
   2489 	bz,pn	%xcc, .co_blkcpy	! src offset in %o2 (last 4-bits)
   2490 	nop
   2491 	cmp	%o2, 0x8
   2492 	bg	.co_upper_double
   2493 	nop
   2494 	bl	.co_lower_double
   2495 	nop
   2496 
   2497 	! Falls through when source offset is equal to 8 i.e.
   2498 	! source is double word aligned.
   2499 	! In this case no shift/merge of data is required
   2500 
   2501 	sub	%i1, %o2, %i1		! align the src at 16 bytes.
   2502 	andn	%i1, 0x3f, %l0		! %l0 has block aligned source
   2503 	prefetch [%l0+0x0], #one_read
   2504 	ldda	[%i1]ASI_BLK_INIT_ST_QUAD_LDD_P, %l2
   2505 .co_loop0:
   2506 	add	%i1, 0x10, %i1
   2507 	ldda	[%i1]ASI_BLK_INIT_ST_QUAD_LDD_P, %l4
   2508 	prefetch [%l0+0x40], #one_read
   2509 
   2510 	stxa	%l3, [%i0+0x0]%asi
   2511 	stxa	%l4, [%i0+0x8]%asi
   2512 
   2513 	add	%i1, 0x10, %i1
   2514 	ldda	[%i1]ASI_BLK_INIT_ST_QUAD_LDD_P, %l2
   2515 
   2516 	stxa	%l5, [%i0+0x10]%asi
   2517 	stxa	%l2, [%i0+0x18]%asi
   2518 
   2519 	add	%i1, 0x10, %i1
   2520 	ldda	[%i1]ASI_BLK_INIT_ST_QUAD_LDD_P, %l4
   2521 
   2522 	stxa	%l3, [%i0+0x20]%asi
   2523 	stxa	%l4, [%i0+0x28]%asi
   2524 
   2525 	add	%i1, 0x10, %i1
   2526 	ldda	[%i1]ASI_BLK_INIT_ST_QUAD_LDD_P, %l2
   2527 
   2528 	stxa	%l5, [%i0+0x30]%asi
   2529 	stxa	%l2, [%i0+0x38]%asi
   2530 
   2531 	add	%l0, 0x40, %l0
   2532 	subcc	%i3, 0x40, %i3
   2533 	bgu,pt	%xcc, .co_loop0
   2534 	add	%i0, 0x40, %i0
   2535 	ba	.co_blkdone
   2536 	add	%i1, %o2, %i1		! increment the source by src offset
   2537 					! the src offset was stored in %o2
   2538 
   2539 .co_lower_double:
   2540 
   2541 	sub	%i1, %o2, %i1		! align the src at 16 bytes.
   2542 	sll	%o2, 3, %o0		! %o0 left shift
   2543 	mov	0x40, %o1
   2544 	sub	%o1, %o0, %o1		! %o1 right shift = (64 - left shift)
   2545 	andn	%i1, 0x3f, %l0		! %l0 has block aligned source
   2546 	prefetch [%l0+0x0], #one_read
   2547 	ldda	[%i1]ASI_BLK_INIT_ST_QUAD_LDD_P, %l2	! partial data in %l2 and %l3 has
   2548 					! complete data
   2549 .co_loop1:
   2550 	add	%i1, 0x10, %i1
   2551 	ldda	[%i1]ASI_BLK_INIT_ST_QUAD_LDD_P, %l4	! %l4 has partial data
   2552 							! for this read.
   2553 	ALIGN_DATA(%l2, %l3, %l4, %o0, %o1, %l6)	! merge %l2, %l3 and %l4
   2554 							! into %l2 and %l3
   2555 	prefetch [%l0+0x40], #one_read
   2556 
   2557 	stxa	%l2, [%i0+0x0]%asi
   2558 	stxa	%l3, [%i0+0x8]%asi
   2559 
   2560 	add	%i1, 0x10, %i1
   2561 	ldda	[%i1]ASI_BLK_INIT_ST_QUAD_LDD_P, %l2
   2562 	ALIGN_DATA(%l4, %l5, %l2, %o0, %o1, %l6)	! merge %l2 with %l5 and
   2563 							! %l4 from previous read
   2564 							! into %l4 and %l5
   2565 	stxa	%l4, [%i0+0x10]%asi
   2566 	stxa	%l5, [%i0+0x18]%asi
   2567 
   2568 	! Repeat the same for next 32 bytes.
   2569 
   2570 	add	%i1, 0x10, %i1
   2571 	ldda	[%i1]ASI_BLK_INIT_ST_QUAD_LDD_P, %l4
   2572 	ALIGN_DATA(%l2, %l3, %l4, %o0, %o1, %l6)
   2573 
   2574 	stxa	%l2, [%i0+0x20]%asi
   2575 	stxa	%l3, [%i0+0x28]%asi
   2576 
   2577 	add	%i1, 0x10, %i1
   2578 	ldda	[%i1]ASI_BLK_INIT_ST_QUAD_LDD_P, %l2
   2579 	ALIGN_DATA(%l4, %l5, %l2, %o0, %o1, %l6)
   2580 
   2581 	stxa	%l4, [%i0+0x30]%asi
   2582 	stxa	%l5, [%i0+0x38]%asi
   2583 
   2584 	add	%l0, 0x40, %l0
   2585 	subcc	%i3, 0x40, %i3
   2586 	bgu,pt	%xcc, .co_loop1
   2587 	add	%i0, 0x40, %i0
   2588 	ba	.co_blkdone
   2589 	add	%i1, %o2, %i1		! increment the source by src offset
   2590 					! the src offset was stored in %o2
   2591 
   2592 .co_upper_double:
   2593 
   2594 	sub	%i1, %o2, %i1		! align the src at 16 bytes.
   2595 	sub	%o2, 0x8, %o0
   2596 	sll	%o0, 3, %o0		! %o0 left shift
   2597 	mov	0x40, %o1
   2598 	sub	%o1, %o0, %o1		! %o1 right shift = (64 - left shift)
   2599 	andn	%i1, 0x3f, %l0		! %l0 has block aligned source
   2600 	prefetch [%l0+0x0], #one_read
   2601 	ldda	[%i1]ASI_BLK_INIT_ST_QUAD_LDD_P, %l2	! partial data in %l3
   2602 							! for this read and
   2603 							! no data in %l2
   2604 .co_loop2:
   2605 	add	%i1, 0x10, %i1
   2606 	ldda	[%i1]ASI_BLK_INIT_ST_QUAD_LDD_P, %l4	! %l4 has complete data
   2607 							! and %l5 has partial
   2608 	ALIGN_DATA(%l3, %l4, %l5, %o0, %o1, %l6)	! merge %l3, %l4 and %l5
   2609 							! into %l3 and %l4
   2610 	prefetch [%l0+0x40], #one_read
   2611 
   2612 	stxa	%l3, [%i0+0x0]%asi
   2613 	stxa	%l4, [%i0+0x8]%asi
   2614 
   2615 	add	%i1, 0x10, %i1
   2616 	ldda	[%i1]ASI_BLK_INIT_ST_QUAD_LDD_P, %l2
   2617 	ALIGN_DATA(%l5, %l2, %l3, %o0, %o1, %l6)	! merge %l2 and %l3 with
   2618 							! %l5 from previous read
   2619 							! into %l5 and %l2
   2620 
   2621 	stxa	%l5, [%i0+0x10]%asi
   2622 	stxa	%l2, [%i0+0x18]%asi
   2623 
   2624 	! Repeat the same for next 32 bytes.
   2625 
   2626 	add	%i1, 0x10, %i1
   2627 	ldda	[%i1]ASI_BLK_INIT_ST_QUAD_LDD_P, %l4
   2628 	ALIGN_DATA(%l3, %l4, %l5, %o0, %o1, %l6)
   2629 
   2630 	stxa	%l3, [%i0+0x20]%asi
   2631 	stxa	%l4, [%i0+0x28]%asi
   2632 
   2633 	add	%i1, 0x10, %i1
   2634 	ldda	[%i1]ASI_BLK_INIT_ST_QUAD_LDD_P, %l2
   2635 	ALIGN_DATA(%l5, %l2, %l3, %o0, %o1, %l6)
   2636 
   2637 	stxa	%l5, [%i0+0x30]%asi
   2638 	stxa	%l2, [%i0+0x38]%asi
   2639 
   2640 	add	%l0, 0x40, %l0
   2641 	subcc	%i3, 0x40, %i3
   2642 	bgu,pt	%xcc, .co_loop2
   2643 	add	%i0, 0x40, %i0
   2644 	ba	.co_blkdone
   2645 	add	%i1, %o2, %i1		! increment the source by src offset
   2646 					! the src offset was stored in %o2
   2647 
   2648 
   2649 	! Do fast copy using ASI_BLK_INIT_ST_QUAD_LDD_P
   2650 .co_blkcpy:
   2651 
   2652 	andn	%i1, 0x3f, %o0		! %o0 has block aligned source
   2653 	prefetch [%o0+0x0], #one_read
   2654 1:
   2655 	ldda	[%i1]ASI_BLK_INIT_ST_QUAD_LDD_P, %l0
   2656 	add	%i1, 0x10, %i1
   2657 	ldda	[%i1]ASI_BLK_INIT_ST_QUAD_LDD_P, %l2
   2658 	add	%i1, 0x10, %i1
   2659 
   2660 	prefetch [%o0+0x40], #one_read
   2661 
   2662 	stxa	%l0, [%i0+0x0]%asi
   2663 
   2664 	ldda	[%i1]ASI_BLK_INIT_ST_QUAD_LDD_P, %l4
   2665 	add	%i1, 0x10, %i1
   2666 	ldda	[%i1]ASI_BLK_INIT_ST_QUAD_LDD_P, %l6
   2667 	add	%i1, 0x10, %i1
   2668 
   2669 	stxa	%l1, [%i0+0x8]%asi
   2670 	stxa	%l2, [%i0+0x10]%asi
   2671 	stxa	%l3, [%i0+0x18]%asi
   2672 	stxa	%l4, [%i0+0x20]%asi
   2673 	stxa	%l5, [%i0+0x28]%asi
   2674 	stxa	%l6, [%i0+0x30]%asi
   2675 	stxa	%l7, [%i0+0x38]%asi
   2676 
   2677 	add	%o0, 0x40, %o0
   2678 	subcc	%i3, 0x40, %i3
   2679 	bgu,pt	%xcc, 1b
   2680 	add	%i0, 0x40, %i0
   2681 
   2682 .co_blkdone:
   2683 	membar	#Sync
   2684 #endif	/* NIAGARA_IMPL */
   2685 
   2686 	brz,pt	%i2, .copyout_exit
   2687 	nop
   2688 
   2689 	! Handle trailing bytes
   2690 	cmp	%i2, 0x8
   2691 	blu,pt	%ncc, .co_residue
   2692 	nop
   2693 
   2694 	! Can we do some 8B ops
   2695 	or	%i1, %i0, %o2
   2696 	andcc	%o2, 0x7, %g0
   2697 	bnz	%ncc, .co_last4
   2698 	nop
   2699 
   2700 	! Do 8byte ops as long as possible
   2701 .co_last8:
   2702 	ldx	[%i1], %o2
   2703 	stxa	%o2, [%i0]ASI_USER
   2704 	add	%i1, 0x8, %i1
   2705 	sub	%i2, 0x8, %i2
   2706 	cmp	%i2, 0x8
   2707 	bgu,pt	%ncc, .co_last8
   2708 	add	%i0, 0x8, %i0
   2709 
   2710 	brz,pt	%i2, .copyout_exit
   2711 	nop
   2712 
   2713 	ba	.co_residue
   2714 	nop
   2715 
   2716 .co_last4:
   2717 	! Can we do 4B ops
   2718 	andcc	%o2, 0x3, %g0
   2719 	bnz	%ncc, .co_last2
   2720 	nop
   2721 1:
   2722 	ld	[%i1], %o2
   2723 	sta	%o2, [%i0]ASI_USER
   2724 	add	%i1, 0x4, %i1
   2725 	sub	%i2, 0x4, %i2
   2726 	cmp	%i2, 0x4
   2727 	bgu,pt	%ncc, 1b
   2728 	add	%i0, 0x4, %i0
   2729 
   2730 	brz,pt	%i2, .copyout_exit
   2731 	nop
   2732 
   2733 	ba	.co_residue
   2734 	nop
   2735 
   2736 .co_last2:
   2737 	! Can we do 2B ops
   2738 	andcc	%o2, 0x1, %g0
   2739 	bnz	%ncc, .co_residue
   2740 	nop
   2741 
   2742 1:
   2743 	lduh	[%i1], %o2
   2744 	stuha	%o2, [%i0]ASI_USER
   2745 	add	%i1, 0x2, %i1
   2746 	sub	%i2, 0x2, %i2
   2747 	cmp	%i2, 0x2
   2748 	bgu,pt	%ncc, 1b
   2749 	add	%i0, 0x2, %i0
   2750 
   2751 	brz,pt	%i2, .copyout_exit
   2752 	nop
   2753 
   2754 	! Copy the residue as byte copy
   2755 .co_residue:
   2756 	ldub	[%i1], %i4
   2757 	stba	%i4, [%i0]ASI_USER
   2758 	inc	%i1
   2759 	deccc	%i2
   2760 	bgu,pt	%xcc, .co_residue
   2761 	inc	%i0
   2762 
   2763 .copyout_exit:
        	!
        	! Exit path for the block copy: undo the FP/VIS state changes made
        	! on the way into .do_block_copyout, put the original t_lofault
        	! back and return 0 to the caller.
        	!
   2764 #if !defined(NIAGARA_IMPL)
        	! %gsr was saved as a full 64-bit value (stx), so it has to be
        	! reloaded with ldx. A 32-bit ld from the same big-endian slot
        	! fetches only the most significant word and would restore the
        	! low-order %gsr fields from the wrong half of the saved value.
   2765 	ldx	[%fp + STACK_BIAS - SAVED_GSR_OFFSET], %o2
   2766 	wr	%o2, 0, %gsr		! restore gsr
   2767 
   2768 	ld	[%fp + STACK_BIAS - SAVED_FPRS_OFFSET], %o3	! %fprs was saved with st
   2769 	btst	FPRS_FEF, %o3		! was the FPU enabled on entry?
   2770 	bz	%icc, 4f		! no: fp regs were not saved, just scrub them
   2771 	  nop
   2772 
   2773 	! restore fpregs from stack
   2774 	BLD_FP_FROMSTACK(%o2)
   2775 
   2776 	ba,pt	%ncc, 2f
   2777 	  wr	%o3, 0, %fprs		! restore fprs
   2778 
   2779 4:
   2780 	FZERO				! zero all of the fpregs
   2781 	wr	%o3, 0, %fprs		! restore fprs
   2782 
   2783 2:
   2784 	membar	#Sync
   2785 	andn	SAVED_LOFAULT, FPUSED_FLAG, SAVED_LOFAULT	! drop the FP-used marker
   2786 #else	/* NIAGARA_IMPL */
   2787 	membar	#Sync
   2788 #endif	/* NIAGARA_IMPL */
   2789 	stn	SAVED_LOFAULT, [THREAD_REG + T_LOFAULT]	! restore old t_lofault
   2790 	ret
   2791 	restore	%g0, 0, %o0		! return (0) in the caller's %o0
   2792 
   2793 .copyout_err:
   2794 	ldn	[THREAD_REG + T_COPYOPS], %o4
   2795 	brz	%o4, 2f
   2796 	nop
   2797 	ldn	[%o4 + CP_COPYOUT], %g2
   2798 	jmp	%g2
   2799 	nop
   2800 2:
   2801 	retl
   2802 	mov	-1, %o0
   2803 	SET_SIZE(copyout)
   2804 
   2805 #endif	/* lint */
   2806 
   2807 
   2808 #ifdef	lint
   2809 
   2810 /*ARGSUSED*/
   2811 int
   2812 xcopyout(const void *kaddr, void *uaddr, size_t count)
   2813 { return (0); }	/* lint-only stub; the #else branch holds the assembly */
   2814 
   2815 #else	/* lint */
   2816 
   2817 	ENTRY(xcopyout)
        	!
        	! xcopyout: entry point that selects .xcopyout_err as the error
        	! handler. It builds the handler address in REAL_LOFAULT and then
        	! branches to .do_copyout, which is defined outside this block
        	! (NOTE(review): presumably the copy body shared with copyout --
        	! confirm there).
        	!
   2818 	sethi	%hi(.xcopyout_err), REAL_LOFAULT	! high part of handler address
   2819 	b	.do_copyout
   2820 	  or	REAL_LOFAULT, %lo(.xcopyout_err), REAL_LOFAULT	! delay slot: add low part
   2821 .xcopyout_err:
   2822 	ldn	[THREAD_REG + T_COPYOPS], %o4	! %o4 = thread's T_COPYOPS pointer
   2823 	brz	%o4, 2f			! NULL: return the error ourselves
   2824 	nop
   2825 	ldn	[%o4 + CP_XCOPYOUT], %g2	! else fetch its CP_XCOPYOUT entry
   2826 	jmp	%g2			! jump there without linking; it returns to our caller
   2827 	nop
   2828 2:
   2829 	retl
   2830 	mov	%g1, %o0		! return %g1 (NOTE(review): presumably the errno set by the fault path -- confirm)
   2831 	SET_SIZE(xcopyout)
   2832 
   2833 #endif	/* lint */
   2834 
   2835 #ifdef	lint
   2836 
   2837 /*ARGSUSED*/
   2838 int
   2839 xcopyout_little(const void *kaddr, void *uaddr, size_t count)
   2840 { return (0); }	/* lint-only stub; the #else branch holds the assembly */
   2841 
   2842 #else	/* lint */
   2843 
   2844 	ENTRY(xcopyout_little)
        	!
        	! xcopyout_little(kaddr, uaddr, count): byte-reversing copy whose
        	! stores go through ASI_AIUSL. Destination byte i receives source
        	! byte (count - 1 - i). Returns 0 on the normal path.
        	!
        	! Register roles (argument order taken from the lint prototype):
        	!   %o0 = kaddr (source)   %o1 = uaddr (destination)   %o2 = count
        	!   %o3 = negative index, runs from -count up to 0
        	!   %o4 = scratch / byte in flight
        	!   %o5 = t_lofault value found on entry
        	!
        	! t_lofault points at .little_err for the duration of the copy;
        	! that label is defined outside this block.
        	!
   2845 	sethi	%hi(.little_err), %o4		! %o4 = .little_err (high part)
   2846 	ldn	[THREAD_REG + T_LOFAULT], %o5	! remember the current t_lofault
   2847 	or	%o4, %lo(.little_err), %o4	! %o4 = .little_err (complete)
   2848 	membar	#Sync			! sync error barrier
   2849 	stn	%o4, [THREAD_REG + T_LOFAULT]	! install .little_err
   2850 
   2851 	subcc	%g0, %o2, %o3		! %o3 = -count; Z set if count == 0
   2852 	add	%o0, %o2, %o0		! %o0 = kaddr + count
   2853 	bz,pn	%ncc, 2f		! check for zero bytes
   2854 	sub	%o2, 1, %o4		! delay slot: %o4 = count - 1
   2855 	add	%o0, %o4, %o0		! start w/last byte (%o0 = kaddr + 2*count - 1)
   2856 	add	%o1, %o2, %o1		! %o1 = uaddr + count
   2857 	ldub	[%o0+%o3], %o4		! first byte loaded is kaddr[count - 1]
   2858 
   2859 1:	stba	%o4, [%o1+%o3]ASI_AIUSL	! store at uaddr + count + %o3
   2860 	inccc	%o3			! advance index; carry is set when it reaches 0
   2861 	sub	%o0, 2, %o0		! get next byte (-2 here, +1 in %o3: source steps back one)
   2862 	bcc,a,pt %ncc, 1b		! loop while no carry; delay slot annulled on exit
   2863 	  ldub	[%o0+%o3], %o4		! (only when looping) load the next source byte
   2864 
   2865 2:	membar	#Sync			! sync error barrier
   2866 	stn	%o5, [THREAD_REG + T_LOFAULT]	! restore old t_lofault
   2867 	retl
   2868 	mov	%g0, %o0		! return (0)
   2869 	SET_SIZE(xcopyout_little)
   2870 
   2871 #endif	/* lint */
   2872 
   2873 /*
   2874  * Copy user data to kernel space (copyin/xcopyin/xcopyin_little)
   2875  */
   2876 
   2877 #if defined(lint)
   2878 
   2879 /*ARGSUSED*/
   2880 int
   2881 copyin(const void *uaddr, void *kaddr, size_t count)
   2882 { return (0); }	/* lint-only stub; the #else branch holds the assembly */
   2883 
   2884 #else	/* lint */
   2885 
   2886 	ENTRY(copyin)
   2887 	sethi	%hi(.copyin_err), REAL_LOFAULT
   2888 	or	REAL_LOFAULT, %lo(.copyin_err), REAL_LOFAULT
   2889 
   2890 .do_copyin:
   2891 	!
   2892 	! Check the length and bail if zero.
   2893 	!
   2894 	tst	%o2
   2895 	bnz,pt	%ncc, 1f
   2896 	  nop
   2897 	retl
   2898 	  clr	%o0
   2899 1:
   2900 	sethi	%hi(copyio_fault), %o4
   2901 	or	%o4, %lo(copyio_fault), %o4
   2902 	sethi	%hi(copyio_fault_nowindow), %o3
   2903 	ldn	[THREAD_REG + T_LOFAULT], SAVED_LOFAULT
   2904 	or	%o3, %lo(copyio_fault_nowindow), %o3
   2905 	membar	#Sync
   2906 	stn	%o3, [THREAD_REG + T_LOFAULT]
   2907 
   2908 	mov	%o0, SAVE_SRC
   2909 	mov	%o1, SAVE_DST
   2910 	mov	%o2, SAVE_COUNT
   2911 
   2912 	!
   2913 	! Check to see if we're more than SMALL_LIMIT.
   2914 	!
   2915 	subcc	%o2, SMALL_LIMIT, %o3
   2916 	bgu,a,pt %ncc, .dci_ns
   2917 	or	%o0, %o1, %o3
   2918 	!
   2919 	! What was previously ".small_copyin"
   2920 	!
   2921 .dcibcp:
   2922 	sub	%g0, %o2, %o3		! setup for copy loop
   2923 	add	%o0, %o2, %o0
   2924 	add	%o1, %o2, %o1
   2925 	ba,pt	%ncc, .dcicl
   2926 	lduba	[%o0 + %o3]ASI_USER, %o4
   2927 	!
   2928 	! %o0 and %o1 point at the end and remain pointing at the end
   2929 	! of their buffers. We pull things out by adding %o3 (which is
   2930 	! the negation of the length) to the buffer end which gives us
   2931 	! the current location in the buffers. By incrementing %o3 we walk
   2932 	! through both buffers without having to bump each buffer's
   2933 	! pointer. A very fast 4 instruction loop.
   2934 	!
   2935 	.align 16
   2936 .dcicl:
   2937 	stb	%o4, [%o1 + %o3]
   2938 	inccc	%o3
   2939 	bl,a,pt %ncc, .dcicl
   2940 	lduba	[%o0 + %o3]ASI_USER, %o4
   2941 	!
   2942 	! We're done. Go home.
   2943 	!
   2944 	membar	#Sync
   2945 	stn	SAVED_LOFAULT, [THREAD_REG + T_LOFAULT]
   2946 	retl
   2947 	clr	%o0
   2948 	!
   2949 	! Try aligned copies from here.
   2950 	!
   2951 .dci_ns:
   2952 	!
   2953 	! See if we're single byte aligned. If we are, check the
   2954 	! limit for single byte copies. If we're smaller, or equal,
   2955 	! bounce to the byte for byte copy loop. Otherwise do it in
   2956 	! HW (if enabled).
   2957 	!
   2958 	btst	1, %o3
   2959 	bz,a,pt	%icc, .dcih8
   2960 	btst	7, %o3
   2961 	!
   2962 	! We're single byte aligned.
   2963 	!
   2964 	sethi	%hi(hw_copy_limit_1), %o3
   2965 	ld	[%o3 + %lo(hw_copy_limit_1)], %o3
   2966 	!
   2967 	! Is HW copy on? If not do everything byte for byte.
   2968 	!
   2969 	tst	%o3
   2970 	bz,pn	%icc, .dcibcp
   2971 	subcc	%o3, %o2, %o3
   2972 	!
   2973 	! Are we bigger than the HW limit? If not
   2974 	! go to byte for byte.
   2975 	!
   2976 	bge,pt	%ncc, .dcibcp
   2977 	nop
   2978 	!
   2979 	! We're big enough and copy is on. Do it with HW.
   2980 	!
   2981 	ba,pt	%ncc, .big_copyin
   2982 	nop
   2983 .dcih8:
   2984 	!
   2985 	! 8 byte aligned?
   2986 	!
   2987 	bnz,a	%ncc, .dcih4
   2988 	btst	3, %o3
   2989 	!
   2990 	! We're eight byte aligned.
   2991 	!
   2992 	sethi	%hi(hw_copy_limit_8), %o3
   2993 	ld	[%o3 + %lo(hw_copy_limit_8)], %o3
   2994 	!
   2995 	! Is HW assist on? If not, do it with the aligned copy.
   2996 	!
   2997 	tst	%o3
   2998 	bz,pn	%icc, .dcis8
   2999 	subcc	%o3, %o2, %o3
   3000 	bge	%ncc, .dcis8
   3001 	nop
   3002 	ba,pt	%ncc, .big_copyin
   3003 	nop
   3004 .dcis8:
   3005 	!
   3006 	! Housekeeping for copy loops. Uses same idea as in the byte for
   3007 	! byte copy loop above.
   3008 	!
   3009 	add	%o0, %o2, %o0
   3010 	add	%o1, %o2, %o1
   3011 	sub	%g0, %o2, %o3
   3012 	ba,pt	%ncc, .didebc
   3013 	srl	%o2, 3, %o2		! Number of 8 byte chunks to copy
   3014 	!
   3015 	! 4 byte aligned?
   3016 	!
   3017 .dcih4:
   3018 	bnz	%ncc, .dcih2
   3019 	sethi	%hi(hw_copy_limit_4), %o3
   3020 	ld	[%o3 + %lo(hw_copy_limit_4)], %o3
   3021 	!
   3022 	! Is HW assist on? If not, do it with the aligned copy.
   3023 	!
   3024 	tst	%o3
   3025 	bz,pn	%icc, .dcis4
   3026 	subcc	%o3, %o2, %o3
   3027 	!
   3028 	! We're negative if our size is larger than hw_copy_limit_4.
   3029 	!
   3030 	bge	%ncc, .dcis4
   3031 	nop
   3032 	ba,pt	%ncc, .big_copyin
   3033 	nop
   3034 .dcis4:
   3035 	!
   3036 	! Housekeeping for copy loops. Uses same idea as in the byte
   3037 	! for byte copy loop above.
   3038 	!
   3039 	add	%o0, %o2, %o0
   3040 	add	%o1, %o2, %o1
   3041 	sub	%g0, %o2, %o3
   3042 	ba,pt	%ncc, .didfbc
   3043 	srl	%o2, 2, %o2		! Number of 4 byte chunks to copy
   3044 .dcih2:
   3045 	!
   3046 	! We're two byte aligned. Check for "smallness"
   3047 	! done in delay at .dcih4
   3048 	!
   3049 	bleu,pt	%ncc, .dcis2
   3050 	sethi	%hi(hw_copy_limit_2), %o3
   3051 	ld	[%o3 + %lo(hw_copy_limit_2)], %o3
   3052 	!
   3053 	! Is HW assist on? If not, do it with the aligned copy.
   3054 	!
   3055 	tst	%o3
   3056 	bz,pn	%icc, .dcis2
   3057 	subcc	%o3, %o2, %o3
   3058 	!
   3059 	! Are we larger than the HW limit?
   3060 	!
   3061 	bge	%ncc, .dcis2
   3062 	nop
   3063 	!
   3064 	! HW assist is on and we're large enough to use it.
   3065 	!
   3066 	ba,pt	%ncc, .big_copyin
   3067 	nop
   3068 	!
   3069 	! Housekeeping for copy loops. Uses same idea as in the byte
   3070 	! for byte copy loop above.
   3071 	!
   3072 .dcis2:
   3073 	add	%o0, %o2, %o0
   3074 	add	%o1, %o2, %o1
   3075 	sub	%g0, %o2, %o3
   3076 	ba,pt	%ncc, .didtbc
   3077 	srl	%o2, 1, %o2		! Number of 2 byte chunks to copy
   3078 	!
   3079 .small_copyin:
   3080 	!
   3081 	! Why are we doing this AGAIN? There are certain conditions in
   3082 	! big copyin that will cause us to forgo the HW assisted copies
   3083 	! and bounce back to a non-hw assisted copy. This dispatches
   3084 	! those copies. Note that we branch around this in the main line
   3085 	! code.
   3086 	!
   3087 	! We make no check for limits or HW enablement here. We've
   3088 	! already been told that we're a poster child so just go off
   3089 	! and do it.
   3090 	!
   3091 	or	%o0, %o1, %o3
   3092 	btst	1, %o3
   3093 	bnz	%icc, .dcibcp		! Most likely
   3094 	btst	7, %o3
   3095 	bz	%icc, .dcis8
   3096 	btst	3, %o3
   3097 	bz	%icc, .dcis4
   3098 	nop
   3099 	ba,pt	%ncc, .dcis2
   3100 	nop
   3101 	!
   3102 	! Eight byte aligned copies. A steal from the original .small_copyin
   3103 	! with modifications. %o2 is number of 8 byte chunks to copy. When
   3104 	! done, we examine %o3. If this is < 0, we have 1 - 7 bytes more
   3105 	! to copy.
   3106 	!
   3107 	.align 32
   3108 .didebc:
   3109 	ldxa	[%o0 + %o3]ASI_USER, %o4
   3110 	deccc	%o2
   3111 	stx	%o4, [%o1 + %o3]
   3112 	bg,pt	%ncc, .didebc
   3113 	addcc	%o3, 8, %o3
   3114 	!
   3115 	! End of copy loop. Most 8 byte aligned copies end here.
   3116 	!
   3117 	bz,pt	%ncc, .dcifh
   3118 	nop
   3119 	!
   3120 	! Something is left. Do it byte for byte.
   3121 	!
   3122 	ba,pt	%ncc, .dcicl
   3123 	lduba	[%o0 + %o3]ASI_USER, %o4
   3124 	!
   3125 	! 4 byte copy loop. %o2 is number of 4 byte chunks to copy.
   3126 	!
   3127 	.align 32
   3128 .didfbc:
   3129 	lduwa	[%o0 + %o3]ASI_USER, %o4
   3130 	deccc	%o2
   3131 	st	%o4, [%o1 + %o3]
   3132 	bg,pt	%ncc, .didfbc
   3133 	addcc	%o3, 4, %o3
   3134 	!
   3135 	! End of copy loop. Most 4 byte aligned copies end here.
   3136 	!
   3137 	bz,pt	%ncc, .dcifh
   3138 	nop
   3139 	!
   3140 	! Something is left. Do it byte for byte.
   3141 	!
   3142 	ba,pt	%ncc, .dcicl
   3143 	lduba	[%o0 + %o3]ASI_USER, %o4
   3144 	!
   3145 	! 2 byte aligned copy loop. %o2 is number of 2 byte chunks to
   3146 	! copy.
   3147 	!
   3148 	.align 32
   3149 .didtbc:
   3150 	lduha	[%o0 + %o3]ASI_USER, %o4
   3151 	deccc	%o2
   3152 	sth	%o4, [%o1 + %o3]
   3153 	bg,pt	%ncc, .didtbc
   3154 	addcc	%o3, 2, %o3
   3155 	!
   3156 	! End of copy loop. Most 2 byte aligned copies end here.
   3157 	!
   3158 	bz,pt	%ncc, .dcifh
   3159 	nop
   3160 	!
   3161 	! Deal with the last byte
   3162 	!
   3163 	lduba	[%o0 + %o3]ASI_USER, %o4
   3164 	stb	%o4, [%o1 + %o3]
   3165 .dcifh:
   3166 	membar	#Sync
   3167 	stn     SAVED_LOFAULT, [THREAD_REG + T_LOFAULT]   ! restore old t_lofault
   3168 	retl
   3169 	clr	%o0
   3170 
   3171 .big_copyin:
   3172 	! We're going off to do a block copy.
   3173 	! Switch fault handlers and grab a window. We
   3174 	! don't do a membar #Sync since we've done only
   3175 	! kernel data to this point.
   3176 	stn	%o4, [THREAD_REG + T_LOFAULT]
   3177 
   3178 	! Copyins that reach here are larger than 256 bytes, since
   3179 	! hw_copy_limit_1 is set to 256. Never set this limit to less
   3180 	! than 128 bytes.
   3181 #if !defined(NIAGARA_IMPL)
   3182 	save	%sp, -SA(MINFRAME + HWCOPYFRAMESIZE), %sp
   3183 
   3184 	rd	%fprs, %o2			! check for unused fp
   3185 	st	%o2, [%fp + STACK_BIAS - SAVED_FPRS_OFFSET]	! save %fprs
   3186 	btst	FPRS_FEF, %o2
   3187 	bz,a,pt	%icc, .do_blockcopyin
   3188 	wr	%g0, FPRS_FEF, %fprs
   3189 
   3190 	! save in-use fpregs on stack
   3191 	BST_FP_TOSTACK(%o2)
   3192 #else	/* NIAGARA_IMPL */
   3193 	save	%sp, -SA(MINFRAME), %sp
   3194 #endif	/* NIAGARA_IMPL */
   3195 
   3196 .do_blockcopyin:
   3197 
   3198 #if !defined(NIAGARA_IMPL)
   3199 	rd	%gsr, %o2
   3200 	stx	%o2, [%fp + STACK_BIAS - SAVED_GSR_OFFSET]	! save gsr
   3201 	! set the low bit of the saved t_lofault to indicate that we need
   3202 	! to clear the %fprs register on the way out
   3203 	or	SAVED_LOFAULT, FPUSED_FLAG, SAVED_LOFAULT
   3204 #endif	/* NIAGARA_IMPL */
   3205 
   3206 	! Swap src/dst since the code below is memcpy code
   3207 	! and memcpy/bcopy have different calling sequences
   3208 	mov	%i1, %i5
   3209 	mov	%i0, %i1
   3210 	mov	%i5, %i0
   3211 
   3212 	! Block (64 bytes) align the destination.
   3213 	andcc	%i0, 0x3f, %i3		! is dst block aligned
   3214 	bz	%ncc, copyin_blalign	! dst already block aligned
   3215 	sub	%i3, 0x40, %i3
   3216 	neg	%i3			! bytes till dst 64 bytes aligned
   3217 	sub	%i2, %i3, %i2		! update i2 with new count
   3218 
   3219 	! Based on source and destination alignment do
   3220 	! either 8 bytes, 4 bytes, 2 bytes or byte copy.
   3221 
   3222 	! Is dst & src 8B aligned
   3223 	or	%i0, %i1, %o2
   3224 	andcc	%o2, 0x7, %g0
   3225 	bz	%ncc, .ci_alewdcp
   3226 	nop
   3227 
   3228 	! Is dst & src 4B aligned
   3229 	andcc	%o2, 0x3, %g0
   3230 	bz	%ncc, .ci_alwdcp
   3231 	nop
   3232 
   3233 	! Is dst & src 2B aligned
   3234 	andcc	%o2, 0x1, %g0
   3235 	bz	%ncc, .ci_alhlfwdcp
   3236 	nop
   3237 
   3238 	! 1B aligned
   3239 1:	lduba	[%i1]ASI_USER, %o2
   3240 	stb	%o2, [%i0]
   3241 	inc	%i1
   3242 	deccc	%i3
   3243 	bgu,pt	%ncc, 1b
   3244 	inc	%i0
   3245 
   3246 	ba	copyin_blalign
   3247 	nop
   3248 
   3249 	! dst & src 4B aligned
   3250 .ci_alwdcp:
   3251 	lda	[%i1]ASI_USER, %o2
   3252 	st	%o2, [%i0]
   3253 	add	%i1, 0x4, %i1
   3254 	subcc	%i3, 0x4, %i3
   3255 	bgu,pt	%ncc, .ci_alwdcp
   3256 	add	%i0, 0x4, %i0
   3257 
   3258 	ba	copyin_blalign
   3259 	nop
   3260 
   3261 	! dst & src 2B aligned
   3262 .ci_alhlfwdcp:
   3263 	lduha	[%i1]ASI_USER, %o2
   3264 	stuh	%o2, [%i0]
   3265 	add	%i1, 0x2, %i1
   3266 	subcc	%i3, 0x2, %i3
   3267 	bgu,pt	%ncc, .ci_alhlfwdcp
   3268 	add	%i0, 0x2, %i0
   3269 
   3270 	ba	copyin_blalign
   3271 	nop
   3272 
   3273 	! dst & src 8B aligned
   3274 .ci_alewdcp:
   3275 	ldxa	[%i1]ASI_USER, %o2
   3276 	stx	%o2, [%i0]
   3277 	add	%i1, 0x8, %i1
   3278 	subcc	%i3, 0x8, %i3
   3279 	bgu,pt	%ncc, .ci_alewdcp
   3280 	add	%i0, 0x8, %i0
   3281 
   3282 copyin_blalign:
   3283 	andn	%i2, 0x3f, %i3		! %i3 count is multiple of block size
   3284 	sub	%i2, %i3, %i2		! Residue bytes in %i2
   3285 
   3286 #if !defined(NIAGARA_IMPL)
   3287 	mov	ASI_USER, %asi
   3288 
   3289 	andn	%i1, 0x3f, %l0		! %l0 has block aligned src address
   3290 	prefetcha [%l0+0x0]%asi, #one_read
   3291 	andcc	%i1, 0x3f, %g0		! is src 64B aligned
   3292 	bz,pn	%ncc, .ci_blkcpy
   3293 	nop
   3294 
   3295 	! handle misaligned source cases
   3296 	alignaddr %i1, %g0, %g0		! generate %gsr
   3297 
   3298 	srl	%i1, 0x3, %l1		! src add bits 3, 4, 5 are now least
   3299 					! significant in %l1
   3300 	andcc	%l1, 0x7, %l2		! mask everything except bits 1, 2, 3
   3301 	add	%i1, %i3, %i1
   3302 
   3303 	! switch statement to get to right 8 byte block within
   3304 	! 64 byte block
   3305 	cmp	 %l2, 0x4
   3306 	bgeu,a	 ci_hlf
   3307 	cmp	 %l2, 0x6
   3308 	cmp	 %l2, 0x2
   3309 	bgeu,a	 ci_sqtr
   3310 	nop
   3311 	cmp	 %l2, 0x1
   3312 	be,a	 ci_off15
   3313 	nop
   3314 	ba	 ci_off7
   3315 	nop
   3316 ci_sqtr:
   3317 	be,a	 ci_off23
   3318 	nop
   3319 	ba,a	 ci_off31
   3320 	nop
   3321 
   3322 ci_hlf:
   3323 	bgeu,a	 ci_fqtr
   3324 	nop
   3325 	cmp	 %l2, 0x5
   3326 	be,a	 ci_off47
   3327 	nop
   3328 	ba	 ci_off39
   3329 	nop
   3330 ci_fqtr:
   3331 	be,a	 ci_off55
   3332 	nop
   3333 
   3334 	ldda	[%l0+0x38]%asi, %d14
   3335 	prefetcha [%l0+0x40]%asi, #one_read
   3336 	prefetcha [%l0+0x80]%asi, #one_read
   3337 7:
   3338 	add	%l0, 0x40, %l0
   3339 	stxa	%g0, [%i0]ASI_BLK_INIT_ST_QUAD_LDD_P ! initialize the cache line
   3340 
   3341 	ldda	[%l0]ASI_BLK_AIUS, %d16
   3342 	ALIGN_OFF_56_63
   3343 	fsrc1	%d30, %d14
   3344 
   3345 	stda	%d48, [%i0]ASI_BLK_P
   3346 	subcc	%i3, 0x40, %i3
   3347 	add	%i0, 0x40, %i0
   3348 	bgu,pt	%ncc, 7b
   3349 	prefetcha [%l0+0x80]%asi, #one_read
   3350 	ba	.ci_blkdone
   3351 	membar	#Sync
   3352 
   3353 ci_off7:
   3354 	ldda	[%l0]ASI_BLK_AIUS, %d0
   3355 	prefetcha [%l0+0x40]%asi, #one_read
   3356 	prefetcha [%l0+0x80]%asi, #one_read
   3357 0:
   3358 	add	%l0, 0x40, %l0
   3359 	stxa	%g0, [%i0]ASI_BLK_INIT_ST_QUAD_LDD_P ! initialize the cache line
   3360 
   3361 	ldda	[%l0]ASI_BLK_AIUS, %d16
   3362 	ALIGN_OFF_1_7
   3363 	fsrc1	%d16, %d0
   3364 	fsrc1	%d18, %d2
   3365 	fsrc1	%d20, %d4
   3366 	fsrc1	%d22, %d6
   3367 	fsrc1	%d24, %d8
   3368 	fsrc1	%d26, %d10
   3369 	fsrc1	%d28, %d12
   3370 	fsrc1	%d30, %d14
   3371 
   3372 	stda	%d48, [%i0]ASI_BLK_P
   3373 	subcc	%i3, 0x40, %i3
   3374 	add	%i0, 0x40, %i0
   3375 	bgu,pt	%ncc, 0b
   3376 	prefetcha [%l0+0x80]%asi, #one_read
   3377 	ba	.ci_blkdone
   3378 	membar	#Sync
   3379 
   3380 ci_off15:
   3381 	ldda	[%l0+0x8]%asi, %d2
   3382 	ldda	[%l0+0x10]%asi, %d4
   3383 	ldda	[%l0+0x18]%asi, %d6
   3384 	ldda	[%l0+0x20]%asi, %d8
   3385 	ldda	[%l0+0x28]%asi, %d10
   3386 	ldda	[%l0+0x30]%asi, %d12
   3387 	ldda	[%l0+0x38]%asi, %d14
   3388 	prefetcha [%l0+0x40]%asi, #one_read
   3389 	prefetcha [%l0+0x80]%asi, #one_read
   3390 1:
   3391 	add	%l0, 0x40, %l0
   3392 	stxa	%g0, [%i0]ASI_BLK_INIT_ST_QUAD_LDD_P ! initialize the cache line
   3393 
   3394 	ldda	[%l0]ASI_BLK_AIUS, %d16
   3395 	ALIGN_OFF_8_15
   3396 	fsrc1	%d18, %d2
   3397 	fsrc1	%d20, %d4
   3398 	fsrc1	%d22, %d6
   3399 	fsrc1	%d24, %d8
   3400 	fsrc1	%d26, %d10
   3401 	fsrc1	%d28, %d12
   3402 	fsrc1	%d30, %d14
   3403 
   3404 	stda	%d48, [%i0]ASI_BLK_P
   3405 	subcc	%i3, 0x40, %i3
   3406 	add	%i0, 0x40, %i0
   3407 	bgu,pt	%ncc, 1b
   3408 	prefetcha [%l0+0x80]%asi, #one_read
   3409 	ba	.ci_blkdone
   3410 	membar	#Sync
   3411 
   3412 ci_off23:
   3413 	ldda	[%l0+0x10]%asi, %d4
   3414 	ldda	[%l0+0x18]%asi, %d6
   3415 	ldda	[%l0+0x20]%asi, %d8
   3416 	ldda	[%l0+0x28]%asi, %d10
   3417 	ldda	[%l0+0x30]%asi, %d12
   3418 	ldda	[%l0+0x38]%asi, %d14
   3419 	prefetcha [%l0+0x40]%asi, #one_read
   3420 	prefetcha [%l0+0x80]%asi, #one_read
   3421 2:
   3422 	add	%l0, 0x40, %l0
   3423 	stxa	%g0, [%i0]ASI_BLK_INIT_ST_QUAD_LDD_P ! initialize the cache line
   3424 
   3425 	ldda	[%l0]ASI_BLK_AIUS, %d16
   3426 	ALIGN_OFF_16_23
   3427 	fsrc1	%d20, %d4
   3428 	fsrc1	%d22, %d6
   3429 	fsrc1	%d24, %d8
   3430 	fsrc1	%d26, %d10
   3431 	fsrc1	%d28, %d12
   3432 	fsrc1	%d30, %d14
   3433 
   3434 	stda	%d48, [%i0]ASI_BLK_P
   3435 	subcc	%i3, 0x40, %i3
   3436 	add	%i0, 0x40, %i0
   3437 	bgu,pt	%ncc, 2b
   3438 	prefetcha [%l0+0x80]%asi, #one_read
   3439 	ba	.ci_blkdone
   3440 	membar	#Sync
   3441 
   3442 ci_off31:
   3443 	ldda	[%l0+0x18]%asi, %d6
   3444 	ldda	[%l0+0x20]%asi, %d8
   3445 	ldda	[%l0+0x28]%asi, %d10
   3446 	ldda	[%l0+0x30]%asi, %d12
   3447 	ldda	[%l0+0x38]%asi, %d14
   3448 	prefetcha [%l0+0x40]%asi, #one_read
   3449 	prefetcha [%l0+0x80]%asi, #one_read
   3450 3:
   3451 	add	%l0, 0x40, %l0
   3452 	stxa	%g0, [%i0]ASI_BLK_INIT_ST_QUAD_LDD_P ! initialize the cache line
   3453 
   3454 	ldda	[%l0]ASI_BLK_AIUS, %d16
   3455 	ALIGN_OFF_24_31
   3456 	fsrc1	%d22, %d6
   3457 	fsrc1	%d24, %d8
   3458 	fsrc1	%d26, %d10
   3459 	fsrc1	%d28, %d12
   3460 	fsrc1	%d30, %d14
   3461 
   3462 	stda	%d48, [%i0]ASI_BLK_P
   3463 	subcc	%i3, 0x40, %i3
   3464 	add	%i0, 0x40, %i0
   3465 	bgu,pt	%ncc, 3b
   3466 	prefetcha [%l0+0x80]%asi, #one_read
   3467 	ba	.ci_blkdone
   3468 	membar	#Sync
   3469 
   3470 ci_off39:
   3471 	ldda	[%l0+0x20]%asi, %d8
   3472 	ldda	[%l0+0x28]%asi, %d10
   3473 	ldda	[%l0+0x30]%asi, %d12
   3474 	ldda	[%l0+0x38]%asi, %d14
   3475 	prefetcha [%l0+0x40]%asi, #one_read
   3476 	prefetcha [%l0+0x80]%asi, #one_read
   3477 4:
   3478 	add	%l0, 0x40, %l0
   3479 	stxa	%g0, [%i0]ASI_BLK_INIT_ST_QUAD_LDD_P ! initialize the cache line
   3480 
   3481 	ldda	[%l0]ASI_BLK_AIUS, %d16
   3482 	ALIGN_OFF_32_39
   3483 	fsrc1	%d24, %d8
   3484 	fsrc1	%d26, %d10
   3485 	fsrc1	%d28, %d12
   3486 	fsrc1	%d30, %d14
   3487 
   3488 	stda	%d48, [%i0]ASI_BLK_P
   3489 	subcc	%i3, 0x40, %i3
   3490 	add	%i0, 0x40, %i0
   3491 	bgu,pt	%ncc, 4b
   3492 	prefetcha [%l0+0x80]%asi, #one_read
   3493 	ba	.ci_blkdone
   3494 	membar	#Sync
   3495 
   3496 ci_off47:
   3497 	ldda	[%l0+0x28]%asi, %d10
   3498 	ldda	[%l0+0x30]%asi, %d12
   3499 	ldda	[%l0+0x38]%asi, %d14
   3500 	prefetcha [%l0+0x40]%asi, #one_read
   3501 	prefetcha [%l0+0x80]%asi, #one_read
   3502 5:
   3503 	add	%l0, 0x40, %l0
   3504 	stxa	%g0, [%i0]ASI_BLK_INIT_ST_QUAD_LDD_P ! initialize the cache line
   3505 
   3506 	ldda	[%l0]ASI_BLK_AIUS, %d16
   3507 	ALIGN_OFF_40_47
   3508 	fsrc1	%d26, %d10
   3509 	fsrc1	%d28, %d12
   3510 	fsrc1	%d30, %d14
   3511 
   3512 	stda	%d48, [%i0]ASI_BLK_P
   3513 	subcc	%i3, 0x40, %i3
   3514 	add	%i0, 0x40, %i0
   3515 	bgu,pt	%ncc, 5b
   3516 	prefetcha [%l0+0x80]%asi, #one_read
   3517 	ba	.ci_blkdone
   3518 	membar	#Sync
   3519 
   3520 ci_off55:
   3521 	ldda	[%l0+0x30]%asi, %d12
   3522 	ldda	[%l0+0x38]%asi, %d14
   3523 	prefetcha [%l0+0x40]%asi, #one_read
   3524 	prefetcha [%l0+0x80]%asi, #one_read
   3525 6:
   3526 	add	%l0, 0x40, %l0
   3527 	stxa	%g0, [%i0]ASI_BLK_INIT_ST_QUAD_LDD_P ! initialize the cache line
   3528 
   3529 	ldda	[%l0]ASI_BLK_AIUS, %d16
   3530 	ALIGN_OFF_48_55
   3531 	fsrc1	%d28, %d12
   3532 	fsrc1	%d30, %d14
   3533 
   3534 	stda	%d48, [%i0]ASI_BLK_P
   3535 	subcc	%i3, 0x40, %i3
   3536 	add	%i0, 0x40, %i0
   3537 	bgu,pt	%ncc, 6b
   3538 	prefetcha [%l0+0x80]%asi, #one_read
   3539 	ba	.ci_blkdone
   3540 	membar	#Sync
   3541 
   3542 .ci_blkcpy:
   3543 	prefetcha [%i1+0x40]%asi, #one_read
   3544 	prefetcha [%i1+0x80]%asi, #one_read
   3545 8:
   3546 	stxa	%g0, [%i0]ASI_BLK_INIT_ST_QUAD_LDD_P ! initialize the cache line
   3547 	ldda	[%i1]ASI_BLK_AIUS, %d0
   3548 	stda	%d0, [%i0]ASI_BLK_P
   3549 
   3550 	add	%i1, 0x40, %i1
   3551 	subcc	%i3, 0x40, %i3
   3552 	add	%i0, 0x40, %i0
   3553 	bgu,pt	%ncc, 8b
   3554 	prefetcha [%i1+0x80]%asi, #one_read
   3555 	membar	#Sync
   3556 
   3557 .ci_blkdone:
   3558 #else	/* NIAGARA_IMPL */
   3559 	mov	ASI_BLK_INIT_ST_QUAD_LDD_P, %asi
   3560 
   3561 	andcc	%i1, 0xf, %o2		! is src quadword aligned
   3562 	bz,pn	%xcc, .ci_blkcpy	! src offset in %o2 (last 4-bits)
   3563 	nop
   3564 	cmp	%o2, 0x8
   3565 	bg	.ci_upper_double
   3566 	nop
   3567 	bl	.ci_lower_double
   3568 	nop
   3569 
   3570 	! Falls through when source offset is equal to 8 i.e.
   3571 	! source is double word aligned.
   3572 	! In this case no shift/merge of data is required
   3573 
   3574 	sub	%i1, %o2, %i1		! align the src at 16 bytes.
   3575 	andn	%i1, 0x3f, %l0		! %l0 has block aligned source
   3576 	prefetcha [%l0]ASI_USER, #one_read
   3577 	ldda	[%i1]ASI_BLK_INIT_QUAD_LDD_AIUS, %l2
   3578 	add	%l0, 0x40, %l0
   3579 .ci_loop0:
   3580 	add	%i1, 0x10, %i1
   3581 	ldda	[%i1]ASI_BLK_INIT_QUAD_LDD_AIUS, %l4
   3582 
   3583 	prefetcha [%l0]ASI_USER, #one_read
   3584 
   3585 	stxa	%l3, [%i0+0x0]%asi
   3586 	stxa	%l4, [%i0+0x8]%asi
   3587 
   3588 	add	%i1, 0x10, %i1
   3589 	ldda	[%i1]ASI_BLK_INIT_QUAD_LDD_AIUS, %l2
   3590 
   3591 	stxa	%l5, [%i0+0x10]%asi
   3592 	stxa	%l2, [%i0+0x18]%asi
   3593 
   3594 	add	%i1, 0x10, %i1
   3595 	ldda	[%i1]ASI_BLK_INIT_QUAD_LDD_AIUS, %l4
   3596 
   3597 	stxa	%l3, [%i0+0x20]%asi
   3598 	stxa	%l4, [%i0+0x28]%asi
   3599 
   3600 	add	%i1, 0x10, %i1
   3601 	ldda	[%i1]ASI_BLK_INIT_QUAD_LDD_AIUS, %l2
   3602 
   3603 	stxa	%l5, [%i0+0x30]%asi
   3604 	stxa	%l2, [%i0+0x38]%asi
   3605 
   3606 	add	%l0, 0x40, %l0
   3607 	subcc	%i3, 0x40, %i3
   3608 	bgu,pt	%xcc, .ci_loop0
   3609 	add	%i0, 0x40, %i0
   3610 	ba	.ci_blkdone
   3611 	add	%i1, %o2, %i1		! increment the source by src offset
   3612 					! the src offset was stored in %o2
   3613 
   3614 .ci_lower_double:
   3615 
   3616 	sub	%i1, %o2, %i1		! align the src at 16 bytes.
   3617 	sll	%o2, 3, %o0		! %o0 left shift
   3618 	mov	0x40, %o1
   3619 	sub	%o1, %o0, %o1		! %o1 right shift = (64 - left shift)
   3620 	andn	%i1, 0x3f, %l0		! %l0 has block aligned source
   3621 	prefetcha [%l0]ASI_USER, #one_read
   3622 	ldda	[%i1]ASI_BLK_INIT_QUAD_LDD_AIUS, %l2	! partial data in %l2
   3623 							! and %l3 has complete
   3624 							! data
   3625 	add	%l0, 0x40, %l0
   3626 .ci_loop1:
   3627 	add	%i1, 0x10, %i1
   3628 	ldda	[%i1]ASI_BLK_INIT_QUAD_LDD_AIUS, %l4	! %l4 has partial data
   3629 							! for this read.
   3630 	ALIGN_DATA(%l2, %l3, %l4, %o0, %o1, %l6)	! merge %l2, %l3 and %l4
   3631 							! into %l2 and %l3
   3632 
   3633 	prefetcha [%l0]ASI_USER, #one_read
   3634 
   3635 	stxa	%l2, [%i0+0x0]%asi
   3636 	stxa	%l3, [%i0+0x8]%asi
   3637 
   3638 	add	%i1, 0x10, %i1
   3639 	ldda	[%i1]ASI_BLK_INIT_QUAD_LDD_AIUS, %l2
   3640 	ALIGN_DATA(%l4, %l5, %l2, %o0, %o1, %l6)	! merge %l2 with %l5 and
   3641 							! %l4 from previous read
   3642 							! into %l4 and %l5
   3643 	stxa	%l4, [%i0+0x10]%asi
   3644 	stxa	%l5, [%i0+0x18]%asi
   3645 
   3646 	! Repeat the same for next 32 bytes.
   3647 
   3648 	add	%i1, 0x10, %i1
   3649 	ldda	[%i1]ASI_BLK_INIT_QUAD_LDD_AIUS, %l4
   3650 	ALIGN_DATA(%l2, %l3, %l4, %o0, %o1, %l6)
   3651 
   3652 	stxa	%l2, [%i0+0x20]%asi
   3653 	stxa	%l3, [%i0+0x28]%asi
   3654 
   3655 	add	%i1, 0x10, %i1
   3656 	ldda	[%i1]ASI_BLK_INIT_QUAD_LDD_AIUS, %l2
   3657 	ALIGN_DATA(%l4, %l5, %l2, %o0, %o1, %l6)
   3658 
   3659 	stxa	%l4, [%i0+0x30]%asi
   3660 	stxa	%l5, [%i0+0x38]%asi
   3661 
   3662 	add	%l0, 0x40, %l0
   3663 	subcc	%i3, 0x40, %i3
   3664 	bgu,pt	%xcc, .ci_loop1
   3665 	add	%i0, 0x40, %i0
   3666 	ba	.ci_blkdone
   3667 	add	%i1, %o2, %i1		! increment the source by src offset
   3668 					! the src offset was stored in %o2
   3669 
   3670 .ci_upper_double:
   3671 
   3672 	sub	%i1, %o2, %i1		! align the src at 16 bytes.
   3673 	sub	%o2, 0x8, %o0
   3674 	sll	%o0, 3, %o0		! %o0 left shift
   3675 	mov	0x40, %o1
   3676 	sub	%o1, %o0, %o1		! %o1 right shift = (64 - left shift)
   3677 	andn	%i1, 0x3f, %l0		! %l0 has block aligned source
   3678 	prefetcha [%l0]ASI_USER, #one_read
   3679 	ldda	[%i1]ASI_BLK_INIT_QUAD_LDD_AIUS, %l2	! partial data in %l3
   3680 							! for this read and
   3681 							! no data in %l2
   3682 	add	%l0, 0x40, %l0
   3683 .ci_loop2:
   3684 	add	%i1, 0x10, %i1
   3685 	ldda	[%i1]ASI_BLK_INIT_QUAD_LDD_AIUS, %l4	! %l4 has complete data
   3686 							! and %l5 has partial
   3687 	ALIGN_DATA(%l3, %l4, %l5, %o0, %o1, %l6)	! merge %l3, %l4 and %l5
   3688 							! into %l3 and %l4
   3689 	prefetcha [%l0]ASI_USER, #one_read
   3690 
   3691 	stxa	%l3, [%i0+0x0]%asi
   3692 	stxa	%l4, [%i0+0x8]%asi
   3693 
   3694 	add	%i1, 0x10, %i1
   3695 	ldda	[%i1]ASI_BLK_INIT_QUAD_LDD_AIUS, %l2
   3696 	ALIGN_DATA(%l5, %l2, %l3, %o0, %o1, %l6)	! merge %l2 and %l3 with
   3697 							! %l5 from previous read
   3698 							! into %l5 and %l2
   3699 
   3700 	stxa	%l5, [%i0+0x10]%asi
   3701 	stxa	%l2, [%i0+0x18]%asi
   3702 
   3703 	! Repeat the same for next 32 bytes.
   3704 
   3705 	add	%i1, 0x10, %i1
   3706 	ldda	[%i1]ASI_BLK_INIT_QUAD_LDD_AIUS, %l4
   3707 	ALIGN_DATA(%l3, %l4, %l5, %o0, %o1, %l6)
   3708 
   3709 	stxa	%l3, [%i0+0x20]%asi
   3710 	stxa	%l4, [%i0+0x28]%asi
   3711 
   3712 	add	%i1, 0x10, %i1
   3713 	ldda	[%i1]ASI_BLK_INIT_QUAD_LDD_AIUS, %l2
   3714 	ALIGN_DATA(%l5, %l2, %l3, %o0, %o1, %l6)
   3715 
   3716 	stxa	%l5, [%i0+0x30]%asi
   3717 	stxa	%l2, [%i0+0x38]%asi
   3718 
   3719 	add	%l0, 0x40, %l0
   3720 	subcc	%i3, 0x40, %i3
   3721 	bgu,pt	%xcc, .ci_loop2
   3722 	add	%i0, 0x40, %i0
   3723 	ba	.ci_blkdone
   3724 	add	%i1, %o2, %i1		! increment the source by src offset
   3725 					! the src offset was stored in %o2
   3726 
   3727 
   3728 	! Do fast copy using ASI_BLK_INIT_ST_QUAD_LDD_P
   3729 .ci_blkcpy:
   3730 
   3731 	andn	%i1, 0x3f, %o0		! %o0 has block aligned source
   3732 	prefetcha [%o0]ASI_USER, #one_read
   3733 	add	%o0, 0x40, %o0
   3734 1:
   3735 	ldda	[%i1]ASI_BLK_INIT_QUAD_LDD_AIUS, %l0
   3736 	add	%i1, 0x10, %i1
   3737 	ldda	[%i1]ASI_BLK_INIT_QUAD_LDD_AIUS, %l2
   3738 	add	%i1, 0x10, %i1
   3739 
   3740 	prefetcha [%o0]ASI_USER, #one_read
   3741 
   3742 	stxa	%l0, [%i0+0x0]%asi
   3743 
   3744 	ldda	[%i1]ASI_BLK_INIT_QUAD_LDD_AIUS, %l4
   3745 	add	%i1, 0x10, %i1
   3746 	ldda	[%i1]ASI_BLK_INIT_QUAD_LDD_AIUS, %l6
   3747 	add	%i1, 0x10, %i1
   3748 
   3749 	stxa	%l1, [%i0+0x8]%asi
   3750 	stxa	%l2, [%i0+0x10]%asi
   3751 	stxa	%l3, [%i0+0x18]%asi
   3752 	stxa	%l4, [%i0+0x20]%asi
   3753 	stxa	%l5, [%i0+0x28]%asi
   3754 	stxa	%l6, [%i0+0x30]%asi
   3755 	stxa	%l7, [%i0+0x38]%asi
   3756 
   3757 	add	%o0, 0x40, %o0
   3758 	subcc	%i3, 0x40, %i3
   3759 	bgu,pt	%xcc, 1b
   3760 	add	%i0, 0x40, %i0
   3761 
   3762 .ci_blkdone:
   3763 	membar	#Sync
   3764 #endif	/* NIAGARA_IMPL */
   3765 
   3766 	brz,pt	%i2, .copyin_exit
   3767 	nop
   3768 
   3769 	! Handle trailing bytes
   3770 	cmp	%i2, 0x8
   3771 	blu,pt	%ncc, .ci_residue
   3772 	nop
   3773 
   3774 	! Can we do some 8B ops
   3775 	or	%i1, %i0, %o2
   3776 	andcc	%o2, 0x7, %g0
   3777 	bnz	%ncc, .ci_last4
   3778 	nop
   3779 
   3780 	! Do 8byte ops as long as possible
   3781 .ci_last8:
   3782 	ldxa	[%i1]ASI_USER, %o2
   3783 	stx	%o2, [%i0]
   3784 	add	%i1, 0x8, %i1
   3785 	sub	%i2, 0x8, %i2
   3786 	cmp	%i2, 0x8
   3787 	bgu,pt	%ncc, .ci_last8
   3788 	add	%i0, 0x8, %i0
   3789 
   3790 	brz,pt	%i2, .copyin_exit
   3791 	nop
   3792 
   3793 	ba	.ci_residue
   3794 	nop
   3795 
   3796 .ci_last4:
   3797 	! Can we do 4B ops
   3798 	andcc	%o2, 0x3, %g0
   3799 	bnz	%ncc, .ci_last2
   3800 	nop
   3801 1:
   3802 	lda	[%i1]ASI_USER, %o2
   3803 	st	%o2, [%i0]
   3804 	add	%i1, 0x4, %i1
   3805 	sub	%i2, 0x4, %i2
   3806 	cmp	%i2, 0x4
   3807 	bgu,pt	%ncc, 1b
   3808 	add	%i0, 0x4, %i0
   3809 
   3810 	brz,pt	%i2, .copyin_exit
   3811 	nop
   3812 
   3813 	ba	.ci_residue
   3814 	nop
   3815 
   3816 .ci_last2:
   3817 	! Can we do 2B ops
   3818 	andcc	%o2, 0x1, %g0
   3819 	bnz	%ncc, .ci_residue
   3820 	nop
   3821 
   3822 1:
   3823 	lduha	[%i1]ASI_USER, %o2
   3824 	stuh	%o2, [%i0]
   3825 	add	%i1, 0x2, %i1
   3826 	sub	%i2, 0x2, %i2
   3827 	cmp	%i2, 0x2
   3828 	bgu,pt	%ncc, 1b
   3829 	add	%i0, 0x2, %i0
   3830 
   3831 	brz,pt	%i2, .copyin_exit
   3832 	nop
   3833 
   3834 	! Copy the residue as byte copy
   3835 .ci_residue:
   3836 	lduba	[%i1]ASI_USER, %i4
   3837 	stb	%i4, [%i0]
   3838 	inc	%i1
   3839 	deccc	%i2
   3840 	bgu,pt	%xcc, .ci_residue
   3841 	inc	%i0
   3842 
   3843 .copyin_exit:
   3844 #if !defined(NIAGARA_IMPL)
   3845 	ld	[%fp + STACK_BIAS - SAVED_GSR_OFFSET], %o2
   3846 	wr	%o2, 0, %gsr		! restore gsr
   3847 
   3848 	ld	[%fp + STACK_BIAS - SAVED_FPRS_OFFSET], %o3
   3849 	btst	FPRS_FEF, %o3
   3850 	bz	%icc, 4f
   3851 	  nop
   3852 
   3853 	! restore fpregs from stack
   3854 	BLD_FP_FROMSTACK(%o2)
   3855 
   3856 	ba,pt	%ncc, 2f
   3857 	  wr	%o3, 0, %fprs		! restore fprs
   3858 
   3859 4:
   3860 	FZERO				! zero all of the fpregs
   3861 	wr	%o3, 0, %fprs		! restore fprs
   3862 
   3863 2:
   3864 	membar	#Sync			! sync error barrier
   3865 	andn	SAVED_LOFAULT, FPUSED_FLAG, SAVED_LOFAULT
   3866 #else	/* NIAGARA_IMPL */
   3867 	membar	#Sync
   3868 #endif	/* NIAGARA_IMPL */
   3869 	stn	SAVED_LOFAULT, [THREAD_REG + T_LOFAULT]	! restore old t_lofault
   3870 	ret
   3871 	restore	%g0, 0, %o0
   3872 .copyin_err:
   3873 	ldn	[THREAD_REG + T_COPYOPS], %o4
   3874 	brz	%o4, 2f
   3875 	nop
   3876 	ldn	[%o4 + CP_COPYIN], %g2
   3877 	jmp	%g2
   3878 	nop
   3879 2:
   3880 	retl
   3881 	mov	-1, %o0
   3882 	SET_SIZE(copyin)
   3883 
   3884 #endif	/* lint */
   3885 
   3886 #ifdef	lint
   3887 
   3888 /*ARGSUSED*/
   3889 int
   3890 xcopyin(const void *uaddr, void *kaddr, size_t count)
   3891 { return (0); }
   3892 
   3893 #else	/* lint */
   3894 
        	/*
        	 * xcopyin(uaddr, kaddr, count)
        	 *
        	 * Builds the address of .xcopyin_err in REAL_LOFAULT and joins
        	 * the shared .do_copyin entry (the same entry copyin_noerr
        	 * branches to), so this routine contributes only the handler.
        	 *
        	 * .xcopyin_err is the handler passed via REAL_LOFAULT, so it is
        	 * presumably reached when the copy faults (TODO confirm against
        	 * .do_copyin).  If the thread has a t_copyops vector it jumps to
        	 * that vector's CP_XCOPYIN entry; otherwise it returns whatever
        	 * is in %g1 (presumably the error code left by the fault path -
        	 * TODO confirm), unlike .copyin_err which returns -1.
        	 */
   3895 	ENTRY(xcopyin)
   3896 	sethi	%hi(.xcopyin_err), REAL_LOFAULT
   3897 	b	.do_copyin
   3898 	  or	REAL_LOFAULT, %lo(.xcopyin_err), REAL_LOFAULT	! delay slot: low bits of handler address
   3899 .xcopyin_err:
   3900 	ldn	[THREAD_REG + T_COPYOPS], %o4	! %o4 = thread's copyops vector
   3901 	brz	%o4, 2f				! no vector: return %g1 below
   3902 	nop
   3903 	ldn	[%o4 + CP_XCOPYIN], %g2		! %g2 = vector's xcopyin entry
   3904 	jmp	%g2				! hand off; no return to here
   3905 	nop
   3906 2:
   3907 	retl
   3908 	mov	%g1, %o0			! delay slot: return value = %g1
   3909 	SET_SIZE(xcopyin)
   3910 
   3911 #endif	/* lint */
   3912 
   3913 #ifdef	lint
   3914 
   3915 /*ARGSUSED*/
   3916 int
   3917 xcopyin_little(const void *uaddr, void *kaddr, size_t count)
   3918 { return (0); }
   3919 
   3920 #else	/* lint */
   3921 
        	/*
        	 * xcopyin_little(uaddr, kaddr, count)
        	 *
        	 * Byte-at-a-time copy from uaddr (read with ASI_AIUSL) to kaddr
        	 * that reverses the byte order: kaddr[i] receives
        	 * uaddr[count - 1 - i].  Returns 0 on success, including when
        	 * count is zero.
        	 *
        	 * .little_err is installed in t_lofault for the duration and the
        	 * previous handler is kept in %o5.  On the .little_err path the
        	 * previous handler is put back and the value in %g1 is returned
        	 * (presumably the error code from the fault path - TODO confirm).
        	 *
        	 * Register use inside the loop:
        	 *	%o3	index, runs from -count up to 0
        	 *	%o1	kaddr + count, so [%o1 + %o3] walks kaddr forward
        	 *	%o0	biased source pointer; it drops by 2 while %o3
        	 *		rises by 1, so [%o0 + %o3] walks the source backward
        	 *	%o4	handler address at first, then the byte in flight
        	 */
   3922 	ENTRY(xcopyin_little)
   3923 	sethi	%hi(.little_err), %o4
   3924 	ldn	[THREAD_REG + T_LOFAULT], %o5	! %o5 = previous t_lofault
   3925 	or	%o4, %lo(.little_err), %o4
   3926 	membar	#Sync				! sync error barrier
   3927 	stn	%o4, [THREAD_REG + T_LOFAULT]	! install .little_err
   3928 
   3929 	subcc	%g0, %o2, %o3		! %o3 = -count; sets cc for the bz below
   3930 	add	%o0, %o2, %o0		! %o0 = uaddr + count (cc untouched)
   3931 	bz,pn	%ncc, 2f		! check for zero bytes
   3932 	sub	%o2, 1, %o4		! delay slot: %o4 = count - 1
   3933 	add	%o0, %o4, %o0		! start w/last byte
   3934 	add	%o1, %o2, %o1		! %o1 = kaddr + count
   3935 	lduba	[%o0+%o3]ASI_AIUSL, %o4	! first load is uaddr[count - 1]
   3936 
   3937 1:	stb	%o4, [%o1+%o3]
   3938 	inccc	%o3			! carry is set only on the -1 -> 0 step
   3939 	sub	%o0, 2, %o0		! get next byte
   3940 	bcc,a,pt %ncc, 1b		! loop while carry clear (index still < 0)
   3941 	  lduba	[%o0+%o3]ASI_AIUSL, %o4	! annulled slot: runs only when looping
   3942 
   3943 2:	membar	#Sync				! sync error barrier
   3944 	stn	%o5, [THREAD_REG + T_LOFAULT]	! restore old t_lofault
   3945 	retl
   3946 	mov	%g0, %o0		! return (0)
   3947 
   3948 .little_err:
   3949 	membar	#Sync				! sync error barrier
   3950 	stn	%o5, [THREAD_REG + T_LOFAULT]	! restore old t_lofault
   3951 	retl
   3952 	mov	%g1, %o0		! delay slot: return value = %g1
   3953 	SET_SIZE(xcopyin_little)
   3954 
   3955 #endif	/* lint */
   3956 
   3957 
   3958 /*
   3959  * Copy a block of storage - must not overlap (from + len <= to).
   3960  * No fault handler installed (to be called under on_fault())
         *
         * copyin_noerr(ufrom, kto, count) joins the shared .do_copyin entry
         * with REAL_LOFAULT set to .copyio_noerr.  .copyio_noerr does nothing
         * but jump to SAVED_LOFAULT, the register the copy paths write back
         * to t_lofault when they "restore old t_lofault"; i.e. control goes to
         * the handler that was current before the copy started.
         * .copyio_noerr is also the handler used by copyout_noerr.
   3961  */
   3962 #if defined(lint)
   3963 
   3964 /* ARGSUSED */
   3965 void
   3966 copyin_noerr(const void *ufrom, void *kto, size_t count)
   3967 {}
   3968 
   3969 #else	/* lint */
   3970 
   3971 	ENTRY(copyin_noerr)
   3972 	sethi	%hi(.copyio_noerr), REAL_LOFAULT
   3973 	b	.do_copyin
   3974 	  or	REAL_LOFAULT, %lo(.copyio_noerr), REAL_LOFAULT	! delay slot: low bits of handler address
   3975 .copyio_noerr:
   3976 	jmp	SAVED_LOFAULT		! defer to the previously saved handler
   3977 	  nop
   3978 	SET_SIZE(copyin_noerr)
   3979 
   3980 #endif /* lint */
   3981 
   3982 /*
   3983  * Copy a block of storage - must not overlap (from + len <= to).
   3984  * No fault handler installed (to be called under on_fault())
         *
         * copyout_noerr(kfrom, uto, count) joins the shared .do_copyout entry
         * with REAL_LOFAULT set to .copyio_noerr.  That label is not defined
         * here; it lives inside copyin_noerr, where it jumps to SAVED_LOFAULT.
   3985  */
   3986 
   3987 #if defined(lint)
   3988 
   3989 /* ARGSUSED */
   3990 void
   3991 copyout_noerr(const void *kfrom, void *uto, size_t count)
   3992 {}
   3993 
   3994 #else	/* lint */
   3995 
   3996 	ENTRY(copyout_noerr)
   3997 	sethi	%hi(.copyio_noerr), REAL_LOFAULT
   3998 	b	.do_copyout
   3999 	  or	REAL_LOFAULT, %lo(.copyio_noerr), REAL_LOFAULT	! delay slot: low bits of handler address
   4000 	SET_SIZE(copyout_noerr)
   4001 
   4002 #endif /* lint */
   4003 
   4004 #if defined(lint)
   4005 
        /*
         * Copy/zero tunables.
         *
         * hw_copy_limit_N is what the copyin dispatch code compares the byte
         * count against when source and destination share N-byte alignment
         * (N = 1 is the path that otherwise copies byte for byte).  A value of
         * zero keeps that alignment class on the non-hardware loops; otherwise
         * only counts strictly larger than the limit branch to .big_copyin.
         *
         * use_hw_bzero: the hwblkclr() header makes its callers responsible
         * for checking this.  use_hw_bcopy is presumably the analogous switch
         * for the copy routines - TODO confirm against its users.
         */
   4006 int use_hw_bcopy = 1;
   4007 int use_hw_bzero = 1;
   4008 uint_t hw_copy_limit_1 = 0x100;
   4009 uint_t hw_copy_limit_2 = 0x200;
   4010 uint_t hw_copy_limit_4 = 0x400;
   4011 uint_t hw_copy_limit_8 = 0x400;
   4012 
   4013 #else /* !lint */
   4014 
   4015 	.align	4
   4016 	DGDEF(use_hw_bcopy)
   4017 	.word	1
   4018 	DGDEF(use_hw_bzero)
   4019 	.word	1
   4020 	DGDEF(hw_copy_limit_1)
   4021 	.word	0x100			! 256 bytes
   4022 	DGDEF(hw_copy_limit_2)
   4023 	.word	0x200			! 512 bytes
   4024 	DGDEF(hw_copy_limit_4)
   4025 	.word	0x400			! 1024 bytes
   4026 	DGDEF(hw_copy_limit_8)
   4027 	.word	0x400			! 1024 bytes
   4028 
   4029 	.align	64
   4030 	.section ".text"		! code follows; DGDEF presumably left us in a data section - TODO confirm
   4031 #endif /* !lint */
   4032 
   4033 /*
   4034  * hwblkclr - clears block-aligned, block-multiple-sized regions that are
   4035  * at least 256 bytes in length using Niagara's block stores/quad store.
   4036  * If the criteria for using this routine are not met then it calls bzero
   4037  * and returns 1.  Otherwise 0 is returned indicating success.
   4038  * Caller is responsible for ensuring use_hw_bzero is true and that
   4039  * kpreempt_disable() has been called.
         *
         * The criteria, as tested below: addr is 64-byte aligned, len is
         * 0x100 or more, and len is a multiple of 64.
         *
         * Structure: a 256-byte unrolled loop (.pz_doblock) runs while more
         * than 256 bytes remain, then a 64-byte loop (3:) clears whatever is
         * left, then a membar #Sync precedes the return.
   4040  */
   4041 #ifdef lint
   4042 /*ARGSUSED*/
   4043 int
   4044 hwblkclr(void *addr, size_t len)
   4045 {
   4046 	return(0);
   4047 }
   4048 #else /* lint */
   4049 	! %i0 - start address
   4050 	! %i1 - length of region (multiple of 64)
   4051 
   4052 	ENTRY(hwblkclr)
   4053 	save	%sp, -SA(MINFRAME), %sp
   4054 
   4055 	! Must be block-aligned
   4056 	andcc	%i0, 0x3f, %g0
   4057 	bnz,pn	%ncc, 1f
   4058 	  nop
   4059 
   4060 	! ... and must be 256 bytes or more
   4061 	cmp	%i1, 0x100
   4062 	blu,pn	%ncc, 1f
   4063 	  nop
   4064 
   4065 	! ... and length must be a multiple of 64
   4066 	andcc	%i1, 0x3f, %g0
   4067 	bz,pn	%ncc, .pz_doblock
   4068 	mov	ASI_BLK_INIT_ST_QUAD_LDD_P, %asi	! delay slot (not annulled): %asi is set on both paths
   4069 
   4070 1:	! punt, call bzero but notify the caller that bzero was used
   4071 	mov	%i0, %o0
   4072 	call	bzero
   4073 	  mov	%i1, %o1		! delay slot: second argument = length
   4074 	ret
   4075 	restore	%g0, 1, %o0	! return (1) - did not use block operations
   4076 
   4077 	! Already verified that there are at least 256 bytes to set
   4078 .pz_doblock:
        	! Each pass clears 0x100 bytes.  The first doubleword of each of
        	! the four 64-byte blocks is stored first, then the remaining
        	! seven doublewords of every block.  (Presumably the leading
        	! store initializes each cache line up front, as the same ASI
        	! is commented elsewhere in this file - TODO confirm.)
   4079 	stxa	%g0, [%i0+0x0]%asi
   4080 	stxa	%g0, [%i0+0x40]%asi
   4081 	stxa	%g0, [%i0+0x80]%asi
   4082 	stxa	%g0, [%i0+0xc0]%asi
   4083 
   4084 	stxa	%g0, [%i0+0x8]%asi
   4085 	stxa	%g0, [%i0+0x10]%asi
   4086 	stxa	%g0, [%i0+0x18]%asi
   4087 	stxa	%g0, [%i0+0x20]%asi
   4088 	stxa	%g0, [%i0+0x28]%asi
   4089 	stxa	%g0, [%i0+0x30]%asi
   4090 	stxa	%g0, [%i0+0x38]%asi
   4091 
   4092 	stxa	%g0, [%i0+0x48]%asi
   4093 	stxa	%g0, [%i0+0x50]%asi
   4094 	stxa	%g0, [%i0+0x58]%asi
   4095 	stxa	%g0, [%i0+0x60]%asi
   4096 	stxa	%g0, [%i0+0x68]%asi
   4097 	stxa	%g0, [%i0+0x70]%asi
   4098 	stxa	%g0, [%i0+0x78]%asi
   4099 
   4100 	stxa	%g0, [%i0+0x88]%asi
   4101 	stxa	%g0, [%i0+0x90]%asi
   4102 	stxa	%g0, [%i0+0x98]%asi
   4103 	stxa	%g0, [%i0+0xa0]%asi
   4104 	stxa	%g0, [%i0+0xa8]%asi
   4105 	stxa	%g0, [%i0+0xb0]%asi
   4106 	stxa	%g0, [%i0+0xb8]%asi
   4107 
   4108 	stxa	%g0, [%i0+0xc8]%asi
   4109 	stxa	%g0, [%i0+0xd0]%asi
   4110 	stxa	%g0, [%i0+0xd8]%asi
   4111 	stxa	%g0, [%i0+0xe0]%asi
   4112 	stxa	%g0, [%i0+0xe8]%asi
   4113 	stxa	%g0, [%i0+0xf0]%asi
   4114 	stxa	%g0, [%i0+0xf8]%asi
   4115 
   4116 	sub	%i1, 0x100, %i1		! bytes still to clear
   4117 	cmp	%i1, 0x100
   4118 	bgu,pt	%ncc, .pz_doblock	! exactly 0x100 left falls to the 64-byte loop
   4119 	add	%i0, 0x100, %i0		! delay slot: advance past the cleared chunk
   4120 
   4121 2:
   4122 	! Check if at least 64 bytes remain to set
   4123 	cmp	%i1,0x40
   4124 	blu	%ncc, .pz_finish
   4125 	nop
   4126 
   4127 3:
   4128 	stxa	%g0, [%i0+0x0]%asi
   4129 	stxa	%g0, [%i0+0x8]%asi
   4130 	stxa	%g0, [%i0+0x10]%asi
   4131 	stxa	%g0, [%i0+0x18]%asi
   4132 	stxa	%g0, [%i0+0x20]%asi
   4133 	stxa	%g0, [%i0+0x28]%asi
   4134 	stxa	%g0, [%i0+0x30]%asi
   4135 	stxa	%g0, [%i0+0x38]%asi
   4136 
   4137 	subcc	%i1, 0x40, %i1
   4138 	bgu,pt	%ncc, 3b		! loop until nothing remains
   4139 	add	%i0, 0x40, %i0		! delay slot: advance one block
   4140 
   4141 .pz_finish:
   4142 	membar	#Sync
   4143 	ret
   4144 	restore	%g0, 0, %o0		! return (0) - block operations were used
   4145 	SET_SIZE(hwblkclr)
   4146 #endif	/* lint */
   4147 
   4148 #ifdef	lint
   4149 /* Copy 32 bytes of data from src to dst using physical addresses */
   4150 /*ARGSUSED*/
   4151 void
   4152 hw_pa_bcopy32(uint64_t src, uint64_t dst)
   4153 {}
   4154 #else	/*!lint */
   4155 
   4156 	/*
   4157 	 * Copy 32 bytes of data from src (%o0) to dst (%o1)
   4158 	 * using physical addresses (ASI_MEM): four 8-byte loads into
   4159 	 * %o2-%o5, then four 8-byte stores; %g1/%g2 hold %pstate values. */
   4160 	ENTRY_NP(hw_pa_bcopy32)
   4161 	rdpr    %pstate, %g1		! %g1 = caller %pstate, restored on exit
   4162 	andn    %g1, PSTATE_IE, %g2	! %g2 = %pstate with PSTATE_IE (interrupt enable) cleared
   4163 	wrpr    %g0, %g2, %pstate	! run the copy with PSTATE_IE clear
   4164 	! Read all 32 source bytes before writing any; %o0 is advanced by 8 between loads.
   4165 	ldxa    [%o0]ASI_MEM, %o2
   4166 	add     %o0, 8, %o0
   4167 	ldxa    [%o0]ASI_MEM, %o3
   4168 	add     %o0, 8, %o0
   4169 	ldxa    [%o0]ASI_MEM, %o4
   4170 	add     %o0, 8, %o0
   4171 	ldxa    [%o0]ASI_MEM, %o5
   4172 	stxa    %o2, [%o1]ASI_MEM	! write the four doublewords in the same order
   4173 	add     %o1, 8, %o1
   4174 	stxa    %o3, [%o1]ASI_MEM
   4175 	add     %o1, 8, %o1
   4176 	stxa    %o4, [%o1]ASI_MEM
   4177 	add     %o1, 8, %o1
   4178 	stxa    %o5, [%o1]ASI_MEM
   4179 	! The membar is issued before the saved %pstate is written back.
   4180 	membar	#Sync
   4181 	retl
   4182 	  wrpr    %g0, %g1, %pstate	! (delay slot) restore the saved %pstate
   4183 	SET_SIZE(hw_pa_bcopy32)
   4184 #endif /* lint */
   4185 
   4186 /*
   4187  * Zero a block of storage.
   4188  *
   4189  * uzero is used by the kernel to zero a block in user address space.
   4190  */
   4191 
   4192 /*
   4193  * Control flow of the bzero/kzero/uzero routine.
   4194  *
   4195  *	For stores of fewer than 7 bytes, the bytes are zeroed one at a time.
   4196  *
   4197  *	For stores of fewer than 15 bytes, align the address on a 4-byte boundary.
   4198  *	Then store as many 4-byte chunks as fit, followed by trailing bytes.
   4199  *
   4200  *	For sizes of 15 bytes or more, align the address on an 8-byte boundary.
   4201  *	if (remaining count >= 128 && use_hw_bzero != 0) {
   4202  *		store as many 8-byte chunks as needed to block align the address
   4203  *		store using ASI_BLK_INIT_ST_QUAD_LDD_P (bzero/kzero) OR
   4204  *		store using ASI_BLK_INIT_QUAD_LDD_AIUS (uzero)
   4205  *	}
   4206  *	Store as many 8-byte chunks as fit, followed by trailing bytes.
   4207  */
   4208 
   4209 #if defined(lint)
   4210 /* Lint-only C stubs; the real implementations are the assembly in the #else branch. */
   4211 /* ARGSUSED */
   4212 int
   4213 kzero(void *addr, size_t count)
   4214 { return(0); }
   4215 
   4216 /* ARGSUSED */
   4217 void
   4218 uzero(void *addr, size_t count)
   4219 {}
   4220 
   4221 #else	/* lint */
   4222 	! uzero and kzero only set up %asi and the t_lofault handler, then branch to .do_zero.
   4223 	ENTRY(uzero)
   4224 	!
   4225 	! Set a new lo_fault handler only if we came in with one
   4226 	! already specified.
   4227 	! %o5 keeps the incoming t_lofault value for the exit and error paths.
   4228 	wr	%g0, ASI_USER, %asi	! stores at .do_zero go through %asi
   4229 	ldn	[THREAD_REG + T_LOFAULT], %o5	! %o5 = current t_lofault
   4230 	tst	%o5
   4231 	bz,pt	%ncc, .do_zero		! none set: leave t_lofault untouched
   4232 	sethi	%hi(.zeroerr), %o2	! (delay slot) runs either way; %o2 is scratch
   4233 	or	%o2, %lo(.zeroerr), %o2	! %o2 = address of .zeroerr
   4234 	membar	#Sync
   4235 	ba,pt	%ncc, .do_zero
   4236 	stn	%o2, [THREAD_REG + T_LOFAULT]	! (delay slot) t_lofault = .zeroerr
   4237 
   4238 	ENTRY(kzero)
   4239 	!
   4240 	! Always set a lo_fault handler
   4241 	! and record that fact by or-ing LOFAULT_SET into the saved value in %o5.
   4242 	wr	%g0, ASI_P, %asi
   4243 	ldn	[THREAD_REG + T_LOFAULT], %o5	! %o5 = current t_lofault
   4244 	sethi	%hi(.zeroerr), %o2
   4245 	or	%o5, LOFAULT_SET, %o5	! makes %o5 non-zero even if no handler was set
   4246 	or	%o2, %lo(.zeroerr), %o2	! %o2 = address of .zeroerr
   4247 	membar	#Sync
   4248 	ba,pt	%ncc, .do_zero
   4249 	stn	%o2, [THREAD_REG + T_LOFAULT]	! (delay slot) t_lofault = .zeroerr
   4250 
   4251 /*
   4252  * We got here because of a fault during kzero or if
   4253  * uzero or bzero was called with t_lofault non-zero.
   4254  * Otherwise we've already run screaming from the room.
   4255  * Errno value is in %g1. Note that we're here iff
   4256  * we did set t_lofault.
   4257  */
   4258 .zeroerr:
   4259 	!
   4260 	! Undo asi register setting. Just set it to be the
   4261         ! kernel default without checking.
   4262 	!
   4263 	wr	%g0, ASI_P, %asi
   4264 
   4265 	!
   4266 	! We did set t_lofault. It may well have been zero coming in.
   4267 	!
   4268 1:
   4269 	tst	%o5
   4270 	membar #Sync
   4271 	bne,pn	%ncc, 3f		! %o5 != 0: t_lofault must be restored
   4272 	andncc	%o5, LOFAULT_SET, %o5	! (delay slot) strip the flag; cc now tests the old handler
   4273 2:
   4274 	!
   4275 	! Old handler was zero. Just return the error.
   4276 	!
   4277 	retl				! return
   4278 	mov	%g1, %o0		! error code from %g1
   4279 3:
   4280 	!
   4281 	! We're here because %o5 was non-zero. It was non-zero
   4282 	! because either LOFAULT_SET was present, a previous fault
   4283 	! handler was present or both. In all cases we need to reset
   4284 	! T_LOFAULT to the value of %o5 after clearing LOFAULT_SET
   4285 	! before we either simply return the error or we invoke the
   4286 	! previously specified handler.
   4287 	! The be below tests the condition codes left by the andncc above.
   4288 	be	%ncc, 2b		! old handler was zero: return the error
   4289 	stn	%o5, [THREAD_REG + T_LOFAULT]	! (delay slot) runs on both paths
   4290 	jmp	%o5			! goto real handler
   4291 	  nop
   4292 	SET_SIZE(kzero)
   4293 	SET_SIZE(uzero)
   4294 
   4295 #endif	/* lint */
   4296 
   4297 /*
   4298  * Zero a block of storage.
   4299  */
   4300 
   4301 #if defined(lint)
   4302 
   4303 /* ARGSUSED */
   4304 void
   4305 bzero(void *addr, size_t count)
   4306 {}
   4307 
   4308 #else	/* lint */
   4309 	! bzero: zero count (%o1) bytes starting at addr (%o0); %o2-%o4 are scratch.
   4310 	ENTRY(bzero)
   4311 	wr	%g0, ASI_P, %asi		! stores below go through %asi
   4312 	! Install .zeroerr as t_lofault only if a handler was already set; %o5 keeps the old one.
   4313 	ldn	[THREAD_REG + T_LOFAULT], %o5	! save old vector
   4314 	tst	%o5
   4315 	bz,pt	%ncc, .do_zero
   4316 	sethi	%hi(.zeroerr), %o2		! (delay slot) runs either way; %o2 is scratch
   4317 	or	%o2, %lo(.zeroerr), %o2
   4318 	membar	#Sync				! sync error barrier
   4319 	stn	%o2, [THREAD_REG + T_LOFAULT]	! install new vector
   4320 	! Common body: uzero and kzero branch here with %asi and %o5 already set up.
   4321 .do_zero:
   4322 	cmp	%o1, 7
   4323 	blu,pn	%ncc, .byteclr		! count < 7: byte stores only
   4324 	nop
   4325 
   4326 	cmp	%o1, 15
   4327 	blu,pn	%ncc, .wdalign		! count < 15: word align, then 4-byte stores
   4328 	nop
   4329 
   4330 	andcc	%o0, 7, %o3		! is addr aligned on an 8 byte bound
   4331 	bz,pt	%ncc, .blkalign		! already double aligned
   4332 	sub	%o3, 8, %o3		! -(bytes till double aligned)
   4333 	add	%o1, %o3, %o1		! update o1 with new count
   4334 	! Byte stores up to the 8-byte boundary; %o3 counts up from negative to zero.
   4335 1:
   4336 	stba	%g0, [%o0]%asi
   4337 	inccc	%o3
   4338 	bl,pt	%ncc, 1b
   4339 	inc	%o0
   4340 
   4341 	! Now address is double aligned
   4342 .blkalign:
   4343 	cmp	%o1, 0x80		! check if there are 128 bytes to set
   4344 	blu,pn	%ncc, .bzero_small
   4345 	mov	%o1, %o3		! (delay slot) %o3 = byte count used by .bzero_small
   4346 
   4347 	sethi	%hi(use_hw_bzero), %o2
   4348 	ld	[%o2 + %lo(use_hw_bzero)], %o2
   4349 	tst	%o2
   4350 	bz	%ncc, .bzero_small	! use_hw_bzero == 0: skip the block-init path
   4351 	mov	%o1, %o3		! (delay slot) %o3 = byte count used by .bzero_small
   4352 	! Switch %asi to the block-init ASI: the _P one if %asi was ASI_P, else the _AIUS one.
   4353 	rd	%asi, %o3
   4354 	wr	%g0, ASI_BLK_INIT_ST_QUAD_LDD_P, %asi
   4355 	cmp	%o3, ASI_P
   4356 	bne,a	%ncc, .algnblk
   4357 	wr	%g0, ASI_BLK_INIT_QUAD_LDD_AIUS, %asi	! (annulled slot) only when %asi was not ASI_P
   4358 
   4359 .algnblk:
   4360 	andcc	%o0, 0x3f, %o3		! is block aligned?
   4361 	bz,pt	%ncc, .bzero_blk
   4362 	sub	%o3, 0x40, %o3		! -(bytes till block aligned)
   4363 	add	%o1, %o3, %o1		! o1 is the remainder
   4364 
   4365 	! Clear -(%o3) bytes till block aligned
   4366 1:
   4367 	stxa	%g0, [%o0]%asi
   4368 	addcc	%o3, 8, %o3
   4369 	bl,pt	%ncc, 1b
   4370 	add	%o0, 8, %o0
   4371 
   4372 .bzero_blk:
   4373 	and	%o1, 0x3f, %o3		! calc bytes left after blk clear
   4374 	andn	%o1, 0x3f, %o4		! calc size of blocks in bytes
   4375 
   4376 	cmp	%o4, 0x100		! 256 bytes or more
   4377 	blu,pn	%ncc, 3f
   4378 	nop
   4379 	! 256 bytes per pass: first one store at the start of each 64-byte block, then the rest.
   4380 2:
   4381 	stxa	%g0, [%o0+0x0]%asi
   4382 	stxa	%g0, [%o0+0x40]%asi
   4383 	stxa	%g0, [%o0+0x80]%asi
   4384 	stxa	%g0, [%o0+0xc0]%asi
   4385 
   4386 	stxa	%g0, [%o0+0x8]%asi
   4387 	stxa	%g0, [%o0+0x10]%asi
   4388 	stxa	%g0, [%o0+0x18]%asi
   4389 	stxa	%g0, [%o0+0x20]%asi
   4390 	stxa	%g0, [%o0+0x28]%asi
   4391 	stxa	%g0, [%o0+0x30]%asi
   4392 	stxa	%g0, [%o0+0x38]%asi
   4393 
   4394 	stxa	%g0, [%o0+0x48]%asi
   4395 	stxa	%g0, [%o0+0x50]%asi
   4396 	stxa	%g0, [%o0+0x58]%asi
   4397 	stxa	%g0, [%o0+0x60]%asi
   4398 	stxa	%g0, [%o0+0x68]%asi
   4399 	stxa	%g0, [%o0+0x70]%asi
   4400 	stxa	%g0, [%o0+0x78]%asi
   4401 
   4402 	stxa	%g0, [%o0+0x88]%asi
   4403 	stxa	%g0, [%o0+0x90]%asi
   4404 	stxa	%g0, [%o0+0x98]%asi
   4405 	stxa	%g0, [%o0+0xa0]%asi
   4406 	stxa	%g0, [%o0+0xa8]%asi
   4407 	stxa	%g0, [%o0+0xb0]%asi
   4408 	stxa	%g0, [%o0+0xb8]%asi
   4409 
   4410 	stxa	%g0, [%o0+0xc8]%asi
   4411 	stxa	%g0, [%o0+0xd0]%asi
   4412 	stxa	%g0, [%o0+0xd8]%asi
   4413 	stxa	%g0, [%o0+0xe0]%asi
   4414 	stxa	%g0, [%o0+0xe8]%asi
   4415 	stxa	%g0, [%o0+0xf0]%asi
   4416 	stxa	%g0, [%o0+0xf8]%asi
   4417 
   4418 	sub	%o4, 0x100, %o4
   4419 	cmp	%o4, 0x100
   4420 	bgu,pt	%ncc, 2b		! repeat only while more than 256 bytes remain
   4421 	add	%o0, 0x100, %o0
   4422 
   4423 3:
   4424 	! ... check if 64 bytes to set
   4425 	cmp	%o4, 0x40
   4426 	blu	%ncc, .bzero_blk_done
   4427 	nop
   4428 	! Clear the remaining whole 64-byte blocks one block per pass.
   4429 4:
   4430 	stxa	%g0, [%o0+0x0]%asi
   4431 	stxa	%g0, [%o0+0x8]%asi
   4432 	stxa	%g0, [%o0+0x10]%asi
   4433 	stxa	%g0, [%o0+0x18]%asi
   4434 	stxa	%g0, [%o0+0x20]%asi
   4435 	stxa	%g0, [%o0+0x28]%asi
   4436 	stxa	%g0, [%o0+0x30]%asi
   4437 	stxa	%g0, [%o0+0x38]%asi
   4438 
   4439 	subcc	%o4, 0x40, %o4
   4440 	bgu,pt	%ncc, 3b		! loops via the check at 3, which passes whenever this is taken
   4441 	add	%o0, 0x40, %o0
   4442 
   4443 .bzero_blk_done:
   4444 	membar	#Sync
   4445 	!
   4446 	! Undo asi register setting.
   4447 	! ASI_P if the block ASI was the _P variant, otherwise ASI_USER.
   4448 	rd	%asi, %o4
   4449 	wr	%g0, ASI_P, %asi
   4450 	cmp	%o4, ASI_BLK_INIT_ST_QUAD_LDD_P
   4451 	bne,a	%ncc, .bzero_small
   4452 	wr	%g0, ASI_USER, %asi	! (annulled slot) only when the block ASI was not the _P one
   4453 
   4454 .bzero_small:
   4455 	! Set the remaining doubles
   4456 	subcc	%o3, 8, %o3		! Can we store any doubles?
   4457 	blu,pn	%ncc, .byteclr		! fewer than 8 bytes in %o3: bytes only
   4458 	and	%o1, 7, %o1		! calc bytes left after doubles
   4459 
   4460 .dbclr:
   4461 	stxa	%g0, [%o0]%asi		! Clear the doubles
   4462 	subcc	%o3, 8, %o3
   4463 	bgeu,pt	%ncc, .dbclr		! no borrow: at least 8 more bytes remain
   4464 	add	%o0, 8, %o0
   4465 
   4466 	ba	.byteclr
   4467 	nop
   4468 	! Path for 7 <= count < 15: byte stores until word aligned, then 4-byte stores.
   4469 .wdalign:
   4470 	andcc	%o0, 3, %o3		! is addr aligned on a word boundary
   4471 	bz,pn	%ncc, .wdclr
   4472 	andn	%o1, 3, %o3		! create word sized count in %o3
   4473 
   4474 	dec	%o1			! decrement count
   4475 	stba	%g0, [%o0]%asi		! clear a byte
   4476 	ba	.wdalign
   4477 	inc	%o0			! next byte
   4478 
   4479 .wdclr:
   4480 	sta	%g0, [%o0]%asi		! 4-byte clearing loop
   4481 	subcc	%o3, 4, %o3
   4482 	bnz,pt	%ncc, .wdclr
   4483 	inc	4, %o0
   4484 
   4485 	and	%o1, 3, %o1		! leftover count, if any
   4486 
   4487 .byteclr:
   4488 	! Set the leftover bytes
   4489 	brz	%o1, .bzero_exit
   4490 	nop
   4491 
   4492 7:
   4493 	deccc	%o1			! byte clearing loop
   4494 	stba	%g0, [%o0]%asi
   4495 	bgu,pt	%ncc, 7b
   4496 	inc	%o0
   4497 
   4498 .bzero_exit:
   4499 	!
   4500 	! We're just concerned with whether t_lofault was set
   4501 	! when we came in. We end up here from either kzero()
   4502 	! or bzero(). kzero() *always* sets a lofault handler.
   4503 	! It ors LOFAULT_SET into %o5 to indicate it has done
   4504 	! this even if the value of %o5 is otherwise zero.
   4505 	! bzero() sets a lofault handler *only* if one was
   4506 	! previously set. Accordingly we need to examine
   4507 	! %o5 and if it is non-zero be sure to clear LOFAULT_SET
   4508 	! before resetting the error handler.
   4509 	!
   4510 	tst	%o5
   4511 	bz	%ncc, 1f		! nothing was installed: nothing to restore
   4512 	andn	%o5, LOFAULT_SET, %o5	! (delay slot) strip the flag bit
   4513 	membar	#Sync				! sync error barrier
   4514 	stn	%o5, [THREAD_REG + T_LOFAULT]	! restore old t_lofault
   4515 1:
   4516 	retl
   4517 	clr	%o0			! return (0)
   4518 
   4519 	SET_SIZE(bzero)
   4520 #endif	/* lint */
   4521