/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright (c) 2000, 2010, Oracle and/or its affiliates. All rights reserved.
 */

	.file	"memcpy.s"

/*
 * memcpy(s1, s2, len)
 *
 * Copy s2 to s1, always copying len bytes.
 * Note: this C code does not work for overlapped copies.
 *       memmove() and bcopy() do.
 *
 * Fast assembler language version of the following C-program for memcpy
 * which represents the `standard' for the C-library.
 *
 *	void * 
 *	memcpy(void *s, const void *s0, size_t n)
 *	{
 *		if (n != 0) {
 *	   	    char *s1 = s;
 *		    const char *s2 = s0;
 *		    do {
 *			*s1++ = *s2++;
 *		    } while (--n != 0);
 *		}
 *		return (s);
 *	}
 */

#include <sys/asm_linkage.h>
#include <sys/sun4asi.h>
#include <sys/trap.h>

#define	ICACHE_LINE_SIZE	64
#define	BLOCK_SIZE	64
#define	FPRS_FEF	0x4

#define SHORTCOPY	3
#define	SMALL_MAX	39
#define	MEDIUM_MAX	255
#define MED_WMAX	256	/* max copy for medium word-aligned case */
#define MED_MAX		256	/* max copy for medium longword-aligned case */

#ifndef BSTORE_SIZE
#define BSTORE_SIZE	256	/* min copy size for block store */
#endif

	ANSI_PRAGMA_WEAK(memmove,function)
	ANSI_PRAGMA_WEAK(memcpy,function)

	ENTRY(memmove)
	cmp	%o1, %o0	! if the from (src) address >= the to (dst)
	bgeu	%ncc, .forcpy	! address, a forward copy is safe
	sub	%o0, %o1, %o4	! get difference of two addresses
	cmp	%o2, %o4	! compare size and difference of addresses
	bleu	%ncc, .forcpy	! if size <= difference, no overlap; copy forward
	nop

        !
        ! an overlapped copy that must be done "backwards"
        !
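/*
 * A minimal C model of the strategy used from .ovbc onward (illustrative
 * only; backward_copy is a made-up name, not part of this file): when the
 * regions overlap and src < dst, copying from the last byte down to the
 * first never overwrites source bytes that still need to be read.
 *
 *	void *
 *	backward_copy(void *dst, const void *src, size_t n)
 *	{
 *		char *d = (char *)dst + n;
 *		const char *s = (const char *)src + n;
 *
 *		while (n-- != 0)
 *			*--d = *--s;
 *		return (dst);
 *	}
 *
 * The assembly below applies the same idea, byte by byte for short copies
 * and with 8-byte ldd/std accesses once the destination has been aligned.
 */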
.ovbc:
	mov	%o0, %g1		! save dest address for return val
	add     %o1, %o2, %o1           ! get to end of source space
        add     %o0, %o2, %o0           ! get to end of destination space

	cmp	%o2, 24
	bgeu,pn	%ncc, .dbalign
	nop
	cmp	%o2, 4
	blt,pn	%ncc, .byte
	sub	%o2, 3, %o2
.byte4loop:
	ldub	[%o1-1], %o3		! load last byte
	stb	%o3, [%o0-1]		! store last byte
	sub	%o1, 4, %o1
	ldub	[%o1+2], %o3		! load 2nd from last byte
	stb	%o3, [%o0-2]		! store 2nd from last byte
	sub	%o0, 4, %o0
	ldub	[%o1+1], %o3		! load 3rd from last byte
	stb	%o3, [%o0+1]		! store 3rd from last byte
	subcc	%o2, 4, %o2
	ldub	[%o1], %o3		! load 4th from last byte
	bgu,pt	%ncc, .byte4loop
	stb	%o3, [%o0]		! store 4th from last byte
.byte:
	addcc	%o2, 3, %o2
	bz,pt	%ncc, .exit
.byteloop:
	dec	%o1			! decrement src address
	ldub	[%o1], %o3		! read a byte
	dec	%o0			! decrement dst address
	deccc	%o2			! decrement count
	bgu,pt	%ncc, .byteloop		! loop until done
	stb	%o3, [%o0]		! write byte
.exit:
	retl
	mov	%g1, %o0

	.align	16
.dbalign:
	andcc   %o0, 7, %o5		! bytes till DST 8 byte aligned
	bz,pt	%ncc, .dbmed
	sub	%o2, %o5, %o2		! update count
.dbalign1:
	dec	%o1			! decrement src address
	ldub	[%o1], %o3		! read a byte
	dec	%o0			! decrement dst address
	deccc	%o5			! decrement count
	bgu,pt	%ncc, .dbalign1		! loop until done
	stb	%o3, [%o0]		! store a byte

! check for src long word alignment
.dbmed:
	andcc	%o1, 7, %g0		! chk src long word alignment
	bnz,pn	%ncc, .dbbck
	nop
!
! Following code is for overlapping copies where src and dest
! are long word aligned
!
	cmp	%o2, 4095
	blt,pn	%ncc, .dbmedl32enter	! go to no prefetch code
	nop
	prefetch [%o1 - (1 * BLOCK_SIZE)], 20	! into the prefetch cache
	sub	%o2, 63, %o2		! adjust length to allow cc test
					! for end of loop
	prefetch [%o1 - (2 * BLOCK_SIZE)], 20	! into the prefetch cache
	rd	%fprs, %o3		! o3 = fprs
	! if fprs.fef == 0, set it.  Checking it requires 2 instructions.
	! So set it anyway, without checking.
	prefetch [%o1 - (3 * BLOCK_SIZE)], 20	! into the prefetch cache
	wr      %g0, 0x4, %fprs         ! fprs.fef = 1
	prefetch [%o1 - (4 * BLOCK_SIZE)], 20	! into the prefetch cache
.dbmedl64:
	prefetch [%o1 - (5 * BLOCK_SIZE)], 20	! into the prefetch cache
	ldd	[%o1-8], %d4		! load
	subcc	%o2, 64, %o2		! decrement length count
	std	%d4, [%o0-8]		! and store
	ldd	[%o1-16], %d2		! a block of 64 bytes
	sub	%o1, 64, %o1		! decrease src ptr by 64
	std	%d2, [%o0-16]
	sub	%o0, 64, %o0		! decrease dst ptr by 64
	ldd	[%o1+40], %d4
	std	%d4, [%o0+40]
	ldd	[%o1+32], %d2
	std	%d2, [%o0+32]
	ldd	[%o1+24], %d4
	std	%d4, [%o0+24]
	ldd	[%o1+16], %d2
	std	%d2, [%o0+16]
	ldd	[%o1+8], %d4
	std	%d4, [%o0+8]
	ldd	[%o1], %d2
	bgu,pt	%ncc, .dbmedl64		! repeat if at least 64 bytes left
	std	%d2, [%o0]
	add	%o2, 63, %o2		! restore offset adjustment
	and	%o3, 0x4, %o3           ! fprs.du = fprs.dl = 0
	wr	%o3, %g0, %fprs         ! fprs = o3   restore fprs	
.dbmedl32enter:
	subcc	%o2, 31, %o2		! adjust length to allow cc test
					! for end of loop
	ble,pt  %ncc, .dbmedl31		! skip big loop if less than 32
	nop
.dbmedl32:
	ldx	[%o1-8], %o4		! load
	subcc	%o2, 32, %o2		! decrement length count
	stx	%o4, [%o0-8]		! and store
	ldx	[%o1-16], %o3		! a block of 32 bytes
	sub	%o1, 32, %o1		! decrease src ptr by 32
	stx	%o3, [%o0-16]
	ldx	[%o1+8], %o4
	sub	%o0, 32, %o0		! decrease dst ptr by 32
	stx	%o4, [%o0+8]
	ldx	[%o1], %o3
	bgu,pt	%ncc, .dbmedl32		! repeat if at least 32 bytes left
	stx	%o3, [%o0]
.dbmedl31:
	addcc	%o2, 16, %o2		! adjust remaining count
	ble,pt	%ncc, .dbmedl15		! skip if 15 or fewer bytes left
	nop				!
	ldx	[%o1-8], %o4		! load and store 16 bytes
	sub	%o1, 16, %o1		! decrease src ptr by 16
	stx	%o4, [%o0-8]		!
	sub	%o2, 16, %o2		! decrease count by 16
	ldx	[%o1], %o3		!
	sub	%o0, 16, %o0		! decrease dst ptr by 16
	stx	%o3, [%o0]
.dbmedl15:
	addcc	%o2, 15, %o2		! restore count
	bz,pt	%ncc, .dbexit		! exit if finished
	nop
	cmp	%o2, 8
	blt,pt	%ncc, .dbremain		! skip if 7 or fewer bytes left
	nop
	ldx	[%o1-8], %o4		! load 8 bytes
	sub	%o1, 8, %o1		! decrease src ptr by 8
	stx	%o4, [%o0-8]		! and store 8 bytes
	subcc	%o2, 8, %o2		! decrease count by 8
	bnz	%ncc, .dbremain		! if not finished, copy remaining bytes
	sub	%o0, 8, %o0		! decrease dst ptr by 8
	retl
	mov	%g1, %o0

!
! Following code is for overlapping copies where src and dest
! are not long word aligned
!
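/*
 * The loop below relies on the VIS alignaddr/faligndata idiom: SRC is
 * read with aligned 8-byte loads, and faligndata extracts the 8
 * misaligned bytes spanning two consecutive loads, using the offset
 * latched into GSR.ALIGN by alignaddr.  A minimal C model of one
 * extraction step (illustrative only; assumes <stdint.h> and SPARC's
 * big-endian byte order, with "first" the lower-addressed doubleword):
 *
 *	uint64_t
 *	extract8(uint64_t first, uint64_t second, unsigned off)
 *	{
 *		if (off == 0)		// off = original SRC address & 7
 *			return (first);
 *		return ((first << (8 * off)) | (second >> (8 * (8 - off))));
 *	}
 */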
	.align	16
.dbbck:
	rd	%fprs, %o3		! o3 = fprs
 
	! if fprs.fef == 0, set it.  Checking it requires 2 instructions.
	! So set it anyway, without checking.
	wr      %g0, 0x4, %fprs         ! fprs.fef = 1

	alignaddr %o1, %g0, %o5		! align src
	ldd	[%o5], %d0		! get first 8 byte block
	andn	%o2, 7, %o4		! prepare src ptr for finishup code
	cmp	%o2, 32
	blt,pn	%ncc, .dbmv8
	sub	%o1, %o4, %o1		!
	cmp	%o2, 4095		! check for short memmoves
	blt,pn	%ncc, .dbmv32enter	! go to no prefetch code
.dbmv64:
	ldd	[%o5-8], %d2		! load 8 bytes
	ldd	[%o5-16], %d4		! load 8 bytes
	sub	%o5, 64, %o5		!
	ldd	[%o5+40], %d6		! load 8 bytes
	sub	%o0, 64, %o0		!
	ldd	[%o5+32], %d8		! load 8 bytes
	sub	%o2, 64, %o2		! 64 less bytes to copy
	ldd	[%o5+24], %d18		! load 8 bytes
	cmp	%o2, 64			! do we have < 64 bytes remaining
	ldd	[%o5+16], %d28		! load 8 bytes
	ldd	[%o5+8], %d30		! load 8 bytes
	prefetch [%o5 - (5 * BLOCK_SIZE)], 20	! into the prefetch cache
	faligndata %d2, %d0, %d10	! extract 8 bytes out
	ldd	[%o5], %d0		! load 8 bytes
	std	%d10, [%o0+56]		! store the current 8 bytes
	faligndata %d4, %d2, %d12	! extract 8 bytes out
	std	%d12, [%o0+48]		! store the current 8 bytes
	faligndata %d6, %d4, %d14	! extract 8 bytes out
	std	%d14, [%o0+40]		! store the current 8 bytes
	faligndata %d8, %d6, %d16	! extract 8 bytes out
	std	%d16, [%o0+32]		! store the current 8 bytes
	faligndata %d18, %d8, %d20	! extract 8 bytes out
	std	%d20, [%o0+24]		! store the current 8 bytes
	faligndata %d28, %d18, %d22	! extract 8 bytes out
	std	%d22, [%o0+16]		! store the current 8 bytes
	faligndata %d30, %d28, %d24	! extract 8 bytes out
	std	%d24, [%o0+8]		! store the current 8 bytes
	faligndata %d0, %d30, %d26	! extract 8 bytes out
	bgeu,pt	%ncc, .dbmv64
	std	%d26, [%o0]		! store the current 8 bytes

	cmp	%o2, 32
	blt,pn	%ncc, .dbmvx
	nop
.dbmv32:
	ldd	[%o5-8], %d2		! load 8 bytes
.dbmv32enter:
	ldd	[%o5-16], %d4		! load 8 bytes
	sub	%o5, 32, %o5		!
	ldd	[%o5+8], %d6		! load 8 bytes
	sub	%o0, 32, %o0		! 
	faligndata %d2, %d0, %d10	! extract 8 bytes out
	ldd	[%o5], %d0		! load 8 bytes
	sub     %o2,32, %o2		! 32 less bytes to copy
	std	%d10, [%o0+24]		! store the current 8 bytes
	cmp	%o2, 32			! do we have < 32 bytes remaining
	faligndata %d4, %d2, %d12	! extract 8 bytes out
	std	%d12, [%o0+16]		! store the current 8 bytes
	faligndata %d6, %d4, %d14	! extract 8 bytes out
	std	%d14, [%o0+8]		! store the current 8 bytes
	faligndata %d0, %d6, %d16	! extract 8 bytes out
	bgeu,pt	%ncc, .dbmv32
	std	%d16, [%o0]		! store the current 8 bytes
.dbmvx:
	cmp	%o2, 8			! do we have < 8 bytes remaining
	blt,pt	%ncc, .dbmvfinish	! if yes, skip to finish up code
	nop
.dbmv8:
	ldd	[%o5-8], %d2
	sub	%o0, 8, %o0		! since we are at the end
					! when we first enter the loop
	sub     %o2, 8, %o2		! 8 less bytes to copy
	sub	%o5, 8, %o5
	cmp	%o2, 8			! do we have < 8 bytes remaining
	faligndata %d2, %d0, %d8        ! extract 8 bytes out
	std	%d8, [%o0]		! store the current 8 bytes
	bgeu,pt	%ncc, .dbmv8
	fmovd	%d2, %d0
.dbmvfinish:
	and	%o3, 0x4, %o3           ! fprs.du = fprs.dl = 0
	tst	%o2
	bz,pt	%ncc, .dbexit
	wr	%o3, %g0, %fprs         ! fprs = o3   restore fprs

.dbremain:
	cmp	%o2, 4
	blt,pn	%ncc, .dbbyte
	nop
	ldub	[%o1-1], %o3		! load last byte
	stb	%o3, [%o0-1]		! store last byte
	sub	%o1, 4, %o1
	ldub	[%o1+2], %o3		! load 2nd from last byte
	stb	%o3, [%o0-2]		! store 2nd from last byte
	sub	%o0, 4, %o0
	ldub	[%o1+1], %o3		! load 3rd from last byte
	stb	%o3, [%o0+1]		! store 3rd from last byte
	subcc	%o2, 4, %o2
	ldub	[%o1], %o3		! load 4th from last byte
	stb	%o3, [%o0]		! store 4th from last byte	
	bz,pt	%ncc, .dbexit
.dbbyte:
	dec	%o1			! decrement src address
	ldub	[%o1], %o3		! read a byte
	dec	%o0			! decrement dst address
	deccc	%o2			! decrement count
	bgu,pt	%ncc, .dbbyte		! loop until done
	stb	%o3, [%o0]		! write byte
.dbexit:
	retl
        mov     %g1, %o0
	SET_SIZE(memmove)


	.align ICACHE_LINE_SIZE
	ENTRY(memcpy)
					! adjust instruction alignment
	nop				! Do not remove; these nops affect
	nop				! icache alignment and performance
.forcpy:
	cmp	%o2, SMALL_MAX		! check for not small case
	bgu,pn	%ncc, .medium		! go to larger cases
	mov	%o0, %g1		! save %o0
	cmp	%o2, SHORTCOPY		! check for really short case
	ble,pt	%ncc, .smallleft	!
	or	%o0, %o1, %o3		! prepare alignment check
	andcc	%o3, 0x3, %g0		! test for alignment
	bz,pt	%ncc, .smallword	! branch to word aligned case
	sub	%o2, 3, %o2		! adjust count to allow cc zero test
.smallnotalign4:
	ldub	[%o1], %o3		! read byte
	subcc	%o2, 4, %o2		! reduce count by 4
	stb	%o3, [%o0]		! write byte
	ldub	[%o1+1], %o3		! repeat for a total of 4 bytes
	add	%o1, 4, %o1		! advance SRC by 4
	stb	%o3, [%o0+1]
	ldub	[%o1-2], %o3
	add	%o0, 4, %o0		! advance DST by 4
	stb	%o3, [%o0-2]
	ldub	[%o1-1], %o3
	bgu,pt	%ncc, .smallnotalign4	! loop til 3 or fewer bytes remain
	stb	%o3, [%o0-1]
	add	%o2, 3, %o2		! restore count
.smallleft:
	tst	%o2
	bz,pt	%ncc, .smallexit
	nop
.smallleft3:				! 1, 2, or 3 bytes remain
	ldub	[%o1], %o3		! load one byte
	deccc	%o2			! reduce count for cc test
	bz,pt	%ncc, .smallexit
	stb	%o3, [%o0]		! store one byte
	ldub	[%o1+1], %o3		! load second byte
	deccc	%o2
	bz,pt	%ncc, .smallexit
	stb	%o3, [%o0+1]		! store second byte
	ldub	[%o1+2], %o3		! load third byte
	stb	%o3, [%o0+2]		! store third byte
	retl
	mov	%g1, %o0		! restore %o0

	.align	16
	nop				! affects loop icache alignment
.smallwords:
	lduw	[%o1], %o3		! read word
.smallwordx:
	subcc	%o2, 8, %o2		! update count
	stw	%o3, [%o0]		! write word
	add	%o1, 8, %o1		! update SRC
	lduw	[%o1-4], %o3		! read word
	add	%o0, 8, %o0		! update DST
	bgu,pt	%ncc, .smallwords	! loop until done
	stw	%o3, [%o0-4]		! write word
	addcc	%o2, 7, %o2		! restore count
	bz,pt	%ncc, .smallexit	! check for completion
	nop
	cmp	%o2, 4			! check for 4 or more bytes left
	blt	.smallleft3		! if not, go to finish up
	nop
	lduw	[%o1], %o3
	add	%o1, 4, %o1
	subcc	%o2, 4, %o2
	stw	%o3, [%o0]
	add	%o0, 4, %o0
	bnz,pt	%ncc, .smallleft3
	nop
	retl
	mov	%g1, %o0		! restore %o0

.smallword:
	subcc	%o2, 4, %o2		! update count
	bgu,pt	%ncc, .smallwordx
	lduw	[%o1], %o3		! read word
	addcc	%o2, 3, %o2		! restore count
	bz,pt	%ncc, .smallexit
	stw	%o3, [%o0]		! write word
	deccc	%o2			! reduce count for cc test
	ldub	[%o1+4], %o3		! load one byte
	bz,pt	%ncc, .smallexit
	stb	%o3, [%o0+4]		! store one byte
	ldub	[%o1+5], %o3		! load second byte
	deccc	%o2
	bz,pt	%ncc, .smallexit
	stb	%o3, [%o0+5]		! store second byte
	ldub	[%o1+6], %o3		! load third byte
	stb	%o3, [%o0+6]		! store third byte
.smallexit:
	retl
	mov	%g1, %o0		! restore %o0
	.align 16
.medium:
	neg	%o0, %o5
	neg	%o1, %o3	
	andcc	%o5, 7, %o5	! bytes till DST 8 byte aligned
	and	%o3, 7, %o3	! bytes till SRC 8 byte aligned
	
	bz	%ncc, 2f
	sub	%o5, %o3, %o3	! -(bytes till SRC aligned after DST aligned)
				! o3={-7, -6, ... 7}  o3>0 => SRC overaligned

	sub	%o2, %o5, %o2	! update count

1:
	ldub	[%o1], %o4
	deccc	%o5
	inc	%o1
	stb	%o4, [%o0]
	bgu,pt	%ncc, 1b
	inc	%o0

	! Now DST is 8-byte aligned.  o0, o1, o2 are current.

2:
	andcc	%o1, 0x3, %g0		! test alignment
	bnz,pt	%ncc, .mediumsetup	! branch to skip aligned cases
					! if src, dst not aligned
	prefetch [%o1 + (1 * BLOCK_SIZE)], 20

/*
 * Handle all cases where src and dest are aligned on word
 * or long word boundaries.  Use unrolled loops for better
 * performance.  This option wins over standard large data
 * move when source and destination are in cache for medium
 * to short data moves.
 */
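/*
 * A minimal C sketch of the unrolled word loop used by .medw16 below
 * (illustrative only; word_copy16 is a made-up name, n is a byte count,
 * and the real code also handles the 1-15 byte tail and issues its
 * prefetches before entering the loop):
 *
 *	void
 *	word_copy16(uint32_t *d, const uint32_t *s, size_t n)
 *	{
 *		while (n >= 16) {
 *			d[0] = s[0]; d[1] = s[1];
 *			d[2] = s[2]; d[3] = s[3];
 *			d += 4; s += 4; n -= 16;
 *		}
 *	}
 */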
	andcc	%o1, 0x7, %g0		! test word alignment
	bz,pt	%ncc, .medlword		! branch to long word aligned case
	prefetch [%o1 + (2 * BLOCK_SIZE)], 20
	cmp	%o2, MED_WMAX		! limit to store buffer size
	bgu,pt	%ncc, .mediumrejoin	! if above limit, rejoin main code path
	nop
	subcc	%o2, 15, %o2		! adjust length to allow cc test
					! for end of loop
	ble,pt	%ncc, .medw15		! skip big loop if less than 16
	prefetch [%o1 + (3 * BLOCK_SIZE)], 20
/*
 * no need to put prefetch in loop as prefetches have
 * already been issued for maximum loop size
 */
.medw16:
	ld	[%o1], %o4		! load
	subcc	%o2, 16, %o2		! decrement length count
	stw	%o4, [%o0]		! and store
	ld	[%o1+4], %o3		! a block of 16 bytes
	add	%o1, 16, %o1		! increase src ptr by 16
	stw	%o3, [%o0+4]
	ld	[%o1-8], %o4
	add	%o0, 16, %o0		! increase dst ptr by 16
	stw	%o4, [%o0-8]
	ld	[%o1-4], %o3
	bgu,pt	%ncc, .medw16		! repeat if at least 16 bytes left
	stw	%o3, [%o0-4]
.medw15:
	addcc	%o2, 15, %o2		! restore count
	bz,pt	%ncc, .medwexit		! exit if finished
	nop
	cmp	%o2, 8
	blt,pt	%ncc, .medw7		! skip if 7 or fewer bytes left
	nop				!
	ld	[%o1], %o4		! load 4 bytes
	subcc	%o2, 8, %o2		! decrease count by 8
	stw	%o4, [%o0]		! and store 4 bytes
	add	%o1, 8, %o1		! increase src ptr by 8
	ld	[%o1-4], %o3		! load 4 bytes
	add	%o0, 8, %o0		! increase dst ptr by 8
	stw	%o3, [%o0-4]		! and store 4 bytes
	bz	%ncc, .medwexit		! exit if finished
	nop
.medw7:					! count is ge 1, less than 8
	cmp	%o2, 3			! check for 4 bytes left
	ble,pt	%ncc, .medw3		! skip if 3 or fewer bytes left
	nop				!
	ld	[%o1], %o4		! load 4 bytes
	sub	%o2, 4, %o2		! decrease count by 4
	add	%o1, 4, %o1		! increase src ptr by 4
	stw	%o4, [%o0]		! and store 4 bytes
	add	%o0, 4, %o0		! increase dst ptr by 4
	tst	%o2			! check for zero bytes left
	bz	%ncc, .medwexit		! exit if finished
	nop
.medw3:					! count is known to be 1, 2, or 3
	deccc	%o2			! reduce count by one
	ldub	[%o1], %o3		! load one byte
	bz,pt	%ncc, .medwexit		! exit if last byte
	stb	%o3, [%o0]		! store one byte
	ldub	[%o1+1], %o3		! load second byte
	deccc	%o2			! reduce count by one
	bz,pt	%ncc, .medwexit		! exit if last byte
	stb	%o3, [%o0+1]		! store second byte
	ldub	[%o1+2], %o3		! load third byte
	stb	%o3, [%o0+2]		! store third byte
.medwexit:
	retl
	mov	%g1, %o0		! restore %o0
	
/*
 * Special case for handling when src and dest are both long word aligned
 * and total data to move is between SMALL_MAX and MED_MAX bytes
 */

	.align 16
	nop
.medlword:				! long word aligned
					! length > SMALL_MAX
	cmp	%o2, MED_MAX		! limit to store buffer size
	bgu,pt	%ncc, .mediumrejoin	! if above limit, rejoin main code path
	nop
	subcc	%o2, 31, %o2		! adjust length to allow cc test
					! for end of loop
	ble,pt	%ncc, .medl31		! skip big loop if less than 32
	prefetch [%o1 + (3 * BLOCK_SIZE)], 20	! into the l2 cache
/*
 * no need to put prefetch in loop as prefetches have
 * already been issued for maximum loop size
 */
.medl32:
	ldx	[%o1], %o4		! load
	subcc	%o2, 32, %o2		! decrement length count
	stx	%o4, [%o0]		! and store
	ldx	[%o1+8], %o3		! a block of 32 bytes
	add	%o1, 32, %o1		! increase src ptr by 32
	stx	%o3, [%o0+8]
	ldx	[%o1-16], %o4
	add	%o0, 32, %o0		! increase dst ptr by 32
	stx	%o4, [%o0-16]
	ldx	[%o1-8], %o3
	bgu,pt	%ncc, .medl32		! repeat if at least 32 bytes left
	stx	%o3, [%o0-8]
.medl31:
	addcc	%o2, 16, %o2		! adjust remaining count
	ble,pt	%ncc, .medl15		! skip if 15 or fewer bytes left
	nop				!
	ldx	[%o1], %o4		! load and store 16 bytes
	add	%o1, 16, %o1		! increase src ptr by 16
	stx	%o4, [%o0]		!
	sub	%o2, 16, %o2		! decrease count by 16
	ldx	[%o1-8], %o3		!
	add	%o0, 16, %o0		! increase dst ptr by 16
	stx	%o3, [%o0-8]
.medl15:
	addcc	%o2, 15, %o2		! restore count
	bz,pt	%ncc, .medwexit		! exit if finished
	nop
	cmp	%o2, 8
	blt,pt	%ncc, .medw7		! skip if 7 or fewer bytes left
	nop
	ldx	[%o1], %o4		! load 8 bytes
	add	%o1, 8, %o1		! increase src ptr by 8
	stx	%o4, [%o0]		! and store 8 bytes
	subcc	%o2, 8, %o2		! decrease count by 8
	bz	%ncc, .medwexit		! exit if finished
	add	%o0, 8, %o0		! increase dst ptr by 8
	ba	.medw7
	nop

	.align 16
	nop
	nop
	nop
.mediumsetup:
	prefetch [%o1 + (2 * BLOCK_SIZE)], 21
.mediumrejoin:
	rd	%fprs, %o4		! check for unused FPU
	
	add	%o1, 8, %o1		! prepare to round SRC upward

	sethi	%hi(0x1234567f), %o5	! For GSR.MASK 
	or	%o5, 0x67f, %o5

	andcc	%o4, FPRS_FEF, %o4	! test FEF, fprs.du = fprs.dl = 0
	bz,a	%ncc, 3f
	wr	%g0, FPRS_FEF, %fprs	! fprs.fef = 1
3:
	cmp	%o2, MEDIUM_MAX
	bmask	%o5, %g0, %g0

	! Compute o5 (number of bytes that need copying using the main loop).
	! First, compute for the medium case.
	! Then, if large case, o5 is replaced by count for block alignment.
	! Be careful not to read past end of SRC
	! Currently, o2 is the actual count remaining
	!            o3 is how much sooner we'll cross the alignment boundary
	!                in SRC compared to in DST
	!
	! Examples:  Let # denote bytes that should not be accessed
	!            Let x denote a byte already copied to align DST
	!            Let . and - denote bytes not yet copied
	!            Let | denote double alignment boundaries
	!
	!            DST:  ######xx|........|--------|..######   o2 = 18
	!                          o0
	!
	!  o3 = -3:  SRC:  ###xx...|.....---|-----..#|########   o5 = 8
	!                          o1
	!
	!  o3 =  0:  SRC:  ######xx|........|--------|..######   o5 = 16-8 = 8
	!                                   o1
	!
	!  o3 = +1:  SRC:  #######x|x.......|.-------|-..#####   o5 = 16-8 = 8
	!                                   o1
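	!
	! Worked example (illustrative): with o2 = 18 and o3 = -3 as in the
	! first diagram, movrlz leaves o5 = 0, so o5 = 0 + 18 + (-3) = 15
	! and andn(15, 7) = 8 bytes are handled by the main loop.  With
	! o3 = 0, o5 = -8 + 18 + 0 = 10 and andn(10, 7) = 8, matching the
	! diagram again.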

	or	%g0, -8, %o5
	alignaddr %o1, %g0, %o1		! set GSR.ALIGN and align o1

	movrlz	%o3, %g0, %o5		! subtract 8 from o2+o3 only if o3>=0
	add	%o5, %o2, %o5
	add	%o5, %o3, %o5

	bleu	%ncc, 4f
	andn	%o5, 7, %o5		! 8 byte aligned count
	neg	%o0, %o5		! 'large' case
	and	%o5, BLOCK_SIZE-1, %o5  ! bytes till DST block aligned
4:	
	brgez,a	%o3, .beginmedloop
	ldd	[%o1-8], %d0

	add	%o1, %o3, %o1		! back up o1
5:
	ldda	[%o1]ASI_FL8_P, %d2
	inc	%o1
	andcc	%o1, 7, %g0
	bnz	%ncc, 5b
	bshuffle %d0, %d2, %d0		! shifts d0 left 1 byte and or's in d2

.beginmedloop:	
	tst	%o5
	bz	%ncc, .endmedloop
	sub	%o2, %o5, %o2		! update count for later

	! Main loop to write out doubles.  Note: o5 & 7 == 0
	
	ldd	[%o1], %d2
	subcc	%o5, 8, %o5		! update local count
	bz,pn	%ncc, 1f
	add	%o1, 8, %o1		! update SRC

.medloop:
	faligndata %d0, %d2, %d4
	ldd	[%o1], %d0
	subcc	%o5, 8, %o5		! update local count
	add	%o1, 16, %o1		! update SRC
	std	%d4, [%o0]
	bz,pn	%ncc, 2f
	faligndata %d2, %d0, %d6
	ldd	[%o1 - 8], %d2
	subcc	%o5, 8, %o5		! update local count
	std	%d6, [%o0 + 8]
	bnz,pt	%ncc, .medloop
	add	%o0, 16, %o0		! update DST

1:	
	faligndata %d0, %d2, %d4
	fmovd	%d2, %d0
	std	%d4, [%o0]
	ba	.endmedloop
	add	%o0, 8, %o0
	
2:
	std	%d6, [%o0 + 8]
	sub	%o1, 8, %o1
	add	%o0, 16, %o0
	

.endmedloop:
	! Currently, o1 is pointing to the next double-aligned byte in SRC
	! The 8 bytes starting at [o1-8] are available in d0
	! At least one, and possibly all, of these need to be written.

	cmp	%o2, BLOCK_SIZE	
	bgu	%ncc, .large		! otherwise, less than 16 bytes left
	
#if 0

	/* This code will use partial stores.  */

	mov	%g0, %o5
	and	%o3, 7, %o3		! Number of bytes needed to completely
					! fill %d0 with good (unwritten) data.

	subcc	%o2, 8, %o2		! update count (maybe too much)
	movl	%ncc, %o2, %o5		
	addcc	%o3, %o5, %o5		! extra bytes we can stuff into %d0
	sub	%o3, %o5, %o3		! update o3 (# bad bytes in %d0)

	bz	%ncc, 2f
	alignaddr %o3, %g0, %g0		! set GSR.ALIGN
	
1:
	deccc	%o5
	ldda	[%o1]ASI_FL8_P, %d2
	inc	%o1
	bgu	%ncc, 1b
	bshuffle %d0, %d2, %d0		! shifts d0 left 1 byte and or's in d2

2:
	not     %o3
	faligndata %d0, %d0, %d0	! shift bytes to the left
	and	%o3, 7, %o3		! last byte to be stored in [%o0+%o3]
	edge8n	%g0, %o3, %o5
	stda	%d0, [%o0]%o5, ASI_PST8_P
	brlez	%o2, .mediumexit		
	add	%o0, %o3, %o0		! update DST to last stored byte
3:	
	inc	%o0
	deccc	%o2
	ldub	[%o1], %o3
	stb	%o3, [%o0]
	bgu	%ncc, 3b
	inc	%o1

#else

	andcc	%o3, 7, %o5		! Number of bytes needed to completely
					! fill %d0 with good (unwritten) data.
	bz	%ncc, 2f
	sub	%o5, 8, %o3		! -(number of good bytes in %d0)
	cmp	%o2, 8
	bl,a	%ncc, 3f		! Not enough bytes to fill %d0
	add	%o1, %o3, %o1 		! Back up %o1

1:
	deccc	%o5
	ldda	[%o1]ASI_FL8_P, %d2
	inc	%o1
	bgu	%ncc, 1b
	bshuffle %d0, %d2, %d0		! shifts d0 left 1 byte and or's in d2

2:	
	subcc	%o2, 8, %o2
	std	%d0, [%o0]
	bz	%ncc, .mediumexit
	add	%o0, 8, %o0
3:	
	ldub	[%o1], %o3
	deccc	%o2
	inc	%o1
	stb	%o3, [%o0]
	bgu	%ncc, 3b
	inc	%o0
#endif	

.mediumexit:
        wr      %o4, %g0, %fprs		! fprs = o4   restore fprs
	retl
        mov     %g1, %o0


	.align ICACHE_LINE_SIZE
.large:
	! The following test for BSTORE_SIZE is used to decide whether
	! to store data with a block store or with individual stores.
	! The block store wins when the amount of data is so large
	! that it causes other application data to be moved out
	! of the L1 or L2 cache.
	! On a Panther, block store can lose more often because block
	! store forces the stored data to be removed from the L3 cache.
	!
	sethi	%hi(BSTORE_SIZE),%o5
	or	%o5,%lo(BSTORE_SIZE),%o5
	cmp	%o2, %o5
	bgu	%ncc, .xlarge		

	! %o0 I/O DST is 64-byte aligned
	! %o1 I/O 8-byte aligned (and we've set GSR.ALIGN)
	! %d0 I/O already loaded with SRC data from [%o1-8]
	! %o2 I/O count (number of bytes that need to be written)
	! %o3 I   Not written.  If zero, then SRC is double aligned.
	! %o4 I   Not written.  Holds fprs.
	! %o5   O The number of doubles that remain to be written.

	! Load the rest of the current block 
	! Recall that %o1 is further into SRC than %o0 is into DST

	prefetch [%o0 + (0 * BLOCK_SIZE)], 22
	prefetch [%o0 + (1 * BLOCK_SIZE)], 22
	prefetch [%o0 + (2 * BLOCK_SIZE)], 22
	ldd	[%o1], %f2
	prefetch [%o1 + (3 * BLOCK_SIZE)], 21
	ldd	[%o1 + 0x8], %f4
	faligndata %f0, %f2, %f32
	ldd	[%o1 + 0x10], %f6
	faligndata %f2, %f4, %f34
	ldd	[%o1 + 0x18], %f8
	faligndata %f4, %f6, %f36
	ldd	[%o1 + 0x20], %f10
        or	%g0, -8, %o5		! if %o3 >= 0, %o5 = -8
	prefetch [%o1 + (4 * BLOCK_SIZE)], 21
	faligndata %f6, %f8, %f38
	ldd	[%o1 + 0x28], %f12
	movrlz	%o3, %g0, %o5		! if %o3 < 0, %o5 = 0  (needed later)
	faligndata %f8, %f10, %f40
	ldd	[%o1 + 0x30], %f14
	faligndata %f10, %f12, %f42
	ldd	[%o1 + 0x38], %f0
	sub	%o2, BLOCK_SIZE, %o2	! update count
	prefetch [%o1 + (5 * BLOCK_SIZE)], 21
	add	%o1, BLOCK_SIZE, %o1		! update SRC

	! Main loop.  Write previous block.  Load rest of current block.
	! Some bytes will be loaded that won't yet be written.
1:	
	ldd	[%o1], %f2
	faligndata %f12, %f14, %f44
	ldd	[%o1 + 0x8], %f4
	faligndata %f14, %f0, %f46
	std	%f32, [%o0]
	std	%f34, [%o0+8]
	std	%f36, [%o0+16]
	std	%f38, [%o0+24]
	std	%f40, [%o0+32]
	std	%f42, [%o0+40]
	std	%f44, [%o0+48]
	std	%f46, [%o0+56]
	sub	%o2, BLOCK_SIZE, %o2		! update count
	prefetch [%o0 + (6 * BLOCK_SIZE)], 22
	prefetch [%o0 + (3 * BLOCK_SIZE)], 22
	add	%o0, BLOCK_SIZE, %o0		! update DST
	ldd	[%o1 + 0x10], %f6
	faligndata %f0, %f2, %f32
	ldd	[%o1 + 0x18], %f8
	faligndata %f2, %f4, %f34
	ldd	[%o1 + 0x20], %f10
	faligndata %f4, %f6, %f36
	ldd	[%o1 + 0x28], %f12
	faligndata %f6, %f8, %f38
	ldd	[%o1 + 0x30], %f14
	faligndata %f8, %f10, %f40
	ldd	[%o1 + 0x38], %f0
	faligndata %f10, %f12, %f42
	cmp	%o2, BLOCK_SIZE + 8
	prefetch [%o1 + (5 * BLOCK_SIZE)], 21
	bgu,pt	%ncc, 1b
	add	%o1, BLOCK_SIZE, %o1	! update SRC
	faligndata %f12, %f14, %f44
	faligndata %f14, %f0, %f46
	stda	%f32, [%o0]ASI_BLK_P		! store 64 bytes, bypass cache
	cmp	%o2, BLOCK_SIZE		
	bne	%ncc, 2f		! exactly 1 block remaining?
	add	%o0, BLOCK_SIZE, %o0	! update DST
	brz,a	%o3, 3f			! is SRC double aligned?
	ldd	[%o1], %f2

2:	
	add	%o5, %o2, %o5		! %o5 was already set to 0 or -8 
	add	%o5, %o3, %o5

	membar	#StoreLoad|#StoreStore

	ba	.beginmedloop
	andn	%o5, 7, %o5		! 8 byte aligned count


	! This is when there is exactly 1 block remaining and SRC is aligned
3:
	ldd	[%o1 + 0x8], %f4
	ldd	[%o1 + 0x10], %f6
	fsrc1	%f0, %f32
	ldd	[%o1 + 0x18], %f8
	fsrc1	%f2, %f34
	ldd	[%o1 + 0x20], %f10
	fsrc1	%f4, %f36
	ldd	[%o1 + 0x28], %f12
	fsrc1	%f6, %f38
	ldd	[%o1 + 0x30], %f14
	fsrc1	%f8, %f40
	fsrc1	%f10, %f42
	fsrc1	%f12, %f44
	fsrc1	%f14, %f46
	stda	%f32, [%o0]ASI_BLK_P
	membar	#StoreLoad|#StoreStore
	wr	%o4, 0, %fprs
	retl
	mov	%g1, %o0


	.align 16
	! two nops here cause the loop starting at 1f below to begin
	! on a cache line boundary, improving performance
	nop
	nop
.xlarge:
	! %o0 I/O DST is 64-byte aligned
	! %o1 I/O 8-byte aligned (and we've set GSR.ALIGN)
	! %d0 I/O already loaded with SRC data from [%o1-8]
	! %o2 I/O count (number of bytes that need to be written)
	! %o3 I   Not written.  If zero, then SRC is double aligned.
	! %o4 I   Not written.  Holds fprs.
	! %o5   O The number of doubles that remain to be written.

	! Load the rest of the current block 
	! Recall that %o1 is further into SRC than %o0 is into DST

	! prefetch [%o1 + (3 * BLOCK_SIZE)], 21
	! executed in delay slot for branch to .xlarge
	prefetch [%o1 + (4 * BLOCK_SIZE)], 21
	prefetch [%o1 + (5 * BLOCK_SIZE)], 21
	ldd	[%o1], %f2
	prefetch [%o1 + (6 * BLOCK_SIZE)], 21
	ldd	[%o1 + 0x8], %f4
	faligndata %f0, %f2, %f32
	ldd	[%o1 + 0x10], %f6
	faligndata %f2, %f4, %f34
	ldd	[%o1 + 0x18], %f8
	faligndata %f4, %f6, %f36
	ldd	[%o1 + 0x20], %f10
        or	%g0, -8, %o5		! if %o3 >= 0, %o5 = -8
	faligndata %f6, %f8, %f38
	ldd	[%o1 + 0x28], %f12
	movrlz	%o3, %g0, %o5		! if %o3 < 0, %o5 = 0  (needed later)
	faligndata %f8, %f10, %f40
	ldd	[%o1 + 0x30], %f14
	faligndata %f10, %f12, %f42
	ldd	[%o1 + 0x38], %f0
	sub	%o2, BLOCK_SIZE, %o2	! update count
	prefetch [%o1 + (7 * BLOCK_SIZE)], 21
	add	%o1, BLOCK_SIZE, %o1	! update SRC

	! This point is 32-byte aligned, since 24 instructions have been
	! issued since the previous alignment directive.
	

	! Main loop.  Write previous block.  Load rest of current block.
	! Some bytes will be loaded that won't yet be written.
1:
	ldd	[%o1], %f2
	faligndata %f12, %f14, %f44
	ldd	[%o1 + 0x8], %f4
	faligndata %f14, %f0, %f46
	stda	%f32, [%o0]ASI_BLK_P
	sub	%o2, BLOCK_SIZE, %o2		! update count
	ldd	[%o1 + 0x10], %f6
	faligndata %f0, %f2, %f32
	ldd	[%o1 + 0x18], %f8
	faligndata %f2, %f4, %f34
	ldd	[%o1 + 0x20], %f10
	faligndata %f4, %f6, %f36
	ldd	[%o1 + 0x28], %f12
	faligndata %f6, %f8, %f38
	ldd	[%o1 + 0x30], %f14
	faligndata %f8, %f10, %f40
	ldd	[%o1 + 0x38], %f0
	faligndata %f10, %f12, %f42
	! offset of 8*BLK+8 bytes works best over range of (src-dst) mod 1K
	prefetch [%o1 + (8 * BLOCK_SIZE) + 8], 21
	add	%o0, BLOCK_SIZE, %o0		! update DST
	cmp	%o2, BLOCK_SIZE + 8
	! second prefetch important to correct for occasional dropped
	! initial prefetches, 5*BLK works best over range of (src-dst) mod 1K
	! strong prefetch prevents drops on Panther, but Jaguar and earlier
	! US-III models treat strong prefetches as weak prefetches
	! to avoid regressions on customer hardware, we retain the prefetch
	prefetch [%o1 + (5 * BLOCK_SIZE)], 21
	bgu,pt	%ncc, 1b
	add	%o1, BLOCK_SIZE, %o1		! update SRC

	faligndata %f12, %f14, %f44
	faligndata %f14, %f0, %f46
	stda	%f32, [%o0]ASI_BLK_P		! store 64 bytes, bypass cache
	cmp	%o2, BLOCK_SIZE		
	bne	%ncc, 2f		! exactly 1 block remaining?
	add	%o0, BLOCK_SIZE, %o0	! update DST
	brz,a	%o3, 3f			! is SRC double aligned?
	ldd	[%o1], %f2

2:	
	add	%o5, %o2, %o5		! %o5 was already set to 0 or -8 
	add	%o5, %o3, %o5

	membar	#StoreLoad|#StoreStore

	ba	.beginmedloop
	andn	%o5, 7, %o5		! 8 byte aligned count


	! This is when there is exactly 1 block remaining and SRC is aligned
3:
	ldd	[%o1 + 0x8], %f4
	ldd	[%o1 + 0x10], %f6
	fsrc1	%f0, %f32
	ldd	[%o1 + 0x18], %f8
	fsrc1	%f2, %f34
	ldd	[%o1 + 0x20], %f10
	fsrc1	%f4, %f36
	ldd	[%o1 + 0x28], %f12
	fsrc1	%f6, %f38
	ldd	[%o1 + 0x30], %f14
	fsrc1	%f8, %f40
	fsrc1	%f10, %f42
	fsrc1	%f12, %f44
	fsrc1	%f14, %f46
	stda	%f32, [%o0]ASI_BLK_P
	membar	#StoreLoad|#StoreStore
	wr	%o4, 0, %fprs
	retl
	mov	%g1, %o0
	
	SET_SIZE(memcpy)