/*
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
* Common Development and Distribution License (the "License").
* You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* or http://www.opensolaris.org/os/licensing.
* See the License for the specific language governing permissions
* and limitations under the License.
*
* When distributing Covered Code, include this CDDL HEADER in each
* file and include the License file at usr/src/OPENSOLARIS.LICENSE.
* If applicable, add the following below this CDDL HEADER, with the
* fields enclosed by brackets "[]" replaced with your own identifying
* information: Portions Copyright [yyyy] [name of copyright owner]
*
* CDDL HEADER END
*/
/*
* Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
.file "memcpy.s"
#include <sys/asm_linkage.h>
ANSI_PRAGMA_WEAK(memmove,function)
ANSI_PRAGMA_WEAK(memcpy,function)
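/
/ memmove(dst, src, len): if the forward copy is non-destructive
/ (dst <= src, or dst starts past the last source byte), fall through
/ to the forward memcpy path at .memcpy_post; otherwise the regions
/ overlap destructively and the copy is done backwards via .CopyLeft.
/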
ENTRY(memmove)
movl 0+12(%esp),%ecx / get number of bytes to move
pushl %esi / save off %edi, %esi and move destination
pushl %edi
movl 8+ 4(%esp),%edi / destination buffer address
movl 8+ 8(%esp),%esi / source buffer address
movl %edi, %eax
testl %ecx,%ecx
jz .Return
cmpl %esi,%edi / if (source addr > dest addr)
leal -1(%esi,%ecx),%edx / %edx = src + size - 1
jbe .memcpy_post / jump if dst <= src
cmpl %edx,%edi
jbe .CopyLeft / jump if dst <= src + size - 1
jmp .memcpy_post
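/
/ memcpy(dst, src, len): copies of fewer than 64 bytes are done with
/ rep smovl plus an unrolled byte tail at .movew; larger copies go
/ through the SSE path at .move_sse. %eax holds the destination
/ address, which is also the return value.
/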
ENTRY(memcpy)
pushl %esi
pushl %edi
movl 8+4(%esp),%edi / %edi = dest address
movl %edi, %eax / save this
movl 8+8(%esp),%esi / %esi = source address
movl 8+12(%esp),%ecx / %ecx = length of string
/ %edx scratch register
/ %eax scratch register
.memcpy_post:
nop / this really helps, don't know why
/ note: cld is perf death on P4
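/ the i386 ABI requires DF to be clear (forward) at function entry,
/ so it is safe as well as faster to skip cld before the string moves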
cmpl $63,%ecx
ja .move_sse / not worth doing sse for less
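/
/ .movew: move the bulk as dwords with rep smovl, then finish the
/ remaining 0-3 bytes with the unrolled byte moves below
/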
.movew:
movl %ecx,%edx / save byte cnt
shrl $2,%ecx / %ecx = number of words to move
rep ; smovl / move the words
andl $0x3,%edx / %edx = number of bytes left to move
jz .Return / %edx <= 3, so just unroll the loop
movb (%esi), %cl
movb %cl, (%edi)
decl %edx
jz .Return
movb 1(%esi), %cl
movb %cl, 1(%edi)
decl %edx
jz .Return
movb 2(%esi), %cl
movb %cl, 2(%edi)
.Return:
popl %edi / restore register variables
popl %esi
ret
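/
/ SSE path for copies of 64 bytes or more: first bring the destination
/ up to a 16 byte boundary, then move 64 bytes per iteration using one
/ of four loops chosen by source alignment and total size; the final
/ sub-64-byte remainder is finished by jumping back to .movew
/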
.move_sse:
/
/ time to 16 byte align destination
/
andl $15, %eax
jnz .sse_unaligned / jmp if dest is unaligned
.sse: / dest is aligned, check source
movl %ecx, %edx / get byte count
shrl $6, %edx / number of 64 byte blocks to move
testl $15, %esi
jnz .sse_da / go to slow loop if source is unaligned
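/ copies of 64KB or more use the non-temporal store loop below;
/ movntps bypasses the cache, presumably to keep a huge copy from
/ displacing useful cache lines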
cmpl $65535, %ecx
ja .sse_sa_nt_loop
/
/ use aligned load since we're lucky
/
.sse_sa_loop:
prefetcht0 568(%esi) / prefetch source; copy 64 bytes at a time
prefetcht0 568(%edi) / prefetch destination as well
movaps 0(%esi), %xmm0
movaps %xmm0, 0(%edi)
movaps 16(%esi), %xmm1
movaps %xmm1, 16(%edi)
movaps 32(%esi), %xmm2
movaps %xmm2, 32(%edi)
movaps 48(%esi), %xmm3
movaps %xmm3, 48(%edi)
addl $64, %esi
addl $64, %edi
decl %edx
jnz .sse_sa_loop
.sse_cleanup:
andl $63, %ecx / compute remaining bytes
movl 8+4(%esp), %eax / setup return value
jz .Return
jmp .movew
/
/ aligned loads, with non-temporal stores for the large-copy case
/
.align 16
.sse_sa_nt_loop:
prefetchnta 16384(%esi) / prefetch source & copy 64 bytes at a time
movaps (%esi), %xmm0
movntps %xmm0, 0(%edi)
movaps 16(%esi), %xmm1
movntps %xmm1, 16(%edi)
movaps 32(%esi), %xmm2
movntps %xmm2, 32(%edi)
movaps 48(%esi), %xmm3
movntps %xmm3, 48(%edi)
addl $64, %esi
addl $64, %edi
decl %edx
jnz .sse_sa_nt_loop
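/ non-temporal stores are weakly ordered; fence so the copied data is
/ globally visible before returning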
#if defined(_SSE2_INSN)
mfence
#elif defined(_SSE_INSN)
sfence
#else
#error "Must have either SSE or SSE2"
#endif
jmp .sse_cleanup
/
/ Make certain that destination buffer becomes aligned
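/ (%eax holds dst & 15, known nonzero here; the neg/and pair below
/ turns it into 16 - (dst & 15), the number of bytes to copy before
/ the destination reaches the next 16 byte boundary; the xchg pair
/ moves that count into %ecx for rep smovb and back again)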
/
.sse_unaligned:
neg %eax / subtract from 16 and get destination
andl $15, %eax / aligned on a 16 byte boundary
movl %ecx, %edx / saved count
subl %eax, %ecx / subtract from byte count
cmpl $64, %ecx / after aligning, will we still have 64 bytes?
cmovb %edx, %ecx / if not, restore original byte count,
cmovb 8+4(%esp), %eax / and restore return value,
jb .movew / and do a non-SSE move.
xchg %ecx, %eax / flip for copy
rep ; smovb / move the bytes
xchg %ecx, %eax / flip back
jmp .sse
.align 16
.sse_da:
cmpl $65535, %ecx
jbe .sse_da_loop
/
/ source doesn't line up: unaligned loads, non-temporal stores
/
.sse_da_nt_loop:
prefetchnta 16384(%esi) / prefetch source & copy 64 bytes at a time
movups 0(%esi), %xmm0
movntps %xmm0, 0(%edi)
movups 16(%esi), %xmm1
movntps %xmm1, 16(%edi)
movups 32(%esi), %xmm2
movntps %xmm2, 32(%edi)
movups 48(%esi), %xmm3
movntps %xmm3, 48(%edi)
addl $64, %esi
addl $64, %edi
decl %edx
jnz .sse_da_nt_loop
#if defined(_SSE2_INSN)
mfence
#elif defined(_SSE_INSN)
sfence
#else
#error "Must have either SSE or SSE2"
#endif
jmp .sse_cleanup
/
/ use unaligned load since source doesn't line up
/
.align 16
.sse_da_loop:
prefetcht0 568(%esi) / prefetch source & copy 64 bytes at a time
prefetcht0 568(%edi)
movups 0(%esi), %xmm0
movaps %xmm0, 0(%edi)
movups 16(%esi), %xmm1
movaps %xmm1, 16(%edi)
movups 32(%esi), %xmm2
movaps %xmm2, 32(%edi)
movups 48(%esi), %xmm3
movaps %xmm3, 48(%edi)
addl $64, %esi
addl $64, %edi
decl %edx
jnz .sse_da_loop
jmp .sse_cleanup
SET_SIZE(memcpy)
/ .CopyLeft handles the memmove case where we must perform the copy backwards,
/ because of overlap between src and dst. This is not particularly optimized.
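/ Entry conditions from memmove: %edi = dst, %ecx = size,
/ %edx = src + size - 1. std sets the direction flag so the string
/ instructions below walk from high addresses to low.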
.CopyLeft:
movl $3,%eax / heavily used constant
std / reverse direction bit (RtoL)
cmpl $12,%ecx / if (size <= 12)
ja .BigCopyLeft / {
movl %edx,%esi / src = src + size - 1
leal -1(%ecx,%edi),%edi / dst = dst + size - 1
rep; smovb / do the byte copy
cld / reset direction flag to LtoR
popl %edi / }
popl %esi / restore registers
movl 4(%esp),%eax / set up return value
ret / return(dba);
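/ For sizes above 12 bytes: byte-copy downward until the unread source
/ region ends on a 4 byte boundary, move the bulk as dwords with
/ rep smovl (still high to low), then finish the last 0-3 bytes at the
/ bottom of the buffer with rep smovb.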
.BigCopyLeft: / } else {
xchgl %edx,%ecx
movl %ecx,%esi / align source w/byte copy
leal -1(%edx,%edi),%edi
andl %eax,%ecx
jz .SkipAlignLeft
addl $1, %ecx / we need to ensure that the future
subl %ecx,%edx / copy is done on an aligned boundary
rep; smovb
.SkipAlignLeft:
movl %edx,%ecx
subl %eax,%esi
shrl $2,%ecx / do 4 byte copy RtoL
subl %eax,%edi
rep; smovl
andl %eax,%edx / do 1 byte copy of what's left
jz .CleanupReturnLeft
movl %edx,%ecx
addl %eax,%esi / rep; smovl instruction will decrement
addl %eax,%edi / %edi, %esi by four after each copy
/ adding 3 will restore pointers to byte
/ before last double word copied
/ which is where they are expected to
/ be for the single byte copy code
rep; smovb
.CleanupReturnLeft:
cld / reset direction flag to LtoR
popl %edi
popl %esi / restore registers
movl 4(%esp),%eax / set up return value
ret / return(dba);
SET_SIZE(memmove)