1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
|
@ file core_asm.s
@ core asm routines
@ author cearn
@ Modified by Legolas for fpc4gba use
@
@ === NOTES ===
@ * 20050924: Lower overhead for all; reduced i-count for u16 loops.
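@ * These are 16/32bit memset and memcpy. The 32bit versions are meant to
@ live in iwram for maximum effect and pretty much do what CpuFastSet does,
@ except that they also work for word counts that are not multiples of 8.
@ Speed is as good as CpuFastSet, but with a little less overhead.
@ NOTE: in this fpc4gba version the .iwram section directives are
@ commented out and the 32bit routines are assembled into .text instead.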
@ * The 16bit versions call the 32bit ones if possible and/or desirable.
@ They are thumb/ROM functions, but they were written in asm anyway because
@ GCC goes haywire with its use of registers, resulting in a much
@ higher overhead (i.e., detrimental for low counts).
@ * Crossover with inline while(nn--) loops (not for(ii++), which are
@ much slower):
@ memcpy32: ~4
@ memset32: ~5
@ memcpy16: ~8
@ memset16: ~8
.file "core_asm.as"
@ === procedure memcpy32(dest: pointer; const src: pointer; wcount: u32); ======
@ Word-wise fast copy (ARM state).
@   param dest    Destination address (r0).
@   param src     Source address (r1).
@   param wcount  Number of 32-bit words to copy (r2).
@ note: src and dest must be word aligned.
@ note: every transfer post-increments its pointer, so on return r0 and r1
@       point just past the last word written / read.
@ Register use:
@   r0, r1 : dest, src
@   r2     : wcount on entry, then wcount>>3 (count of 8-word chunks)
@   r3-r10 : data buffer (r4-r10 are saved and restored around the chunk loop)
@   r12    : wcount&7 (words left over after the chunks)
	.text							@ ?!?!?
@	.section .iwram,"ax", %progbits
	.align	2
	.code	32
	.global	memcpy32
memcpy32:
	and		r12, r2, #7				@ r12 = residual word count
	movs	r2, r2, lsr #3			@ r2  = number of 8-word chunks
	beq		.Lcpy32_rest			@ no full chunk: residual words only
	stmdb	sp!, {r4-r10}			@ r4-r10 serve as buffer; keep caller's copy

	@ --- 32-byte chunks: one 8-register load, one 8-register store ---
.Lcpy32_chunk:
	ldmia	r1!, {r3-r10}
	stmia	r0!, {r3-r10}
	subs	r2, r2, #1
	bhi		.Lcpy32_chunk			@ loop while chunks remain
	ldmia	sp!, {r4-r10}

	@ --- remaining 0-7 words, one at a time ---
.Lcpy32_rest:
	subs	r12, r12, #1			@ carry stays set while a word was left
	ldmcsia	r1!, {r3}
	stmcsia	r0!, {r3}
	bcs		.Lcpy32_rest
	bx		lr
@ === procedure memset32(dest: pointer; wd: u32; wcount: u32); =================
@ Word-wise fast fill (ARM state).
@   param dest    Destination address (r0).
@   param wd      Fill word, a value and not an address (r1).
@   param wcount  Number of 32-bit words to fill (r2).
@ note: dest must be word aligned.
@ note: every store post-increments r0, so on return r0 points just past
@       the last word written. r1 is left unchanged.
@ Register use:
@   r0     : dest
@   r1     : fill word
@   r2     : wcount on entry, then wcount>>3 (count of 8-word chunks)
@   r3-r10 : eight copies of the fill word (r4-r10 are saved and restored)
@   r12    : wcount&7 (words left over after the chunks)
	.text							@?!?!?
@	.section .iwram,"ax", %progbits
	.align	2
	.code	32
	.global	memset32
memset32:
	and		r12, r2, #7				@ r12 = residual word count
	movs	r2, r2, lsr #3			@ r2  = number of 8-word chunks
	beq		.Lset32_rest			@ no full chunk: residual words only
	stmdb	sp!, {r4-r10}			@ r4-r10 serve as buffer; keep caller's copy

	@ --- replicate the fill word into the 8-register store buffer ---
	mov		r3, r1
	mov		r4, r1
	mov		r5, r1
	mov		r6, r1
	mov		r7, r1
	mov		r8, r1
	mov		r9, r1
	mov		r10, r1

	@ --- 32-byte chunks: one 8-register store per iteration ---
.Lset32_chunk:
	stmia	r0!, {r3-r10}
	subs	r2, r2, #1
	bhi		.Lset32_chunk			@ loop while chunks remain
	ldmia	sp!, {r4-r10}

	@ --- remaining 0-7 words, one at a time ---
.Lset32_rest:
	subs	r12, r12, #1			@ carry stays set while a word was left
	stmcsia	r0!, {r1}
	bcs		.Lset32_rest
	bx		lr
@ === procedure memcpy16(dest: pointer; const src: pointer; hwcount: u32); =====
@ Copy for halfwords (Thumb state).
@ Uses memcpy32() if hwcount>5 and src and dst are aligned equally
@ (same value of address bit 1); otherwise copies halfword by halfword.
@   param dest     Destination address (r0).
@   param src      Source address (r1).
@   param hwcount  Number of halfwords to copy (r2).
@ note: dst and src must be halfword aligned.
@ note: r0 and r1 are advanced only over the part handled by the alignment
@       step and memcpy32; the halfword tail loop uses indexed addressing
@       and leaves them where they are.
@ Reglist:
@   r0, r1: dst, src
@   r2, r4: hwcount (r4 keeps the odd-halfword flag across the memcpy32 call)
@   r3: tmp; and data buffer
	.text
	.align	2
	.code	16
	.global	memcpy16
	.thumb_func
memcpy16:
	push	{r4, lr}
	@ 5 halfwords or fewer -> plain halfword copy
	cmp		r2, #5
	bls		.Ltail_cpy16
	@ unreconcilable alignment -> plain halfword copy
	@ if (dst^src)&2 -> the two can never be word aligned together
	mov		r3, r0
	eor		r3, r1
	lsl		r3, r3, #31				@ bit 1 of (dst^src) into carry
	bcs		.Ltail_cpy16			@ (dst^src)&2 : must copy by halfword
	@ src and dst have the same alignment -> word align both
	lsl		r3, r0, #31				@ bit 1 of dst (== bit 1 of src) into carry
	bcc		.Lmain_cpy16			@ bit clear : already word aligned
	@ aligning is necessary: copy 1 halfword and step both pointers
	ldrh	r3, [r1]
	strh	r3, [r0]
	add		r0, #2
	add		r1, #2
	sub		r2, r2, #1
	@ the bulk of the work is done by memcpy32
.Lmain_cpy16:
	lsl		r4, r2, #31				@ r4 = (hwcount&1)<<31 : odd halfword left?
	lsr		r2, r2, #1				@ r2 = word count for memcpy32
	ldr		r3, .Lpool_cpy16
	@ ARMv4T Thumb has no blx: bl to the local 'bx r3' trampoline so that
	@ lr holds the Thumb return address of the next instruction. A bare
	@ 'bx r3' would leave lr pointing at our caller, so memcpy32's 'bx lr'
	@ would return there directly, skipping the tail copy and the pops.
	bl		.Lcall_cpy16
	@ NOTE: r0,r1 are altered by memcpy32, but in exactly the right
	@ way, so we can use them as is.
	lsr		r2, r4, #31				@ r2 = 1 if one halfword remains, else 0
	beq		.Lend_cpy16
.Ltail_cpy16:
	sub		r2, #1
	bcc		.Lend_cpy16				@ r2 was 0, bug out
	lsl		r2, r2, #1				@ byte offset of the last halfword
	@ copy from the last halfword down to offset 0
.Lres_cpy16:
	ldrh	r3, [r1, r2]
	strh	r3, [r0, r2]
	sub		r2, r2, #2
	bcs		.Lres_cpy16				@ until the offset goes below 0
.Lend_cpy16:
	pop		{r4}
	pop		{r3}					@ saved lr; falls into the bx below to return
	@ shared 'bx r3': function return, and trampoline for the memcpy32 call
.Lcall_cpy16:
	bx		r3
	.align	2
.Lpool_cpy16:
	.word	memcpy32
@ === procedure memset16(dest: pointer; hw: u16; hwcount: u32); ================
@ Fill for halfwords (Thumb state).
@ Uses memset32() if hwcount>5; otherwise fills halfword by halfword.
@   param dest     Destination address (r0).
@   param hw       Source halfword, a value and not an address (r1).
@   param hwcount  Number of halfwords to fill (r2).
@ note: dest must be halfword aligned.
@ note: r0 is advanced only over the part handled by the alignment step and
@       memset32; the halfword tail loop uses indexed addressing and leaves
@       it where it is.
@ note: the word pattern is built as hw | (hw<<16), so the memset32 path
@       assumes the upper 16 bits of r1 are zero on entry -- TODO confirm
@       that all callers pass hw zero-extended.
@ Reglist:
@   r0, r1: dst, hw
@   r2, r4: hwcount (r4 keeps the odd-halfword flag across the memset32 call)
@   r3: tmp
	.text
	.align	2
	.code	16
	.global	memset16
	.thumb_func
memset16:
	push	{r4, lr}
	@ 5 halfwords or fewer -> plain halfword fill
	cmp		r2, #5
	bls		.Ltail_set16
	@ dst not word aligned: store 1 halfword and align
	lsl		r3, r0, #31				@ bit 1 of dst into carry
	bcc		.Lmain_set16			@ bit clear : already word aligned
	strh	r1, [r0]
	add		r0, #2
	sub		r2, r2, #1
	@ Again, memset32 does the real work
.Lmain_set16:
	lsl		r4, r1, #16
	orr		r1, r4					@ r1 = hw | (hw<<16) : fill word
	lsl		r4, r2, #31				@ r4 = (hwcount&1)<<31 : odd halfword left?
	lsr		r2, r2, #1				@ r2 = word count for memset32
	ldr		r3, .Lpool_set16
	@ ARMv4T Thumb has no blx: bl to the local 'bx r3' trampoline so that
	@ lr holds the Thumb return address of the next instruction. A bare
	@ 'bx r3' would leave lr pointing at our caller, so memset32's 'bx lr'
	@ would return there directly, skipping the tail fill and the pops.
	bl		.Lcall_set16
	@ NOTE: r0 is altered by memset32, but in exactly the right
	@ way, so we can use it as is. r1 is now doubled though.
	lsr		r2, r4, #31				@ r2 = 1 if one halfword remains, else 0
	beq		.Lend_set16
	lsr		r1, #16					@ back to the plain halfword
.Ltail_set16:
	sub		r2, #1
	bcc		.Lend_set16				@ r2 was 0, bug out
	lsl		r2, r2, #1				@ byte offset of the last halfword
	@ fill from the last halfword down to offset 0
.Lres_set16:
	strh	r1, [r0, r2]
	sub		r2, r2, #2
	bcs		.Lres_set16				@ until the offset goes below 0
.Lend_set16:
	pop		{r4}
	pop		{r3}					@ saved lr; falls into the bx below to return
	@ shared 'bx r3': function return, and trampoline for the memset32 call
.Lcall_set16:
	bx		r3
	.align	2
.Lpool_set16:
	.word	memset32
|