1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
|
/*
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
* Common Development and Distribution License (the "License").
* You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* or http://www.opensolaris.org/os/licensing.
* See the License for the specific language governing permissions
* and limitations under the License.
*
* When distributing Covered Code, include this CDDL HEADER in each
* file and include the License file at usr/src/OPENSOLARIS.LICENSE.
* If applicable, add the following below this CDDL HEADER, with the
* fields enclosed by brackets "[]" replaced with your own identifying
* information: Portions Copyright [yyyy] [name of copyright owner]
*
* CDDL HEADER END
*/
/*
* Copyright (c) 2009, Intel Corporation
* All rights reserved.
*/
/*
* strlen - calculate the length of a string
*/
#include "SYS.h"
#include "proc64_id.h"
/*
 * LABEL(s) builds a function-private label name by gluing the prefix
 * ".strlen" onto the argument.  The empty comment between the two parts
 * is used as a token separator that is expected to vanish during
 * preprocessing.
 * NOTE(review): this relies on the preprocessor removing the comment
 * without inserting whitespace (traditional-cpp behavior) -- confirm
 * against the build flags used for this file.
 */
#define LABEL(s) .strlen/**/s
/*
* This implementation uses SSE instructions to compare up to 16 bytes
* at a time looking for the end of string (null char).
*/
/*
 * strlen(const char *s)
 *
 * In:       %rdi = s
 * Out:      %rax = number of bytes preceding the first null byte of s
 * Modifies: %rcx, %rdx, %rsi, %rdi, %xmm0, flags
 *
 * Register roles inside the routine:
 *   %rdi  - original string pointer (negated at the exit label)
 *   %rsi  - scan pointer; it is always advanced by 16 BEFORE the result of
 *           a compare is tested, so at exit it points 16 bytes past the
 *           chunk that produced the match
 *   %rcx  - s & 15, the misalignment of s (later reused for the bsf index)
 *   %edx  - 16-bit mask from pmovmskb, one bit per byte that equalled zero
 *   %xmm0 - all-zero comparand going into each pcmpeqb, per-byte compare
 *           result coming out of it
 *
 * All memory reads are 16-byte loads from 16-byte aligned addresses.
 */
ENTRY(strlen) /* (const char *s) */
mov %rdi, %rsi /* keep original %rdi value */
mov %rsi, %rcx /* %rcx = s, reduced to s & 15 below */
pxor %xmm0, %xmm0 /* 16 null chars */
and $15, %rcx /* %rcx = offset of s within its 16-byte chunk */
jz LABEL(align16_loop) /* string is 16 byte aligned */
/*
 * Unaligned case. Round down to 16-byte boundary before comparing
 * 16 bytes for a null char. The code then compensates for any extra chars
 * preceding the start of the string.
 */
LABEL(unalign16):
and $0xfffffffffffffff0, %rsi /* %rsi = s rounded down to a 16-byte boundary */
pcmpeqb (%rsi), %xmm0 /* compare the aligned chunk containing s against zero */
lea 16(%rdi), %rsi /* %rsi = s + 16, so the "-16" at exit yields a base of 0 */
pmovmskb %xmm0, %edx /* %edx = one bit per byte of the chunk that was null */
shr %cl, %edx /* Compensate for bytes preceding the string */
test %edx, %edx
jnz LABEL(exit) /* null found within the first (16 - %rcx) bytes of s */
sub %rcx, %rsi /* no null, adjust to next 16-byte boundary */
/*
 * The bits shifted out of %edx above may have been set (null bytes located
 * before s), in which case %xmm0 is not all-zero even though %edx is.
 * Reset it so the loop below compares against zero again.
 */
pxor %xmm0, %xmm0 /* clear xmm0, may have been changed... */
.p2align 4
/*
 * Main loop, unrolled four times. Each step compares 16 aligned bytes
 * against %xmm0 and leaves the result in %xmm0. Whenever execution falls
 * through to the next step the mask in %edx was zero, which means every
 * byte of %xmm0 is zero, so it can be reused as the comparand without
 * being cleared.
 */
LABEL(align16_loop): /* 16 byte aligned */
pcmpeqb (%rsi), %xmm0 /* look for null bytes */
pmovmskb %xmm0, %edx /* move each byte mask of %xmm0 to edx */
add $16, %rsi /* prepare to search next 16 bytes */
test %edx, %edx /* if no null byte, %edx must be 0 */
jnz LABEL(exit) /* found a null */
pcmpeqb (%rsi), %xmm0 /* second chunk of this iteration */
pmovmskb %xmm0, %edx
add $16, %rsi
test %edx, %edx
jnz LABEL(exit)
pcmpeqb (%rsi), %xmm0 /* third chunk of this iteration */
pmovmskb %xmm0, %edx
add $16, %rsi
test %edx, %edx
jnz LABEL(exit)
pcmpeqb (%rsi), %xmm0 /* fourth chunk of this iteration */
pmovmskb %xmm0, %edx
add $16, %rsi
test %edx, %edx
jz LABEL(align16_loop) /* nothing found: next four chunks; else fall into exit */
.p2align 4
/*
 * A null byte was found. On arrival here:
 *   %edx = non-zero mask, lowest set bit = index of the first null byte
 *          within the chunk (or relative to s in the unaligned case)
 *   %rsi = 16 past the base that the mask is relative to
 * Result = (%rsi - 16 - s) + index of lowest set bit in %edx.
 */
LABEL(exit):
neg %rdi /* %rdi = -s, so (%rdi + %rsi) below is %rsi - s */
/*
 * Check to see if BSF is fast on this processor. If not, use a different
 * exit tail to find first bit set indicating null byte match.
 * USE_BSF and .memops_method are not defined in this file.
 * NOTE(review): presumably they come from proc64_id.h and are initialized
 * by processor-identification code elsewhere -- confirm.
 */
testl $USE_BSF, .memops_method(%rip)
jz LABEL(AMD_exit)
lea -16(%rdi, %rsi), %rax /* calculate exact offset */
bsf %edx, %ecx /* Least significant 1 bit is index of null */
lea (%rax, %rcx),%rax /* length = chunk offset + index within chunk */
ret
/*
 * This exit tail does not use the bsf instruction.
 * It finds the lowest set bit of the 16-bit mask by testing %dl (bytes 0-7)
 * and, if that is zero, %dh (bytes 8-15) one bit at a time. The bit number
 * found is added to %rax by the matching exit_tailN stub.
 */
.p2align 4
LABEL(AMD_exit):
lea -16(%rdi, %rsi), %rax /* %rax = offset of the chunk base from s */
test %dl, %dl
jz LABEL(exit_high) /* no match in bytes 0-7, look at bytes 8-15 */
test $0x01, %dl
jnz LABEL(exit_tail0)
test $0x02, %dl
jnz LABEL(exit_tail1)
/*
 * NOTE(review): this alignment directive sits in the middle of the
 * straight-line bit tests, so any padding it emits is executed on the
 * fall-through path. Looks like deliberate layout tuning -- confirm.
 */
.p2align 4
test $0x04, %dl
jnz LABEL(exit_tail2)
test $0x08, %dl
jnz LABEL(exit_tail3)
test $0x10, %dl
jnz LABEL(exit_tail4)
test $0x20, %dl
jnz LABEL(exit_tail5)
test $0x40, %dl
jnz LABEL(exit_tail6)
add $7, %rax /* %dl != 0 and bits 0-6 clear, so bit 7 is the match */
ret
.p2align 4
/*
 * The match is in bytes 8-15. %dh is non-zero here because %edx was
 * non-zero at exit, holds at most 16 bits, and %dl tested as zero.
 */
LABEL(exit_high):
add $8, %rax /* bias the result by 8, then reuse the same tail stubs */
test $0x01, %dh
jnz LABEL(exit_tail0)
test $0x02, %dh
jnz LABEL(exit_tail1)
test $0x04, %dh
jnz LABEL(exit_tail2)
test $0x08, %dh
jnz LABEL(exit_tail3)
test $0x10, %dh
jnz LABEL(exit_tail4)
test $0x20, %dh
jnz LABEL(exit_tail5)
test $0x40, %dh
jnz LABEL(exit_tail6)
add $7, %rax /* bits 8-14 clear, so bit 15 is the match */
ret
.p2align 4
/*
 * Tail stubs: exit_tailN adds N to %rax and returns. They are shared by the
 * low-byte and high-byte paths above.
 */
LABEL(exit_tail0):
/*
 * Bit 0 matched, so %rax already holds the final length.
 * NOTE(review): the xor below only zeroes %ecx and has no effect on the
 * value returned in %rax; it looks vestigial -- confirm before removing.
 */
xor %ecx, %ecx
ret
.p2align 4
LABEL(exit_tail1):
add $1, %rax
ret
.p2align 4
LABEL(exit_tail2):
add $2, %rax
ret
.p2align 4
LABEL(exit_tail3):
add $3, %rax
ret
.p2align 4
LABEL(exit_tail4):
add $4, %rax
ret
.p2align 4
LABEL(exit_tail5):
add $5, %rax
ret
.p2align 4
LABEL(exit_tail6):
add $6, %rax
ret
SET_SIZE(strlen)
|