summaryrefslogtreecommitdiff
path: root/usr/src/lib/libc/amd64/gen/strlen.s
blob: 3b412356783b802a28e94125904ddf0d4e0c6909 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright (c) 2009, Intel Corporation
 * All rights reserved.
 */

/*
 *	strlen - calculate the length of a null-terminated string
 */

#include "SYS.h"
#include "proc64_id.h"

/*
 * LABEL(s) builds a label name from the fixed prefix ".strlen" and the
 * suffix s, so every label in this file shares that prefix.
 *
 * The two halves are separated only by an empty comment in the macro body.
 * They become a single symbol only when the preprocessor deletes that
 * comment without leaving whitespace in its place (the pre-ANSI way of
 * pasting tokens). NOTE(review): presumably this file is preprocessed in
 * traditional mode - confirm against the build rules before changing it.
 */
#define LABEL(s) .strlen/**/s

	/*
	 * size_t strlen(const char *s)
	 *
	 * This implementation uses SSE instructions to compare up to 16 bytes
	 * at a time looking for the end of string (null char).
	 *
	 * In:	%rdi = s
	 * Out:	%rax = number of bytes that precede the first null byte
	 *
	 * Registers written: %rax, %rcx, %rdx, %rsi, %rdi, %xmm0 and the flags.
	 * No instruction in this body touches %rsp or stores to memory.
	 *
	 * Register roles:
	 *	%rdi	s; unchanged until LABEL(exit), where it is negated
	 *	%rsi	scan pointer; on every path into LABEL(exit), %rsi - 16
	 *		is the address that bit 0 of %edx refers to
	 *	%rcx	s & 15 (distance from the previous 16-byte boundary to
	 *		s); reused as the bsf result in the BSF exit tail
	 *	%edx	pmovmskb result: bit i is set when byte i compared
	 *		equal to zero
	 *	%xmm0	all zero before each compare, compare result afterwards
	 *
	 * USE_BSF and .memops_method are not defined in this file; presumably
	 * they come from proc64_id.h - confirm.
	 */
	ENTRY(strlen)			/* (const char *s) */
	mov	%rdi, %rsi		/* %rsi = scan pointer; %rdi keeps s */
	mov	%rsi, %rcx
	pxor	%xmm0, %xmm0		/* 16 null chars */
	and	$15, %rcx		/* %rcx = s & 15; ZF set if aligned */
	jz	LABEL(align16_loop)	/* string is 16 byte aligned */

	/*
	 * Unaligned case. Round down to the 16-byte boundary before comparing
	 * 16 bytes for a null char, so the load covers only the aligned
	 * 16-byte chunk that already contains s[0]. The code then compensates
	 * for the extra chars that precede the start of the string by shifting
	 * their mask bits away.
	 */
LABEL(unalign16):
	and	$0xfffffffffffffff0, %rsi	/* %rsi = s rounded down to 16 */

	pcmpeqb	(%rsi), %xmm0		/* 0xff in each byte that is null */
	lea	16(%rdi), %rsi		/* %rsi = s + 16: exit then yields base 0 */
	pmovmskb %xmm0, %edx

	shr	%cl, %edx		/* Compensate for bytes preceding the string */
	test	%edx, %edx		/* after the shift, bit 0 refers to s[0] */
	jnz	LABEL(exit)
	sub	%rcx, %rsi		/* no null, adjust to next 16-byte boundary */
	pxor	%xmm0, %xmm0		/* clear xmm0: a null before s may have set bytes */

	/*
	 * Aligned main loop, unrolled four times.
	 *
	 * On entry %rsi is 16-byte aligned and %xmm0 is all zero. pcmpeqb
	 * leaves %xmm0 all zero again whenever no byte matched, so the
	 * register is reused as the zero vector without being re-cleared.
	 * %rsi is advanced before the mask is tested, so at LABEL(exit) it
	 * points 16 bytes past the start of the chunk that holds the null.
	 */
	.p2align 4
LABEL(align16_loop):			/* 16 byte aligned */
	pcmpeqb	(%rsi), %xmm0		/* look for null bytes */
	pmovmskb %xmm0, %edx		/* move each byte mask of %xmm0 to edx */

	add	$16, %rsi		/* prepare to search next 16 bytes */
	test	%edx, %edx		/* if no null byte, %edx must be 0 */
	jnz	LABEL(exit)		/* found a null */

	pcmpeqb	(%rsi), %xmm0		/* second chunk, same pattern */
	pmovmskb %xmm0, %edx
	add	$16, %rsi
	test	%edx, %edx
	jnz	LABEL(exit)

	pcmpeqb	(%rsi), %xmm0		/* third chunk */
	pmovmskb %xmm0, %edx
	add	$16, %rsi
	test	%edx, %edx
	jnz	LABEL(exit)

	pcmpeqb	(%rsi), %xmm0		/* fourth chunk */
	pmovmskb %xmm0, %edx
	add	$16, %rsi
	test	%edx, %edx
	jz	LABEL(align16_loop)	/* no null: loop; otherwise fall into exit */

	/*
	 * A null byte was found: %edx is non-zero and %rsi - 16 is the
	 * address that bit 0 of %edx refers to.
	 */
	.p2align 4
LABEL(exit):
	neg	%rdi			/* %rdi = -s, so lea below forms %rsi - 16 - s */
	/*
	 * Check to see if BSF is fast on this processor. If not, use a different
	 * exit tail to find first bit set indicating null byte match.
	 */
	testl	$USE_BSF, .memops_method(%rip)
	jz	LABEL(AMD_exit)		/* USE_BSF bit clear: use the bit-test tail */

	lea	-16(%rdi, %rsi), %rax	/* calculate exact offset */
	bsf	%edx, %ecx		/* Least significant 1 bit is index of null */
	lea	(%rax, %rcx),%rax	/* length = base offset + index of the null */
	ret

	/*
	 * This exit tail does not use the bsf instruction. It tests the mask
	 * one bit at a time, lowest bit first, and adds the index of the
	 * first set bit to the base offset in %rax.
	 */
	.p2align 4
LABEL(AMD_exit):
	lea	-16(%rdi, %rsi), %rax	/* %rax = offset of mask bit 0 from s */
	test	%dl, %dl		/* any null among bytes 0..7? */
	jz	LABEL(exit_high)	/* no: it is among bytes 8..15 */
	test	$0x01, %dl
	jnz	LABEL(exit_tail0)

	test	$0x02, %dl
	jnz	LABEL(exit_tail1)

	/*
	 * NOTE(review): this alignment directive sits in straight-line code
	 * with no label after it, so any padding it emits is on the
	 * fall-through path. Presumably deliberate tuning - confirm.
	 */
	.p2align 4
	test	$0x04, %dl
	jnz	LABEL(exit_tail2)

	test	$0x08, %dl
	jnz	LABEL(exit_tail3)

	test	$0x10, %dl
	jnz	LABEL(exit_tail4)

	test	$0x20, %dl
	jnz	LABEL(exit_tail5)

	test	$0x40, %dl
	jnz	LABEL(exit_tail6)
	add	$7, %rax		/* %dl != 0 and bits 0..6 clear: bit 7 */
	ret

	/*
	 * %dl was zero, so the first set bit is in %dh (mask bits 8..15).
	 * Bias %rax by 8 and reuse the same tails as the low half.
	 */
	.p2align 4
LABEL(exit_high):
	add	$8, %rax
	test	$0x01, %dh
	jnz	LABEL(exit_tail0)

	test	$0x02, %dh
	jnz	LABEL(exit_tail1)

	test	$0x04, %dh
	jnz	LABEL(exit_tail2)

	test	$0x08, %dh
	jnz	LABEL(exit_tail3)

	test	$0x10, %dh
	jnz	LABEL(exit_tail4)

	test	$0x20, %dh
	jnz	LABEL(exit_tail5)

	test	$0x40, %dh
	jnz	LABEL(exit_tail6)
	add	$7, %rax		/* bits 8..14 clear: the null is byte 15 */
	ret

	/*
	 * Shared tails: exit_tailN returns %rax + N. They are reached from
	 * both AMD_exit (bytes 0..7) and exit_high (bytes 8..15, with %rax
	 * already biased by 8).
	 */
	.p2align 4
LABEL(exit_tail0):
	/*
	 * %rax already holds the length here. The xor below changes only
	 * %rcx and the flags, not the returned value.
	 * NOTE(review): its purpose is not evident from this file (possibly
	 * it keeps the jump target from being a bare ret) - confirm before
	 * removing it.
	 */
	xor	%ecx, %ecx
	ret

	.p2align 4
LABEL(exit_tail1):
	add	$1, %rax
	ret

	.p2align 4
LABEL(exit_tail2):
	add	$2, %rax
	ret

	.p2align 4
LABEL(exit_tail3):
	add	$3, %rax
	ret

	.p2align 4
LABEL(exit_tail4):
	add	$4, %rax
	ret

	.p2align 4
LABEL(exit_tail5):
	add	$5, %rax
	ret

	.p2align 4
LABEL(exit_tail6):
	add	$6, %rax
	ret
	SET_SIZE(strlen)