/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

	.file	"memset.s"

#include <sys/asm_linkage.h>

	ANSI_PRAGMA_WEAK(memset,function)
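
	/ void *memset(void *s, int c, size_t n)
	/
	/ Sets the first n bytes of s to the byte value (unsigned char)c
	/ and returns s.  With the i386 calling convention the arguments
	/ arrive on the stack: 4(%esp) = s, 8(%esp) = c, 12(%esp) = n.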

	ENTRY(memset)
	pushl	%edi		/ save register variable
	movl	8(%esp),%edi	/ %edi = string address
	movl	12(%esp),%eax	/ %al = byte to duplicate
	movl	16(%esp),%ecx	/ %ecx = number of copies
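
	/ Size dispatch (the first matching case below applies):
	/	<= 20 bytes	byte store (rep sstob)
	/	<= 256 bytes	word store (rep sstol), no alignment
	/	<= 511 bytes	align to 8 bytes if needed, then word store
	/	< 256k bytes	align to 64 bytes, then cached SSE stores
	/	larger		align to 64 bytes, then non-temporal SSE stores
	/ (sstob/sstol are the Solaris assembler mnemonics for stosb/stosl)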

	/ For all basic blocks in this routine, maintain the following
	/ entry conditions:	%eax has the desired byte in each byte position
	/			NOTE: .byteset doesn't require this
	/			%ecx contains # bytes to set
	/			%edi contains address to set

	cld			/ make sure we go the right way...
	cmpl	$20,%ecx	/ strings of 20 bytes or fewer are byte set
	jbe	.byteset	

	andl	$0xff, %eax	/ trim anything above low byte
	imul	$0x01010101, %eax	/ extend low byte to each byte
	
	cmpl	$256, %ecx	/ smaller areas don't benefit from alignment
	jbe	.wordset

	cmpl	$511, %ecx	/ areas of 511 bytes or fewer are aligned, then word set
	jbe	.check_wordset

	/
	/ prep work for sse temporal and non-temporal
	/

	pushl	%ebx		/ more registers are needed
	pushl	%esi		/ for alignment work

	/
	/ align address to 64 byte boundaries.
	/

	movl	%ecx, %ebx	/ save byte count
	movl	%edi, %esi	/ esi is scratch register
	andl	$63, %esi	/ offset of addr within its 64 byte block
	neg	%esi		/ compute count of bytes
	addl	$64, %esi	/ needed to align
	andl	$63, %esi	/ to a 64 byte boundary (0 if already aligned)
	jz	.sse_aligned	/ skip alignment if not needed
	subl	%esi, %ebx	/ ebx contains remainder of bytes to set
	movl	%esi, %ecx	/ alignment bytes
	shrl	$2,%ecx		/ %ecx = number of words to set
	rep; sstol
	movl	%esi,%ecx
	andl	$3,%ecx		/ %ecx = number of bytes left
	rep; sstob
	movl	%ebx, %ecx	/ remainder to be set

.sse_aligned:
	
	shr	$6, %ecx	/ number of 64 byte blocks to set

	/
	/ load xmm0 with bytes to be set
	/
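	/ building the pattern in memory and reloading it keeps this
	/ path within baseline SSE; a direct register-to-xmm move (movd)
	/ or a pshufd broadcast would require SSE2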
	subl	$16,%esp	/ give ourselves some working room on the stack
	movl	%eax,(%esp)	/ copy eax into each of 4 bytes
	movl	%eax,4(%esp)	/ avoid pushl since it causes more interlocking
	movl	%eax,8(%esp)	/
	movl	%eax,12(%esp)	/
	movups	(%esp), %xmm0	/ unaligned load from stack into xmm0
	addl	$16,%esp	/ restore stack position
	
	cmpl	$262143, %ebx	/ sets smaller than 256k are stored through the cache
	jbe	.sse_loop
	jmp	.sse_nt_loop	/ branch across alignment nops
		
	.align 16

.sse_nt_loop:	
	movntps %xmm0, (%edi)	/ block non-temporal store
	movntps %xmm0, 16(%edi)	/ use sse rather than sse2
	movntps %xmm0, 32(%edi)	/ so we work more places
	movntps %xmm0, 48(%edi)	/

	addl	$64, %edi	/ increment dest address
	dec	%ecx		/ dec count of blocks
	jnz	.sse_nt_loop	/ jump if not done

	andl	$63, %ebx	/ remainder of bytes to set
	movl	%ebx, %ecx	/ ecx contains remainder of bytes to set
	popl	%esi		/ restore stack config
	popl	%ebx		/
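	/ non-temporal stores are weakly ordered; fence so they are
	/ globally visible before memset returns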
#if defined(_SSE2_INSN)
	mfence
#elif defined(_SSE_INSN)
	sfence
#else
#error "Must have either SSE or SSE2"
#endif
	cmpl	$20, %ecx	/ compare and jump accordingly
	jbe	.byteset
	jmp	.wordset	

	.align 16
.sse_loop:
	movaps %xmm0, (%edi)	/ block store w/ SSE
	movaps %xmm0, 16(%edi)
	movaps %xmm0, 32(%edi)
	movaps %xmm0, 48(%edi)

	addl	$64, %edi	/ increment addr
	dec	%ecx		/ dec count of blocks
	jnz	.sse_loop	/ jump if not done

	andl	$63, %ebx	/ remainder of bytes to set
	movl	%ebx, %ecx	/ in %ecx as normal
	popl	%esi		/ restore stack config
	popl	%ebx		/
	cmpl	$20, %ecx	
	jbe	.byteset
	jmp	.wordset

.check_wordset:
	movl	%edi, %edx	/ save current store ptr
	andl	$7, %edi	/ check 8 byte alignment (clobbers %edi, sets ZF)
	movl	%edx,%edi	/ restore store ptr (mov doesn't change flags)
	jz	.wordset	/ already aligned
	

.align_wordset:	
	pushl	%ebx		/ more registers are needed
	pushl	%esi		

	movl	%ecx, %ebx	/ save byte count
	movl	%edi, %esi	/ esi is scratch register
	andl	$7, %esi	/ offset of addr within its 8 byte block
	neg	%esi		/ compute count of bytes
	addl	$8, %esi	/ needed to align
	andl	$7, %esi	/ to an 8 byte boundary (0 if already aligned)
	subl	%esi, %ebx	/ ebx contains remainder of bytes to set
	movl	%esi, %ecx	/ alignment bytes
	rep; sstob	 
	movl	%ebx, %ecx
	popl	%esi		/ restore stack config
	popl	%ebx		/

.wordset:
	movl	%ecx, %edx	/ save count
	shrl	$2,%ecx		/ %ecx = number of words to set
	rep; sstol
	movl	%edx,%ecx
	andl	$3,%ecx		/ %ecx = number of bytes left

.byteset:
	rep; sstob
	movl	8(%esp),%eax	/ return string address
	popl	%edi		/ restore register variable
	ret
	SET_SIZE(memset)