usr/src/lib/libc/sparc/gen/strcpy.s


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169

/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

	.file	"strcpy.s"

/*
 * strcpy(s1, s2)
 *
 * Copy string s2 to s1.  s1 must be large enough. Return s1.
 *
 * Fast assembler language version of the following C-program strcpy
 * which represents the `standard' for the C-library.
 *
 *	char *
 *	strcpy(s1, s2)
 *	register char *s1;
 *	register const char *s2;
 *	{
 *		char *os1 = s1;
 *	
 *		while(*s1++ = *s2++)
 *			;
 *		return(os1);
 *	}
 *
 */

#include <sys/asm_linkage.h>

	! This is a 32-bit implementation of strcpy.  It works by
	! first checking the alignment of its source pointer.  If the
	! source is not word aligned, it copies leading bytes until it is.
	! Once this has occurred, the copy proceeds one source word at a
	! time, checking each word for a zero byte; how each word is
	! stored depends upon the destination alignment.
	! Methods exist to handle per-byte, half-word, and word sized
	! copies.

	ENTRY(strcpy)

	! Register usage:
	!   %o0  dst as passed in; never written, so it is still the
	!        return value when the routine exits through .done
	!   %o1  src on entry; afterwards scratch (loaded src data, dst & 3)
	!   %o2  working dst pointer
	!   %o3  src - dst, so [%o2 + %o3] is the src location matching
	!        dst location [%o2]; incrementing %o2 advances both.
	!        Reused as scratch in .zerobyte.
	!   %o4  src & 3, then first byte of the aligning half-word, then
	!        0x01010101; reused for byte masks in .zerobyte
	!   %o5  0x80808080; reused for a byte mask in .zerobyte
	!   %g1  scratch for the zero-byte test and for shifted store data
	!
	! The instruction following each branch sits in its delay slot and
	! is executed along with the branch.  On a conditional branch with
	! the ",a" (annul) suffix it is executed only if the branch is taken.

	.align 32

	sub	%o1, %o0, %o3		! %o3 = src - dst (constant from here on)
	andcc	%o1, 3, %o4		! %o4 = src & 3; src word aligned ?
	bz	.srcaligned		! yup
	mov	%o0, %o2		! delay slot: %o2 = working dst, %o0 kept for return

	! src & 3 is 1, 2 or 3 here.
	cmp	%o4, 2			! src halfword aligned
	be	.s2aligned		! yup
	ldub	[%o2 + %o3], %o1	! delay slot: %o1 = src[0] (unused if branch taken)
	tst	%o1			! byte zero?
	stb	%o1, [%o2]		! store first byte (condition codes unchanged)
	bz	.done			! yup, done
	cmp	%o4, 3			! delay slot: only one byte needed to align?
	bz	.srcaligned		! yup
	inc	%o2			! delay slot: src++, dst++ (both via %o2)

	! Reached with src & 3 == 2, either directly or after the single
	! byte copied above when src & 3 was 1.  One half-word finishes
	! the alignment of src.
.s2aligned:
	lduh	[%o2 + %o3], %o1	! %o1<15:0> = next two src bytes
	srl	%o1, 8, %o4		! %o4<7:0> = first byte
	tst	%o4			! first byte zero ?
	bz	.done			! yup, done
	stb	%o4, [%o2]		! delay slot: store first byte (terminator included)
	andcc	%o1, 0xff, %g0		! second byte zero ?
	bz	.done			! yup, done
	stb	%o1, [%o2 + 1]		! delay slot: store second byte
	add	%o2, 2, %o2		! src += 2, dst += 2

	! src is word aligned from here on.  Build the two constants used
	! by the zero-byte test and pick a store loop from dst & 3.
.srcaligned:
	sethi	%hi(0x01010101), %o4	! Alan Mycroft's magic1
	sethi	%hi(0x80808080), %o5	! Alan Mycroft's magic2
	or	%o4, %lo(0x01010101), %o4
	andcc	%o2, 3, %o1		! %o1 = dst & 3; destination word aligned?
	bnz	.dstnotaligned		! nope
	or	%o5, %lo(0x80808080), %o5	! delay slot: finish magic2 on both paths

	! dst & 3 == 0: load a word, test it, store it as one word.
.copyword:
	lduw	[%o2 + %o3], %o1	! src word
	add	%o2, 4, %o2		! src += 4, dst += 4
	andn	%o5, %o1, %g1		! ~word & 0x80808080
	sub	%o1, %o4, %o1		! word - 0x01010101
	andcc	%o1, %g1, %g0		! ((word - 0x01010101) & ~word & 0x80808080)
	add	%o1, %o4, %o1		! restore word (condition codes unchanged)
	bz,a	.copyword		! no zero byte if magic expression == 0
	st	%o1, [%o2 - 4]		! annulled slot: store word only when looping

	! Entered with %o1 = a src word that contains a zero byte and %o2
	! already advanced past it, from any of the three loops.  Stores
	! the bytes one at a time, most significant first, up to and
	! including the first zero byte.  The loop constants in %o3, %o4
	! and %o5 are no longer needed and are overwritten.
.zerobyte:
	set	0xff000000, %o4		! mask for 1st byte
	srl	%o1, 24, %o3		! %o3<7:0> = first byte
	andcc	%o1, %o4, %g0		! first byte zero?
	bz	.done			! yup, done
	stb	%o3, [%o2 - 4]		! delay slot: store first byte
	set	0x00ff0000, %o5		! mask for 2nd byte
	srl	%o1, 16, %o3		! %o3<7:0> = second byte
	andcc	%o1, %o5, %g0		! second byte zero?
	bz	.done			! yup, done
	stb	%o3, [%o2 - 3]		! delay slot: store second byte
	srl	%o4, 16, %o4		! 0x0000ff00 = mask for 3rd byte
	andcc	%o1, %o4, %g0		! third byte zero?
	srl	%o1, 8, %o3		! %o3<7:0> = third byte
	bz	.done			! yup, done
	stb	%o3, [%o2 - 2]		! delay slot: store third byte
	stb	%o1, [%o2 - 1]		! first three were non-zero, so this is the zero

	! Common exit; %o0 still holds the original dst.  The .empty
	! directive marks the retl delay slot as deliberately taken by the
	! next instruction in the file (the cmp at .dstnotaligned), which
	! only changes the condition codes.
.done:
	retl				! done with leaf function
	.empty

	! dst & 3 (in %o1) is 1, 2 or 3 here.
.dstnotaligned:
	cmp	%o1, 2			! dst half word aligned?
	be,a	.storehalfword2		! yup, store half word at a time
	lduw	[%o2 + %o3], %o1	! annulled slot: first src word, only if taken

	! dst & 3 is 1 or 3, so dst + 1 is half-word aligned: each word
	! goes out as byte, half-word, byte.
.storebyte:
	lduw	[%o2 + %o3], %o1	! src word
	add	%o2, 4, %o2		! src += 4, dst += 4
	sub	%o1, %o4, %g1		! x - 0x01010101
	andn	%g1, %o1, %g1		! (x - 0x01010101) & ~x
	andcc	%g1, %o5, %g0		! ((x - 0x01010101) & ~x & 0x80808080)
	bnz	.zerobyte		! word has zero byte, handle end cases
	srl	%o1, 24, %g1		! delay slot: %g1<7:0> = first byte
	stb	%g1, [%o2 - 4]		! store first byte; half-word aligned now
	srl	%o1, 8, %g1		! %g1<15:0> = byte 2, 3
	sth	%g1, [%o2 - 3]		! store bytes 2, 3
	ba	.storebyte		! next word
	stb	%o1, [%o2 - 1]		! delay slot: store fourth byte

	! dst & 3 == 2: each word goes out as two half-words.  The first
	! pass enters at .storehalfword2 with the word already loaded by
	! the annulled delay slot in .dstnotaligned.
.storehalfword:
	lduw	[%o2 + %o3], %o1	! src word
.storehalfword2:
	add	%o2, 4, %o2		! src += 4, dst += 4
	sub	%o1, %o4, %g1		! x - 0x01010101
	andn	%g1, %o1, %g1		! (x - 0x01010101) & ~x
	andcc	%g1, %o5, %g0		! ((x - 0x01010101) & ~x & 0x80808080)
	bnz	.zerobyte		! word has zero byte, handle end cases
	srl	%o1, 16, %g1		! delay slot: get first and second byte
	sth	%g1, [%o2 - 4]		! store first and second byte
	ba	.storehalfword		! next word
	sth	%o1, [%o2 - 2]		! delay slot: store third and fourth byte

	! DO NOT remove these NOPs. It will slow down the halfword loop by 15%

	nop				! padding
	nop				! padding

	SET_SIZE(strcpy)