1 files changed, 261 insertions, 0 deletions
diff --git a/src/runtime/memmove_arm.s b/src/runtime/memmove_arm.s
new file mode 100644
index 000000000..f187d4267
--- /dev/null
+++ b/src/runtime/memmove_arm.s
@@ -0,0 +1,261 @@
+// Inferno's libkern/memmove-arm.s
+// http://code.google.com/p/inferno-os/source/browse/libkern/memmove-arm.s
+//
+//         Copyright © 1994-1999 Lucent Technologies Inc.  All rights reserved.
+//         Revisions Copyright © 2000-2007 Vita Nuova Holdings Limited (www.vitanuova.com).  All rights reserved.
+//         Portions Copyright 2009 The Go Authors. All rights reserved.
+//
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated documentation files (the "Software"), to deal
+// in the Software without restriction, including without limitation the rights
+// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+// copies of the Software, and to permit persons to whom the Software is
+// furnished to do so, subject to the following conditions:
+//
+// The above copyright notice and this permission notice shall be included in
+// all copies or substantial portions of the Software.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+// THE SOFTWARE.
+
+#include "textflag.h"
+
+// TE or TS are spilled to the stack during bulk register moves.
+TS = 0
+TE = 8
+
+// Warning: the linker will use R11 to synthesize certain instructions. Please
+// take care and double check with objdump.
+FROM = 11
+N = 12
+TMP = 12				/* N and TMP don't overlap */
+TMP1 = 5
+
+RSHIFT = 5
+LSHIFT = 6
+OFFSET = 7
+
+BR0 = 0					/* shared with TS */
+BW0 = 1
+BR1 = 1
+BW1 = 2
+BR2 = 2
+BW2 = 3
+BR3 = 3
+BW3 = 4
+
+FW0 = 1
+FR0 = 2
+FW1 = 2
+FR1 = 3
+FW2 = 3
+FR2 = 4
+FW3 = 4
+FR3 = 8					/* shared with TE */
+
+TEXT runtime·memmove(SB), NOSPLIT, $4-12
+_memmove:
+	MOVW	to+0(FP), R(TS)
+	MOVW	from+4(FP), R(FROM)
+	MOVW	n+8(FP), R(N)
+
+	ADD	R(N), R(TS), R(TE)	/* to end pointer */
+
+	CMP	R(FROM), R(TS)
+	BLS	_forward
+
+_back:
+	ADD	R(N), R(FROM)		/* from end pointer */
+	CMP	$4, R(N)		/* need at least 4 bytes to copy */
+	BLT	_b1tail
+
+_b4align:				/* align destination on 4 */
+	AND.S	$3, R(TE), R(TMP)
+	BEQ	_b4aligned
+
+	MOVBU.W	-1(R(FROM)), R(TMP)	/* pre-indexed */
+	MOVBU.W	R(TMP), -1(R(TE))	/* pre-indexed */
+	B	_b4align
+
+_b4aligned:				/* is source now aligned? */
+	AND.S	$3, R(FROM), R(TMP)
+	BNE	_bunaligned
+
+	ADD	$31, R(TS), R(TMP)	/* do 32-byte chunks if possible */
+	MOVW	R(TS), savedts-4(SP)
+_b32loop:
+	CMP	R(TMP), R(TE)
+	BLS	_b4tail
+
+	MOVM.DB.W (R(FROM)), [R0-R7]
+	MOVM.DB.W [R0-R7], (R(TE))
+	B	_b32loop
+
+_b4tail:				/* do remaining words if possible */
+	MOVW	savedts-4(SP), R(TS)
+	ADD	$3, R(TS), R(TMP)
+_b4loop:
+	CMP	R(TMP), R(TE)
+	BLS	_b1tail
+
+	MOVW.W	-4(R(FROM)), R(TMP1)	/* pre-indexed */
+	MOVW.W	R(TMP1), -4(R(TE))	/* pre-indexed */
+	B	_b4loop
+
+_b1tail:				/* remaining bytes */
+	CMP	R(TE), R(TS)
+	BEQ	_return
+
+	MOVBU.W	-1(R(FROM)), R(TMP)	/* pre-indexed */
+	MOVBU.W	R(TMP), -1(R(TE))	/* pre-indexed */
+	B	_b1tail
+
+_forward:
+	CMP	$4, R(N)		/* need at least 4 bytes to copy */
+	BLT	_f1tail
+
+_f4align:				/* align destination on 4 */
+	AND.S	$3, R(TS), R(TMP)
+	BEQ	_f4aligned
+
+	MOVBU.P	1(R(FROM)), R(TMP)	/* implicit write back */
+	MOVBU.P	R(TMP), 1(R(TS))	/* implicit write back */
+	B	_f4align
+
+_f4aligned:				/* is source now aligned? */
+	AND.S	$3, R(FROM), R(TMP)
+	BNE	_funaligned
+
+	SUB	$31, R(TE), R(TMP)	/* do 32-byte chunks if possible */
+	MOVW	R(TE), savedte-4(SP)
+_f32loop:
+	CMP	R(TMP), R(TS)
+	BHS	_f4tail
+
+	MOVM.IA.W (R(FROM)), [R1-R8] 
+	MOVM.IA.W [R1-R8], (R(TS))
+	B	_f32loop
+
+_f4tail:
+	MOVW	savedte-4(SP), R(TE)
+	SUB	$3, R(TE), R(TMP)	/* do remaining words if possible */
+_f4loop:
+	CMP	R(TMP), R(TS)
+	BHS	_f1tail
+
+	MOVW.P	4(R(FROM)), R(TMP1)	/* implicit write back */
+	MOVW.P	R(TMP1), 4(R(TS))	/* implicit write back */
+	B	_f4loop
+
+_f1tail:
+	CMP	R(TS), R(TE)
+	BEQ	_return
+
+	MOVBU.P	1(R(FROM)), R(TMP)	/* implicit write back */
+	MOVBU.P	R(TMP), 1(R(TS))	/* implicit write back */
+	B	_f1tail
+
+_return:
+	MOVW	to+0(FP), R0
+	RET
+
+_bunaligned:
+	CMP	$2, R(TMP)		/* is R(TMP) < 2 ? */
+
+	MOVW.LT	$8, R(RSHIFT)		/* (R(n)<<24)|(R(n-1)>>8) */
+	MOVW.LT	$24, R(LSHIFT)
+	MOVW.LT	$1, R(OFFSET)
+
+	MOVW.EQ	$16, R(RSHIFT)		/* (R(n)<<16)|(R(n-1)>>16) */
+	MOVW.EQ	$16, R(LSHIFT)
+	MOVW.EQ	$2, R(OFFSET)
+
+	MOVW.GT	$24, R(RSHIFT)		/* (R(n)<<8)|(R(n-1)>>24) */
+	MOVW.GT	$8, R(LSHIFT)
+	MOVW.GT	$3, R(OFFSET)
+
+	ADD	$16, R(TS), R(TMP)	/* do 16-byte chunks if possible */
+	CMP	R(TMP), R(TE)
+	BLS	_b1tail
+
+	BIC	$3, R(FROM)		/* align source */
+	MOVW	R(TS), savedts-4(SP)
+	MOVW	(R(FROM)), R(BR0)	/* prime first block register */
+
+_bu16loop:
+	CMP	R(TMP), R(TE)
+	BLS	_bu1tail
+
+	MOVW	R(BR0)<<R(LSHIFT), R(BW3)
+	MOVM.DB.W (R(FROM)), [R(BR0)-R(BR3)]
+	ORR	R(BR3)>>R(RSHIFT), R(BW3)
+
+	MOVW	R(BR3)<<R(LSHIFT), R(BW2)
+	ORR	R(BR2)>>R(RSHIFT), R(BW2)
+
+	MOVW	R(BR2)<<R(LSHIFT), R(BW1)
+	ORR	R(BR1)>>R(RSHIFT), R(BW1)
+
+	MOVW	R(BR1)<<R(LSHIFT), R(BW0)
+	ORR	R(BR0)>>R(RSHIFT), R(BW0)
+
+	MOVM.DB.W [R(BW0)-R(BW3)], (R(TE))
+	B	_bu16loop
+
+_bu1tail:
+	MOVW	savedts-4(SP), R(TS)
+	ADD	R(OFFSET), R(FROM)
+	B	_b1tail
+
+_funaligned:
+	CMP	$2, R(TMP)
+
+	MOVW.LT	$8, R(RSHIFT)		/* (R(n+1)<<24)|(R(n)>>8) */
+	MOVW.LT	$24, R(LSHIFT)
+	MOVW.LT	$3, R(OFFSET)
+
+	MOVW.EQ	$16, R(RSHIFT)		/* (R(n+1)<<16)|(R(n)>>16) */
+	MOVW.EQ	$16, R(LSHIFT)
+	MOVW.EQ	$2, R(OFFSET)
+
+	MOVW.GT	$24, R(RSHIFT)		/* (R(n+1)<<8)|(R(n)>>24) */
+	MOVW.GT	$8, R(LSHIFT)
+	MOVW.GT	$1, R(OFFSET)
+
+	SUB	$16, R(TE), R(TMP)	/* do 16-byte chunks if possible */
+	CMP	R(TMP), R(TS)
+	BHS	_f1tail
+
+	BIC	$3, R(FROM)		/* align source */
+	MOVW	R(TE), savedte-4(SP)
+	MOVW.P	4(R(FROM)), R(FR3)	/* prime last block register, implicit write back */
+
+_fu16loop:
+	CMP	R(TMP), R(TS)
+	BHS	_fu1tail
+
+	MOVW	R(FR3)>>R(RSHIFT), R(FW0)
+	MOVM.IA.W (R(FROM)), [R(FR0),R(FR1),R(FR2),R(FR3)]
+	ORR	R(FR0)<<R(LSHIFT), R(FW0)
+
+	MOVW	R(FR0)>>R(RSHIFT), R(FW1)
+	ORR	R(FR1)<<R(LSHIFT), R(FW1)
+
+	MOVW	R(FR1)>>R(RSHIFT), R(FW2)
+	ORR	R(FR2)<<R(LSHIFT), R(FW2)
+
+	MOVW	R(FR2)>>R(RSHIFT), R(FW3)
+	ORR	R(FR3)<<R(LSHIFT), R(FW3)
+
+	MOVM.IA.W [R(FW0),R(FW1),R(FW2),R(FW3)], (R(TS))
+	B	_fu16loop
+
+_fu1tail:
+	MOVW	savedte-4(SP), R(TE)
+	SUB	R(OFFSET), R(FROM)
+	B	_f1tail