author | Tianon Gravi <admwiggin@gmail.com> | 2015-01-15 11:54:00 -0700
---|---|---
committer | Tianon Gravi <admwiggin@gmail.com> | 2015-01-15 11:54:00 -0700
commit | f154da9e12608589e8d5f0508f908a0c3e88a1bb | (patch)
tree | f8255d51e10c6f1e0ed69702200b966c9556a431 | /src/runtime
parent | 8d8329ed5dfb9622c82a9fbec6fd99a580f9c9f6 | (diff)
download | golang-f154da9e12608589e8d5f0508f908a0c3e88a1bb.tar.gz |
Imported Upstream version 1.4 (upstream/1.4)
Diffstat (limited to 'src/runtime')
414 files changed, 80158 insertions, 0 deletions
diff --git a/src/runtime/Makefile b/src/runtime/Makefile new file mode 100644 index 000000000..55087def0 --- /dev/null +++ b/src/runtime/Makefile @@ -0,0 +1,5 @@ +# Copyright 2009 The Go Authors. All rights reserved. +# Use of this source code is governed by a BSD-style +# license that can be found in the LICENSE file. + +include ../Make.dist diff --git a/src/runtime/alg.go b/src/runtime/alg.go new file mode 100644 index 000000000..e9ed59503 --- /dev/null +++ b/src/runtime/alg.go @@ -0,0 +1,352 @@ +// Copyright 2014 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package runtime + +import "unsafe" + +const ( + c0 = uintptr((8-ptrSize)/4*2860486313 + (ptrSize-4)/4*33054211828000289) + c1 = uintptr((8-ptrSize)/4*3267000013 + (ptrSize-4)/4*23344194077549503) +) + +// type algorithms - known to compiler +const ( + alg_MEM = iota + alg_MEM0 + alg_MEM8 + alg_MEM16 + alg_MEM32 + alg_MEM64 + alg_MEM128 + alg_NOEQ + alg_NOEQ0 + alg_NOEQ8 + alg_NOEQ16 + alg_NOEQ32 + alg_NOEQ64 + alg_NOEQ128 + alg_STRING + alg_INTER + alg_NILINTER + alg_SLICE + alg_FLOAT32 + alg_FLOAT64 + alg_CPLX64 + alg_CPLX128 + alg_max +) + +type typeAlg struct { + // function for hashing objects of this type + // (ptr to object, size, seed) -> hash + hash func(unsafe.Pointer, uintptr, uintptr) uintptr + // function for comparing objects of this type + // (ptr to object A, ptr to object B, size) -> ==? + equal func(unsafe.Pointer, unsafe.Pointer, uintptr) bool +} + +var algarray = [alg_max]typeAlg{ + alg_MEM: {memhash, memequal}, + alg_MEM0: {memhash, memequal0}, + alg_MEM8: {memhash, memequal8}, + alg_MEM16: {memhash, memequal16}, + alg_MEM32: {memhash, memequal32}, + alg_MEM64: {memhash, memequal64}, + alg_MEM128: {memhash, memequal128}, + alg_NOEQ: {nil, nil}, + alg_NOEQ0: {nil, nil}, + alg_NOEQ8: {nil, nil}, + alg_NOEQ16: {nil, nil}, + alg_NOEQ32: {nil, nil}, + alg_NOEQ64: {nil, nil}, + alg_NOEQ128: {nil, nil}, + alg_STRING: {strhash, strequal}, + alg_INTER: {interhash, interequal}, + alg_NILINTER: {nilinterhash, nilinterequal}, + alg_SLICE: {nil, nil}, + alg_FLOAT32: {f32hash, f32equal}, + alg_FLOAT64: {f64hash, f64equal}, + alg_CPLX64: {c64hash, c64equal}, + alg_CPLX128: {c128hash, c128equal}, +} + +const nacl = GOOS == "nacl" + +var useAeshash bool + +// in asm_*.s +func aeshash(p unsafe.Pointer, s, h uintptr) uintptr +func aeshash32(p unsafe.Pointer, s, h uintptr) uintptr +func aeshash64(p unsafe.Pointer, s, h uintptr) uintptr +func aeshashstr(p unsafe.Pointer, s, h uintptr) uintptr + +func memhash(p unsafe.Pointer, s, h uintptr) uintptr { + if !nacl && useAeshash { + return aeshash(p, s, h) + } + + h ^= c0 + for s > 0 { + h = (h ^ uintptr(*(*byte)(p))) * c1 + p = add(p, 1) + s-- + } + return h +} + +func strhash(a unsafe.Pointer, s, h uintptr) uintptr { + return memhash((*stringStruct)(a).str, uintptr(len(*(*string)(a))), h) +} + +// NOTE: Because NaN != NaN, a map can contain any +// number of (mostly useless) entries keyed with NaNs. +// To avoid long hash chains, we assign a random number +// as the hash value for a NaN. 
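The NaN note above is the rationale for mixing a random value into the hash of a NaN key. As a standalone illustration (not part of the imported sources), the short Go program below shows the map behavior the note describes: every store under a NaN key creates a fresh entry that can never be looked up again.

```go
package main

import (
	"fmt"
	"math"
)

func main() {
	m := make(map[float64]int)
	nan := math.NaN()
	for i := 0; i < 3; i++ {
		m[nan] = i // NaN != NaN, so each store adds a new entry
	}
	fmt.Println(len(m)) // 3: three separate, mostly useless entries
	_, ok := m[nan]
	fmt.Println(ok) // false: none of them can ever be found again
}
```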
+ +func f32hash(p unsafe.Pointer, s, h uintptr) uintptr { + f := *(*float32)(p) + switch { + case f == 0: + return c1 * (c0 ^ h) // +0, -0 + case f != f: + return c1 * (c0 ^ h ^ uintptr(fastrand1())) // any kind of NaN + default: + return memhash(p, 4, h) + } +} + +func f64hash(p unsafe.Pointer, s, h uintptr) uintptr { + f := *(*float64)(p) + switch { + case f == 0: + return c1 * (c0 ^ h) // +0, -0 + case f != f: + return c1 * (c0 ^ h ^ uintptr(fastrand1())) // any kind of NaN + default: + return memhash(p, 8, h) + } +} + +func c64hash(p unsafe.Pointer, s, h uintptr) uintptr { + x := (*[2]float32)(p) + return f32hash(unsafe.Pointer(&x[1]), 4, f32hash(unsafe.Pointer(&x[0]), 4, h)) +} + +func c128hash(p unsafe.Pointer, s, h uintptr) uintptr { + x := (*[2]float64)(p) + return f64hash(unsafe.Pointer(&x[1]), 8, f64hash(unsafe.Pointer(&x[0]), 8, h)) +} + +func interhash(p unsafe.Pointer, s, h uintptr) uintptr { + a := (*iface)(p) + tab := a.tab + if tab == nil { + return h + } + t := tab._type + fn := goalg(t.alg).hash + if fn == nil { + panic(errorString("hash of unhashable type " + *t._string)) + } + if isDirectIface(t) { + return c1 * fn(unsafe.Pointer(&a.data), uintptr(t.size), h^c0) + } else { + return c1 * fn(a.data, uintptr(t.size), h^c0) + } +} + +func nilinterhash(p unsafe.Pointer, s, h uintptr) uintptr { + a := (*eface)(p) + t := a._type + if t == nil { + return h + } + fn := goalg(t.alg).hash + if fn == nil { + panic(errorString("hash of unhashable type " + *t._string)) + } + if isDirectIface(t) { + return c1 * fn(unsafe.Pointer(&a.data), uintptr(t.size), h^c0) + } else { + return c1 * fn(a.data, uintptr(t.size), h^c0) + } +} + +func memequal(p, q unsafe.Pointer, size uintptr) bool { + if p == q { + return true + } + return memeq(p, q, size) +} + +func memequal0(p, q unsafe.Pointer, size uintptr) bool { + return true +} +func memequal8(p, q unsafe.Pointer, size uintptr) bool { + return *(*int8)(p) == *(*int8)(q) +} +func memequal16(p, q unsafe.Pointer, size uintptr) bool { + return *(*int16)(p) == *(*int16)(q) +} +func memequal32(p, q unsafe.Pointer, size uintptr) bool { + return *(*int32)(p) == *(*int32)(q) +} +func memequal64(p, q unsafe.Pointer, size uintptr) bool { + return *(*int64)(p) == *(*int64)(q) +} +func memequal128(p, q unsafe.Pointer, size uintptr) bool { + return *(*[2]int64)(p) == *(*[2]int64)(q) +} +func f32equal(p, q unsafe.Pointer, size uintptr) bool { + return *(*float32)(p) == *(*float32)(q) +} +func f64equal(p, q unsafe.Pointer, size uintptr) bool { + return *(*float64)(p) == *(*float64)(q) +} +func c64equal(p, q unsafe.Pointer, size uintptr) bool { + return *(*complex64)(p) == *(*complex64)(q) +} +func c128equal(p, q unsafe.Pointer, size uintptr) bool { + return *(*complex128)(p) == *(*complex128)(q) +} +func strequal(p, q unsafe.Pointer, size uintptr) bool { + return *(*string)(p) == *(*string)(q) +} +func interequal(p, q unsafe.Pointer, size uintptr) bool { + return ifaceeq(*(*interface { + f() + })(p), *(*interface { + f() + })(q)) +} +func nilinterequal(p, q unsafe.Pointer, size uintptr) bool { + return efaceeq(*(*interface{})(p), *(*interface{})(q)) +} +func efaceeq(p, q interface{}) bool { + x := (*eface)(unsafe.Pointer(&p)) + y := (*eface)(unsafe.Pointer(&q)) + t := x._type + if t != y._type { + return false + } + if t == nil { + return true + } + eq := goalg(t.alg).equal + if eq == nil { + panic(errorString("comparing uncomparable type " + *t._string)) + } + if isDirectIface(t) { + return eq(noescape(unsafe.Pointer(&x.data)), 
noescape(unsafe.Pointer(&y.data)), uintptr(t.size)) + } + return eq(x.data, y.data, uintptr(t.size)) +} +func ifaceeq(p, q interface { + f() +}) bool { + x := (*iface)(unsafe.Pointer(&p)) + y := (*iface)(unsafe.Pointer(&q)) + xtab := x.tab + if xtab != y.tab { + return false + } + if xtab == nil { + return true + } + t := xtab._type + eq := goalg(t.alg).equal + if eq == nil { + panic(errorString("comparing uncomparable type " + *t._string)) + } + if isDirectIface(t) { + return eq(noescape(unsafe.Pointer(&x.data)), noescape(unsafe.Pointer(&y.data)), uintptr(t.size)) + } + return eq(x.data, y.data, uintptr(t.size)) +} + +// Testing adapters for hash quality tests (see hash_test.go) +func haveGoodHash() bool { + return useAeshash +} + +func stringHash(s string, seed uintptr) uintptr { + return algarray[alg_STRING].hash(noescape(unsafe.Pointer(&s)), unsafe.Sizeof(s), seed) +} + +func bytesHash(b []byte, seed uintptr) uintptr { + s := (*sliceStruct)(unsafe.Pointer(&b)) + return algarray[alg_MEM].hash(s.array, uintptr(s.len), seed) +} + +func int32Hash(i uint32, seed uintptr) uintptr { + return algarray[alg_MEM32].hash(noescape(unsafe.Pointer(&i)), 4, seed) +} + +func int64Hash(i uint64, seed uintptr) uintptr { + return algarray[alg_MEM64].hash(noescape(unsafe.Pointer(&i)), 8, seed) +} + +func efaceHash(i interface{}, seed uintptr) uintptr { + return algarray[alg_NILINTER].hash(noescape(unsafe.Pointer(&i)), unsafe.Sizeof(i), seed) +} + +func ifaceHash(i interface { + F() +}, seed uintptr) uintptr { + return algarray[alg_INTER].hash(noescape(unsafe.Pointer(&i)), unsafe.Sizeof(i), seed) +} + +// Testing adapter for memclr +func memclrBytes(b []byte) { + s := (*sliceStruct)(unsafe.Pointer(&b)) + memclr(s.array, uintptr(s.len)) +} + +// TODO(dvyukov): remove when Type is converted to Go and contains *typeAlg. +func goalg(a unsafe.Pointer) *typeAlg { + return (*typeAlg)(a) +} + +// used in asm_{386,amd64}.s +const hashRandomBytes = 32 + +var aeskeysched [hashRandomBytes]byte + +//go:noescape +func get_random_data(rnd *unsafe.Pointer, n *int32) + +func init() { + if theGoos == "nacl" { + return + } + + // Install aes hash algorithm if we have the instructions we need + if (cpuid_ecx&(1<<25)) != 0 && // aes (aesenc) + (cpuid_ecx&(1<<9)) != 0 && // sse3 (pshufb) + (cpuid_ecx&(1<<19)) != 0 { // sse4.1 (pinsr{d,q}) + useAeshash = true + algarray[alg_MEM].hash = aeshash + algarray[alg_MEM8].hash = aeshash + algarray[alg_MEM16].hash = aeshash + algarray[alg_MEM32].hash = aeshash32 + algarray[alg_MEM64].hash = aeshash64 + algarray[alg_MEM128].hash = aeshash + algarray[alg_STRING].hash = aeshashstr + // Initialize with random data so hash collisions will be hard to engineer. + var rnd unsafe.Pointer + var n int32 + get_random_data(&rnd, &n) + if n > hashRandomBytes { + n = hashRandomBytes + } + memmove(unsafe.Pointer(&aeskeysched[0]), rnd, uintptr(n)) + if n < hashRandomBytes { + // Not very random, but better than nothing. + for t := nanotime(); n < hashRandomBytes; n++ { + aeskeysched[n] = byte(t >> uint(8*(n%8))) + } + } + } +} diff --git a/src/runtime/append_test.go b/src/runtime/append_test.go new file mode 100644 index 000000000..a67dc9b49 --- /dev/null +++ b/src/runtime/append_test.go @@ -0,0 +1,190 @@ +// Copyright 2011 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. 
+package runtime_test + +import "testing" + +const N = 20 + +func BenchmarkAppend(b *testing.B) { + b.StopTimer() + x := make([]int, 0, N) + b.StartTimer() + for i := 0; i < b.N; i++ { + x = x[0:0] + for j := 0; j < N; j++ { + x = append(x, j) + } + } +} + +func BenchmarkAppendGrowByte(b *testing.B) { + for i := 0; i < b.N; i++ { + var x []byte + for j := 0; j < 1<<20; j++ { + x = append(x, byte(j)) + } + } +} + +func BenchmarkAppendGrowString(b *testing.B) { + var s string + for i := 0; i < b.N; i++ { + var x []string + for j := 0; j < 1<<20; j++ { + x = append(x, s) + } + } +} + +func benchmarkAppendBytes(b *testing.B, length int) { + b.StopTimer() + x := make([]byte, 0, N) + y := make([]byte, length) + b.StartTimer() + for i := 0; i < b.N; i++ { + x = x[0:0] + x = append(x, y...) + } +} + +func BenchmarkAppend1Byte(b *testing.B) { + benchmarkAppendBytes(b, 1) +} + +func BenchmarkAppend4Bytes(b *testing.B) { + benchmarkAppendBytes(b, 4) +} + +func BenchmarkAppend7Bytes(b *testing.B) { + benchmarkAppendBytes(b, 7) +} + +func BenchmarkAppend8Bytes(b *testing.B) { + benchmarkAppendBytes(b, 8) +} + +func BenchmarkAppend15Bytes(b *testing.B) { + benchmarkAppendBytes(b, 15) +} + +func BenchmarkAppend16Bytes(b *testing.B) { + benchmarkAppendBytes(b, 16) +} + +func BenchmarkAppend32Bytes(b *testing.B) { + benchmarkAppendBytes(b, 32) +} + +func benchmarkAppendStr(b *testing.B, str string) { + b.StopTimer() + x := make([]byte, 0, N) + b.StartTimer() + for i := 0; i < b.N; i++ { + x = x[0:0] + x = append(x, str...) + } +} + +func BenchmarkAppendStr1Byte(b *testing.B) { + benchmarkAppendStr(b, "1") +} + +func BenchmarkAppendStr4Bytes(b *testing.B) { + benchmarkAppendStr(b, "1234") +} + +func BenchmarkAppendStr8Bytes(b *testing.B) { + benchmarkAppendStr(b, "12345678") +} + +func BenchmarkAppendStr16Bytes(b *testing.B) { + benchmarkAppendStr(b, "1234567890123456") +} + +func BenchmarkAppendStr32Bytes(b *testing.B) { + benchmarkAppendStr(b, "12345678901234567890123456789012") +} + +func BenchmarkAppendSpecialCase(b *testing.B) { + b.StopTimer() + x := make([]int, 0, N) + b.StartTimer() + for i := 0; i < b.N; i++ { + x = x[0:0] + for j := 0; j < N; j++ { + if len(x) < cap(x) { + x = x[:len(x)+1] + x[len(x)-1] = j + } else { + x = append(x, j) + } + } + } +} + +var x []int + +func f() int { + x[:1][0] = 3 + return 2 +} + +func TestSideEffectOrder(t *testing.T) { + x = make([]int, 0, 10) + x = append(x, 1, f()) + if x[0] != 1 || x[1] != 2 { + t.Error("append failed: ", x[0], x[1]) + } +} + +func TestAppendOverlap(t *testing.T) { + x := []byte("1234") + x = append(x[1:], x...) // p > q in runtime·appendslice. 
+ got := string(x) + want := "2341234" + if got != want { + t.Errorf("overlap failed: got %q want %q", got, want) + } +} + +func benchmarkCopySlice(b *testing.B, l int) { + s := make([]byte, l) + buf := make([]byte, 4096) + var n int + for i := 0; i < b.N; i++ { + n = copy(buf, s) + } + b.SetBytes(int64(n)) +} + +func benchmarkCopyStr(b *testing.B, l int) { + s := string(make([]byte, l)) + buf := make([]byte, 4096) + var n int + for i := 0; i < b.N; i++ { + n = copy(buf, s) + } + b.SetBytes(int64(n)) +} + +func BenchmarkCopy1Byte(b *testing.B) { benchmarkCopySlice(b, 1) } +func BenchmarkCopy2Byte(b *testing.B) { benchmarkCopySlice(b, 2) } +func BenchmarkCopy4Byte(b *testing.B) { benchmarkCopySlice(b, 4) } +func BenchmarkCopy8Byte(b *testing.B) { benchmarkCopySlice(b, 8) } +func BenchmarkCopy12Byte(b *testing.B) { benchmarkCopySlice(b, 12) } +func BenchmarkCopy16Byte(b *testing.B) { benchmarkCopySlice(b, 16) } +func BenchmarkCopy32Byte(b *testing.B) { benchmarkCopySlice(b, 32) } +func BenchmarkCopy128Byte(b *testing.B) { benchmarkCopySlice(b, 128) } +func BenchmarkCopy1024Byte(b *testing.B) { benchmarkCopySlice(b, 1024) } + +func BenchmarkCopy1String(b *testing.B) { benchmarkCopyStr(b, 1) } +func BenchmarkCopy2String(b *testing.B) { benchmarkCopyStr(b, 2) } +func BenchmarkCopy4String(b *testing.B) { benchmarkCopyStr(b, 4) } +func BenchmarkCopy8String(b *testing.B) { benchmarkCopyStr(b, 8) } +func BenchmarkCopy12String(b *testing.B) { benchmarkCopyStr(b, 12) } +func BenchmarkCopy16String(b *testing.B) { benchmarkCopyStr(b, 16) } +func BenchmarkCopy32String(b *testing.B) { benchmarkCopyStr(b, 32) } +func BenchmarkCopy128String(b *testing.B) { benchmarkCopyStr(b, 128) } +func BenchmarkCopy1024String(b *testing.B) { benchmarkCopyStr(b, 1024) } diff --git a/src/runtime/arch_386.go b/src/runtime/arch_386.go new file mode 100644 index 000000000..79d38c7ab --- /dev/null +++ b/src/runtime/arch_386.go @@ -0,0 +1,8 @@ +// Copyright 2014 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package runtime + +type uintreg uint32 +type intptr int32 // TODO(rsc): remove diff --git a/src/runtime/arch_386.h b/src/runtime/arch_386.h new file mode 100644 index 000000000..75a5ba77f --- /dev/null +++ b/src/runtime/arch_386.h @@ -0,0 +1,17 @@ +// Copyright 2011 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +enum { + thechar = '8', + BigEndian = 0, + CacheLineSize = 64, + RuntimeGogoBytes = 64, +#ifdef GOOS_nacl + PhysPageSize = 65536, +#else + PhysPageSize = 4096, +#endif + PCQuantum = 1, + Int64Align = 4 +}; diff --git a/src/runtime/arch_amd64.go b/src/runtime/arch_amd64.go new file mode 100644 index 000000000..270cd7b95 --- /dev/null +++ b/src/runtime/arch_amd64.go @@ -0,0 +1,8 @@ +// Copyright 2014 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package runtime + +type uintreg uint64 +type intptr int64 // TODO(rsc): remove diff --git a/src/runtime/arch_amd64.h b/src/runtime/arch_amd64.h new file mode 100644 index 000000000..d7b81ee90 --- /dev/null +++ b/src/runtime/arch_amd64.h @@ -0,0 +1,25 @@ +// Copyright 2011 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. 
+ +enum { + thechar = '6', + BigEndian = 0, + CacheLineSize = 64, +#ifdef GOOS_solaris + RuntimeGogoBytes = 80, +#else +#ifdef GOOS_windows + RuntimeGogoBytes = 80, +#else +#ifdef GOOS_plan9 + RuntimeGogoBytes = 80, +#else + RuntimeGogoBytes = 64, +#endif // Plan 9 +#endif // Windows +#endif // Solaris + PhysPageSize = 4096, + PCQuantum = 1, + Int64Align = 8 +}; diff --git a/src/runtime/arch_amd64p32.go b/src/runtime/arch_amd64p32.go new file mode 100644 index 000000000..5c636aeab --- /dev/null +++ b/src/runtime/arch_amd64p32.go @@ -0,0 +1,8 @@ +// Copyright 2014 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package runtime + +type uintreg uint64 +type intptr int32 // TODO(rsc): remove diff --git a/src/runtime/arch_amd64p32.h b/src/runtime/arch_amd64p32.h new file mode 100644 index 000000000..d3e864987 --- /dev/null +++ b/src/runtime/arch_amd64p32.h @@ -0,0 +1,17 @@ +// Copyright 2011 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +enum { + thechar = '6', + BigEndian = 0, + CacheLineSize = 64, + RuntimeGogoBytes = 64, +#ifdef GOOS_nacl + PhysPageSize = 65536, +#else + PhysPageSize = 4096, +#endif + PCQuantum = 1, + Int64Align = 8 +}; diff --git a/src/runtime/arch_arm.go b/src/runtime/arch_arm.go new file mode 100644 index 000000000..79d38c7ab --- /dev/null +++ b/src/runtime/arch_arm.go @@ -0,0 +1,8 @@ +// Copyright 2014 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package runtime + +type uintreg uint32 +type intptr int32 // TODO(rsc): remove diff --git a/src/runtime/arch_arm.h b/src/runtime/arch_arm.h new file mode 100644 index 000000000..637a334a0 --- /dev/null +++ b/src/runtime/arch_arm.h @@ -0,0 +1,17 @@ +// Copyright 2011 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +enum { + thechar = '5', + BigEndian = 0, + CacheLineSize = 32, + RuntimeGogoBytes = 60, +#ifdef GOOS_nacl + PhysPageSize = 65536, +#else + PhysPageSize = 4096, +#endif + PCQuantum = 4, + Int64Align = 4 +}; diff --git a/src/runtime/asm.s b/src/runtime/asm.s new file mode 100644 index 000000000..e6d782f37 --- /dev/null +++ b/src/runtime/asm.s @@ -0,0 +1,14 @@ +// Copyright 2014 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#include "textflag.h" + +// funcdata for functions with no local variables in frame. +// Define two zero-length bitmaps, because the same index is used +// for the local variables as for the argument frame, and assembly +// frames have two argument bitmaps, one without results and one with results. +DATA runtime·no_pointers_stackmap+0x00(SB)/4, $2 +DATA runtime·no_pointers_stackmap+0x04(SB)/4, $0 +GLOBL runtime·no_pointers_stackmap(SB),RODATA, $8 + diff --git a/src/runtime/asm_386.s b/src/runtime/asm_386.s new file mode 100644 index 000000000..b4b81d739 --- /dev/null +++ b/src/runtime/asm_386.s @@ -0,0 +1,2292 @@ +// Copyright 2009 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. 
+ +#include "zasm_GOOS_GOARCH.h" +#include "funcdata.h" +#include "textflag.h" + +TEXT runtime·rt0_go(SB),NOSPLIT,$0 + // copy arguments forward on an even stack + MOVL argc+0(FP), AX + MOVL argv+4(FP), BX + SUBL $128, SP // plenty of scratch + ANDL $~15, SP + MOVL AX, 120(SP) // save argc, argv away + MOVL BX, 124(SP) + + // set default stack bounds. + // _cgo_init may update stackguard. + MOVL $runtime·g0(SB), BP + LEAL (-64*1024+104)(SP), BX + MOVL BX, g_stackguard0(BP) + MOVL BX, g_stackguard1(BP) + MOVL BX, (g_stack+stack_lo)(BP) + MOVL SP, (g_stack+stack_hi)(BP) + + // find out information about the processor we're on + MOVL $0, AX + CPUID + CMPL AX, $0 + JE nocpuinfo + MOVL $1, AX + CPUID + MOVL CX, runtime·cpuid_ecx(SB) + MOVL DX, runtime·cpuid_edx(SB) +nocpuinfo: + + // if there is an _cgo_init, call it to let it + // initialize and to set up GS. if not, + // we set up GS ourselves. + MOVL _cgo_init(SB), AX + TESTL AX, AX + JZ needtls + MOVL $setg_gcc<>(SB), BX + MOVL BX, 4(SP) + MOVL BP, 0(SP) + CALL AX + + // update stackguard after _cgo_init + MOVL $runtime·g0(SB), CX + MOVL (g_stack+stack_lo)(CX), AX + ADDL $const_StackGuard, AX + MOVL AX, g_stackguard0(CX) + MOVL AX, g_stackguard1(CX) + + // skip runtime·ldt0setup(SB) and tls test after _cgo_init for non-windows + CMPL runtime·iswindows(SB), $0 + JEQ ok +needtls: + // skip runtime·ldt0setup(SB) and tls test on Plan 9 in all cases + CMPL runtime·isplan9(SB), $1 + JEQ ok + + // set up %gs + CALL runtime·ldt0setup(SB) + + // store through it, to make sure it works + get_tls(BX) + MOVL $0x123, g(BX) + MOVL runtime·tls0(SB), AX + CMPL AX, $0x123 + JEQ ok + MOVL AX, 0 // abort +ok: + // set up m and g "registers" + get_tls(BX) + LEAL runtime·g0(SB), CX + MOVL CX, g(BX) + LEAL runtime·m0(SB), AX + + // save m->g0 = g0 + MOVL CX, m_g0(AX) + // save g0->m = m0 + MOVL AX, g_m(CX) + + CALL runtime·emptyfunc(SB) // fault if stack check is wrong + + // convention is D is always cleared + CLD + + CALL runtime·check(SB) + + // saved argc, argv + MOVL 120(SP), AX + MOVL AX, 0(SP) + MOVL 124(SP), AX + MOVL AX, 4(SP) + CALL runtime·args(SB) + CALL runtime·osinit(SB) + CALL runtime·schedinit(SB) + + // create a new goroutine to start program + PUSHL $runtime·main·f(SB) // entry + PUSHL $0 // arg size + CALL runtime·newproc(SB) + POPL AX + POPL AX + + // start this M + CALL runtime·mstart(SB) + + INT $3 + RET + +DATA runtime·main·f+0(SB)/4,$runtime·main(SB) +GLOBL runtime·main·f(SB),RODATA,$4 + +TEXT runtime·breakpoint(SB),NOSPLIT,$0-0 + INT $3 + RET + +TEXT runtime·asminit(SB),NOSPLIT,$0-0 + // Linux and MinGW start the FPU in extended double precision. + // Other operating systems use double precision. + // Change to double precision to match them, + // and to match other hardware that only has double. 
+ PUSHL $0x27F + FLDCW 0(SP) + POPL AX + RET + +/* + * go-routine + */ + +// void gosave(Gobuf*) +// save state in Gobuf; setjmp +TEXT runtime·gosave(SB), NOSPLIT, $0-4 + MOVL buf+0(FP), AX // gobuf + LEAL buf+0(FP), BX // caller's SP + MOVL BX, gobuf_sp(AX) + MOVL 0(SP), BX // caller's PC + MOVL BX, gobuf_pc(AX) + MOVL $0, gobuf_ret(AX) + MOVL $0, gobuf_ctxt(AX) + get_tls(CX) + MOVL g(CX), BX + MOVL BX, gobuf_g(AX) + RET + +// void gogo(Gobuf*) +// restore state from Gobuf; longjmp +TEXT runtime·gogo(SB), NOSPLIT, $0-4 + MOVL buf+0(FP), BX // gobuf + MOVL gobuf_g(BX), DX + MOVL 0(DX), CX // make sure g != nil + get_tls(CX) + MOVL DX, g(CX) + MOVL gobuf_sp(BX), SP // restore SP + MOVL gobuf_ret(BX), AX + MOVL gobuf_ctxt(BX), DX + MOVL $0, gobuf_sp(BX) // clear to help garbage collector + MOVL $0, gobuf_ret(BX) + MOVL $0, gobuf_ctxt(BX) + MOVL gobuf_pc(BX), BX + JMP BX + +// func mcall(fn func(*g)) +// Switch to m->g0's stack, call fn(g). +// Fn must never return. It should gogo(&g->sched) +// to keep running g. +TEXT runtime·mcall(SB), NOSPLIT, $0-4 + MOVL fn+0(FP), DI + + get_tls(CX) + MOVL g(CX), AX // save state in g->sched + MOVL 0(SP), BX // caller's PC + MOVL BX, (g_sched+gobuf_pc)(AX) + LEAL fn+0(FP), BX // caller's SP + MOVL BX, (g_sched+gobuf_sp)(AX) + MOVL AX, (g_sched+gobuf_g)(AX) + + // switch to m->g0 & its stack, call fn + MOVL g(CX), BX + MOVL g_m(BX), BX + MOVL m_g0(BX), SI + CMPL SI, AX // if g == m->g0 call badmcall + JNE 3(PC) + MOVL $runtime·badmcall(SB), AX + JMP AX + MOVL SI, g(CX) // g = m->g0 + MOVL (g_sched+gobuf_sp)(SI), SP // sp = m->g0->sched.sp + PUSHL AX + MOVL DI, DX + MOVL 0(DI), DI + CALL DI + POPL AX + MOVL $runtime·badmcall2(SB), AX + JMP AX + RET + +// switchtoM is a dummy routine that onM leaves at the bottom +// of the G stack. We need to distinguish the routine that +// lives at the bottom of the G stack from the one that lives +// at the top of the M stack because the one at the top of +// the M stack terminates the stack walk (see topofstack()). +TEXT runtime·switchtoM(SB), NOSPLIT, $0-0 + RET + +// func onM_signalok(fn func()) +TEXT runtime·onM_signalok(SB), NOSPLIT, $0-4 + get_tls(CX) + MOVL g(CX), AX // AX = g + MOVL g_m(AX), BX // BX = m + MOVL m_gsignal(BX), DX // DX = gsignal + CMPL AX, DX + JEQ ongsignal + JMP runtime·onM(SB) + +ongsignal: + MOVL fn+0(FP), DI // DI = fn + MOVL DI, DX + MOVL 0(DI), DI + CALL DI + RET + +// func onM(fn func()) +TEXT runtime·onM(SB), NOSPLIT, $0-4 + MOVL fn+0(FP), DI // DI = fn + get_tls(CX) + MOVL g(CX), AX // AX = g + MOVL g_m(AX), BX // BX = m + + MOVL m_g0(BX), DX // DX = g0 + CMPL AX, DX + JEQ onm + + MOVL m_curg(BX), BP + CMPL AX, BP + JEQ oncurg + + // Not g0, not curg. Must be gsignal, but that's not allowed. + // Hide call from linker nosplit analysis. + MOVL $runtime·badonm(SB), AX + CALL AX + +oncurg: + // save our state in g->sched. Pretend to + // be switchtoM if the G stack is scanned. 
+ MOVL $runtime·switchtoM(SB), (g_sched+gobuf_pc)(AX) + MOVL SP, (g_sched+gobuf_sp)(AX) + MOVL AX, (g_sched+gobuf_g)(AX) + + // switch to g0 + MOVL DX, g(CX) + MOVL (g_sched+gobuf_sp)(DX), BX + // make it look like mstart called onM on g0, to stop traceback + SUBL $4, BX + MOVL $runtime·mstart(SB), DX + MOVL DX, 0(BX) + MOVL BX, SP + + // call target function + MOVL DI, DX + MOVL 0(DI), DI + CALL DI + + // switch back to g + get_tls(CX) + MOVL g(CX), AX + MOVL g_m(AX), BX + MOVL m_curg(BX), AX + MOVL AX, g(CX) + MOVL (g_sched+gobuf_sp)(AX), SP + MOVL $0, (g_sched+gobuf_sp)(AX) + RET + +onm: + // already on m stack, just call directly + MOVL DI, DX + MOVL 0(DI), DI + CALL DI + RET + +/* + * support for morestack + */ + +// Called during function prolog when more stack is needed. +// +// The traceback routines see morestack on a g0 as being +// the top of a stack (for example, morestack calling newstack +// calling the scheduler calling newm calling gc), so we must +// record an argument size. For that purpose, it has no arguments. +TEXT runtime·morestack(SB),NOSPLIT,$0-0 + // Cannot grow scheduler stack (m->g0). + get_tls(CX) + MOVL g(CX), BX + MOVL g_m(BX), BX + MOVL m_g0(BX), SI + CMPL g(CX), SI + JNE 2(PC) + INT $3 + + // Cannot grow signal stack. + MOVL m_gsignal(BX), SI + CMPL g(CX), SI + JNE 2(PC) + INT $3 + + // Called from f. + // Set m->morebuf to f's caller. + MOVL 4(SP), DI // f's caller's PC + MOVL DI, (m_morebuf+gobuf_pc)(BX) + LEAL 8(SP), CX // f's caller's SP + MOVL CX, (m_morebuf+gobuf_sp)(BX) + get_tls(CX) + MOVL g(CX), SI + MOVL SI, (m_morebuf+gobuf_g)(BX) + + // Set g->sched to context in f. + MOVL 0(SP), AX // f's PC + MOVL AX, (g_sched+gobuf_pc)(SI) + MOVL SI, (g_sched+gobuf_g)(SI) + LEAL 4(SP), AX // f's SP + MOVL AX, (g_sched+gobuf_sp)(SI) + MOVL DX, (g_sched+gobuf_ctxt)(SI) + + // Call newstack on m->g0's stack. + MOVL m_g0(BX), BP + MOVL BP, g(CX) + MOVL (g_sched+gobuf_sp)(BP), AX + MOVL -4(AX), BX // fault if CALL would, before smashing SP + MOVL AX, SP + CALL runtime·newstack(SB) + MOVL $0, 0x1003 // crash if newstack returns + RET + +TEXT runtime·morestack_noctxt(SB),NOSPLIT,$0-0 + MOVL $0, DX + JMP runtime·morestack(SB) + +// reflectcall: call a function with the given argument list +// func call(f *FuncVal, arg *byte, argsize, retoffset uint32). +// we don't have variable-sized frames, so we use a small number +// of constant-sized-frame functions to encode a few bits of size in the pc. +// Caution: ugly multiline assembly macros in your future! + +#define DISPATCH(NAME,MAXSIZE) \ + CMPL CX, $MAXSIZE; \ + JA 3(PC); \ + MOVL $NAME(SB), AX; \ + JMP AX +// Note: can't just "JMP NAME(SB)" - bad inlining results. 
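The DISPATCH macro just defined encodes the argument-frame size in the program counter by jumping to the smallest fixed-size call stub that fits. A rough Go sketch of that selection logic follows; it is an illustration only, and the function name is made up rather than taken from the patch.

```go
package main

import "fmt"

// nextCallStub returns the smallest of the fixed reflectcall frame sizes
// (16, 32, ..., 1<<30 bytes) that can hold an argument block of n bytes,
// mirroring the CMPL/JA chain in the assembly above.
func nextCallStub(n uint32) (uint32, bool) {
	for size := uint32(16); size <= 1<<30; size <<= 1 {
		if n <= size {
			return size, true
		}
	}
	return 0, false // too large: the assembly falls through to badreflectcall
}

func main() {
	for _, n := range []uint32{1, 16, 17, 5000} {
		s, ok := nextCallStub(n)
		fmt.Println(n, "->", s, ok) // 1 -> 16, 16 -> 16, 17 -> 32, 5000 -> 8192
	}
}
```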
+ +TEXT ·reflectcall(SB), NOSPLIT, $0-16 + MOVL argsize+8(FP), CX + DISPATCH(runtime·call16, 16) + DISPATCH(runtime·call32, 32) + DISPATCH(runtime·call64, 64) + DISPATCH(runtime·call128, 128) + DISPATCH(runtime·call256, 256) + DISPATCH(runtime·call512, 512) + DISPATCH(runtime·call1024, 1024) + DISPATCH(runtime·call2048, 2048) + DISPATCH(runtime·call4096, 4096) + DISPATCH(runtime·call8192, 8192) + DISPATCH(runtime·call16384, 16384) + DISPATCH(runtime·call32768, 32768) + DISPATCH(runtime·call65536, 65536) + DISPATCH(runtime·call131072, 131072) + DISPATCH(runtime·call262144, 262144) + DISPATCH(runtime·call524288, 524288) + DISPATCH(runtime·call1048576, 1048576) + DISPATCH(runtime·call2097152, 2097152) + DISPATCH(runtime·call4194304, 4194304) + DISPATCH(runtime·call8388608, 8388608) + DISPATCH(runtime·call16777216, 16777216) + DISPATCH(runtime·call33554432, 33554432) + DISPATCH(runtime·call67108864, 67108864) + DISPATCH(runtime·call134217728, 134217728) + DISPATCH(runtime·call268435456, 268435456) + DISPATCH(runtime·call536870912, 536870912) + DISPATCH(runtime·call1073741824, 1073741824) + MOVL $runtime·badreflectcall(SB), AX + JMP AX + +#define CALLFN(NAME,MAXSIZE) \ +TEXT NAME(SB), WRAPPER, $MAXSIZE-16; \ + NO_LOCAL_POINTERS; \ + /* copy arguments to stack */ \ + MOVL argptr+4(FP), SI; \ + MOVL argsize+8(FP), CX; \ + MOVL SP, DI; \ + REP;MOVSB; \ + /* call function */ \ + MOVL f+0(FP), DX; \ + MOVL (DX), AX; \ + PCDATA $PCDATA_StackMapIndex, $0; \ + CALL AX; \ + /* copy return values back */ \ + MOVL argptr+4(FP), DI; \ + MOVL argsize+8(FP), CX; \ + MOVL retoffset+12(FP), BX; \ + MOVL SP, SI; \ + ADDL BX, DI; \ + ADDL BX, SI; \ + SUBL BX, CX; \ + REP;MOVSB; \ + RET + +CALLFN(·call16, 16) +CALLFN(·call32, 32) +CALLFN(·call64, 64) +CALLFN(·call128, 128) +CALLFN(·call256, 256) +CALLFN(·call512, 512) +CALLFN(·call1024, 1024) +CALLFN(·call2048, 2048) +CALLFN(·call4096, 4096) +CALLFN(·call8192, 8192) +CALLFN(·call16384, 16384) +CALLFN(·call32768, 32768) +CALLFN(·call65536, 65536) +CALLFN(·call131072, 131072) +CALLFN(·call262144, 262144) +CALLFN(·call524288, 524288) +CALLFN(·call1048576, 1048576) +CALLFN(·call2097152, 2097152) +CALLFN(·call4194304, 4194304) +CALLFN(·call8388608, 8388608) +CALLFN(·call16777216, 16777216) +CALLFN(·call33554432, 33554432) +CALLFN(·call67108864, 67108864) +CALLFN(·call134217728, 134217728) +CALLFN(·call268435456, 268435456) +CALLFN(·call536870912, 536870912) +CALLFN(·call1073741824, 1073741824) + +// bool cas(int32 *val, int32 old, int32 new) +// Atomically: +// if(*val == old){ +// *val = new; +// return 1; +// }else +// return 0; +TEXT runtime·cas(SB), NOSPLIT, $0-13 + MOVL ptr+0(FP), BX + MOVL old+4(FP), AX + MOVL new+8(FP), CX + LOCK + CMPXCHGL CX, 0(BX) + JZ 4(PC) + MOVL $0, AX + MOVB AX, ret+12(FP) + RET + MOVL $1, AX + MOVB AX, ret+12(FP) + RET + +TEXT runtime·casuintptr(SB), NOSPLIT, $0-13 + JMP runtime·cas(SB) + +TEXT runtime·atomicloaduintptr(SB), NOSPLIT, $0-8 + JMP runtime·atomicload(SB) + +TEXT runtime·atomicloaduint(SB), NOSPLIT, $0-8 + JMP runtime·atomicload(SB) + +TEXT runtime·atomicstoreuintptr(SB), NOSPLIT, $0-8 + JMP runtime·atomicstore(SB) + +// bool runtime·cas64(uint64 *val, uint64 old, uint64 new) +// Atomically: +// if(*val == *old){ +// *val = new; +// return 1; +// } else { +// return 0; +// } +TEXT runtime·cas64(SB), NOSPLIT, $0-21 + MOVL ptr+0(FP), BP + MOVL old_lo+4(FP), AX + MOVL old_hi+8(FP), DX + MOVL new_lo+12(FP), BX + MOVL new_hi+16(FP), CX + LOCK + CMPXCHG8B 0(BP) + JNZ cas64_fail + MOVL $1, AX + MOVB AX, ret+20(FP) + RET 
+cas64_fail: + MOVL $0, AX + MOVB AX, ret+20(FP) + RET + +// bool casp(void **p, void *old, void *new) +// Atomically: +// if(*p == old){ +// *p = new; +// return 1; +// }else +// return 0; +TEXT runtime·casp(SB), NOSPLIT, $0-13 + MOVL ptr+0(FP), BX + MOVL old+4(FP), AX + MOVL new+8(FP), CX + LOCK + CMPXCHGL CX, 0(BX) + JZ 4(PC) + MOVL $0, AX + MOVB AX, ret+12(FP) + RET + MOVL $1, AX + MOVB AX, ret+12(FP) + RET + +// uint32 xadd(uint32 volatile *val, int32 delta) +// Atomically: +// *val += delta; +// return *val; +TEXT runtime·xadd(SB), NOSPLIT, $0-12 + MOVL ptr+0(FP), BX + MOVL delta+4(FP), AX + MOVL AX, CX + LOCK + XADDL AX, 0(BX) + ADDL CX, AX + MOVL AX, ret+8(FP) + RET + +TEXT runtime·xchg(SB), NOSPLIT, $0-12 + MOVL ptr+0(FP), BX + MOVL new+4(FP), AX + XCHGL AX, 0(BX) + MOVL AX, ret+8(FP) + RET + +TEXT runtime·xchgp(SB), NOSPLIT, $0-12 + MOVL ptr+0(FP), BX + MOVL new+4(FP), AX + XCHGL AX, 0(BX) + MOVL AX, ret+8(FP) + RET + +TEXT runtime·xchguintptr(SB), NOSPLIT, $0-12 + JMP runtime·xchg(SB) + +TEXT runtime·procyield(SB),NOSPLIT,$0-0 + MOVL cycles+0(FP), AX +again: + PAUSE + SUBL $1, AX + JNZ again + RET + +TEXT runtime·atomicstorep(SB), NOSPLIT, $0-8 + MOVL ptr+0(FP), BX + MOVL val+4(FP), AX + XCHGL AX, 0(BX) + RET + +TEXT runtime·atomicstore(SB), NOSPLIT, $0-8 + MOVL ptr+0(FP), BX + MOVL val+4(FP), AX + XCHGL AX, 0(BX) + RET + +// uint64 atomicload64(uint64 volatile* addr); +TEXT runtime·atomicload64(SB), NOSPLIT, $0-12 + MOVL ptr+0(FP), AX + LEAL ret_lo+4(FP), BX + // MOVQ (%EAX), %MM0 + BYTE $0x0f; BYTE $0x6f; BYTE $0x00 + // MOVQ %MM0, 0(%EBX) + BYTE $0x0f; BYTE $0x7f; BYTE $0x03 + // EMMS + BYTE $0x0F; BYTE $0x77 + RET + +// void runtime·atomicstore64(uint64 volatile* addr, uint64 v); +TEXT runtime·atomicstore64(SB), NOSPLIT, $0-12 + MOVL ptr+0(FP), AX + // MOVQ and EMMS were introduced on the Pentium MMX. + // MOVQ 0x8(%ESP), %MM0 + BYTE $0x0f; BYTE $0x6f; BYTE $0x44; BYTE $0x24; BYTE $0x08 + // MOVQ %MM0, (%EAX) + BYTE $0x0f; BYTE $0x7f; BYTE $0x00 + // EMMS + BYTE $0x0F; BYTE $0x77 + // This is essentially a no-op, but it provides required memory fencing. + // It can be replaced with MFENCE, but MFENCE was introduced only on the Pentium4 (SSE2). + MOVL $0, AX + LOCK + XADDL AX, (SP) + RET + +// void runtime·atomicor8(byte volatile*, byte); +TEXT runtime·atomicor8(SB), NOSPLIT, $0-5 + MOVL ptr+0(FP), AX + MOVB val+4(FP), BX + LOCK + ORB BX, (AX) + RET + +// void jmpdefer(fn, sp); +// called from deferreturn. +// 1. pop the caller +// 2. sub 5 bytes from the callers return +// 3. jmp to the argument +TEXT runtime·jmpdefer(SB), NOSPLIT, $0-8 + MOVL fv+0(FP), DX // fn + MOVL argp+4(FP), BX // caller sp + LEAL -4(BX), SP // caller sp after CALL + SUBL $5, (SP) // return to CALL again + MOVL 0(DX), BX + JMP BX // but first run the deferred function + +// Save state of caller into g->sched. +TEXT gosave<>(SB),NOSPLIT,$0 + PUSHL AX + PUSHL BX + get_tls(BX) + MOVL g(BX), BX + LEAL arg+0(FP), AX + MOVL AX, (g_sched+gobuf_sp)(BX) + MOVL -4(AX), AX + MOVL AX, (g_sched+gobuf_pc)(BX) + MOVL $0, (g_sched+gobuf_ret)(BX) + MOVL $0, (g_sched+gobuf_ctxt)(BX) + POPL BX + POPL AX + RET + +// asmcgocall(void(*fn)(void*), void *arg) +// Call fn(arg) on the scheduler stack, +// aligned appropriately for the gcc ABI. +// See cgocall.c for more details. 
+TEXT ·asmcgocall(SB),NOSPLIT,$0-8 + MOVL fn+0(FP), AX + MOVL arg+4(FP), BX + CALL asmcgocall<>(SB) + RET + +TEXT ·asmcgocall_errno(SB),NOSPLIT,$0-12 + MOVL fn+0(FP), AX + MOVL arg+4(FP), BX + CALL asmcgocall<>(SB) + MOVL AX, ret+8(FP) + RET + +TEXT asmcgocall<>(SB),NOSPLIT,$0-0 + // fn in AX, arg in BX + MOVL SP, DX + + // Figure out if we need to switch to m->g0 stack. + // We get called to create new OS threads too, and those + // come in on the m->g0 stack already. + get_tls(CX) + MOVL g(CX), BP + MOVL g_m(BP), BP + MOVL m_g0(BP), SI + MOVL g(CX), DI + CMPL SI, DI + JEQ 4(PC) + CALL gosave<>(SB) + MOVL SI, g(CX) + MOVL (g_sched+gobuf_sp)(SI), SP + + // Now on a scheduling stack (a pthread-created stack). + SUBL $32, SP + ANDL $~15, SP // alignment, perhaps unnecessary + MOVL DI, 8(SP) // save g + MOVL (g_stack+stack_hi)(DI), DI + SUBL DX, DI + MOVL DI, 4(SP) // save depth in stack (can't just save SP, as stack might be copied during a callback) + MOVL BX, 0(SP) // first argument in x86-32 ABI + CALL AX + + // Restore registers, g, stack pointer. + get_tls(CX) + MOVL 8(SP), DI + MOVL (g_stack+stack_hi)(DI), SI + SUBL 4(SP), SI + MOVL DI, g(CX) + MOVL SI, SP + RET + +// cgocallback(void (*fn)(void*), void *frame, uintptr framesize) +// Turn the fn into a Go func (by taking its address) and call +// cgocallback_gofunc. +TEXT runtime·cgocallback(SB),NOSPLIT,$12-12 + LEAL fn+0(FP), AX + MOVL AX, 0(SP) + MOVL frame+4(FP), AX + MOVL AX, 4(SP) + MOVL framesize+8(FP), AX + MOVL AX, 8(SP) + MOVL $runtime·cgocallback_gofunc(SB), AX + CALL AX + RET + +// cgocallback_gofunc(FuncVal*, void *frame, uintptr framesize) +// See cgocall.c for more details. +TEXT ·cgocallback_gofunc(SB),NOSPLIT,$12-12 + NO_LOCAL_POINTERS + + // If g is nil, Go did not create the current thread. + // Call needm to obtain one for temporary use. + // In this case, we're running on the thread stack, so there's + // lots of space, but the linker doesn't know. Hide the call from + // the linker analysis by using an indirect call through AX. + get_tls(CX) +#ifdef GOOS_windows + MOVL $0, BP + CMPL CX, $0 + JEQ 2(PC) // TODO +#endif + MOVL g(CX), BP + CMPL BP, $0 + JEQ needm + MOVL g_m(BP), BP + MOVL BP, DX // saved copy of oldm + JMP havem +needm: + MOVL $0, 0(SP) + MOVL $runtime·needm(SB), AX + CALL AX + MOVL 0(SP), DX + get_tls(CX) + MOVL g(CX), BP + MOVL g_m(BP), BP + + // Set m->sched.sp = SP, so that if a panic happens + // during the function we are about to execute, it will + // have a valid SP to run on the g0 stack. + // The next few lines (after the havem label) + // will save this SP onto the stack and then write + // the same SP back to m->sched.sp. That seems redundant, + // but if an unrecovered panic happens, unwindm will + // restore the g->sched.sp from the stack location + // and then onM will try to use it. If we don't set it here, + // that restored SP will be uninitialized (typically 0) and + // will not be usable. + MOVL m_g0(BP), SI + MOVL SP, (g_sched+gobuf_sp)(SI) + +havem: + // Now there's a valid m, and we're running on its m->g0. + // Save current m->g0->sched.sp on stack and then set it to SP. + // Save current sp in m->g0->sched.sp in preparation for + // switch back to m->curg stack. + // NOTE: unwindm knows that the saved g->sched.sp is at 0(SP). + MOVL m_g0(BP), SI + MOVL (g_sched+gobuf_sp)(SI), AX + MOVL AX, 0(SP) + MOVL SP, (g_sched+gobuf_sp)(SI) + + // Switch to m->curg stack and call runtime.cgocallbackg. 
+ // Because we are taking over the execution of m->curg + // but *not* resuming what had been running, we need to + // save that information (m->curg->sched) so we can restore it. + // We can restore m->curg->sched.sp easily, because calling + // runtime.cgocallbackg leaves SP unchanged upon return. + // To save m->curg->sched.pc, we push it onto the stack. + // This has the added benefit that it looks to the traceback + // routine like cgocallbackg is going to return to that + // PC (because the frame we allocate below has the same + // size as cgocallback_gofunc's frame declared above) + // so that the traceback will seamlessly trace back into + // the earlier calls. + // + // In the new goroutine, 0(SP) holds the saved oldm (DX) register. + // 4(SP) and 8(SP) are unused. + MOVL m_curg(BP), SI + MOVL SI, g(CX) + MOVL (g_sched+gobuf_sp)(SI), DI // prepare stack as DI + MOVL (g_sched+gobuf_pc)(SI), BP + MOVL BP, -4(DI) + LEAL -(4+12)(DI), SP + MOVL DX, 0(SP) + CALL runtime·cgocallbackg(SB) + MOVL 0(SP), DX + + // Restore g->sched (== m->curg->sched) from saved values. + get_tls(CX) + MOVL g(CX), SI + MOVL 12(SP), BP + MOVL BP, (g_sched+gobuf_pc)(SI) + LEAL (12+4)(SP), DI + MOVL DI, (g_sched+gobuf_sp)(SI) + + // Switch back to m->g0's stack and restore m->g0->sched.sp. + // (Unlike m->curg, the g0 goroutine never uses sched.pc, + // so we do not have to restore it.) + MOVL g(CX), BP + MOVL g_m(BP), BP + MOVL m_g0(BP), SI + MOVL SI, g(CX) + MOVL (g_sched+gobuf_sp)(SI), SP + MOVL 0(SP), AX + MOVL AX, (g_sched+gobuf_sp)(SI) + + // If the m on entry was nil, we called needm above to borrow an m + // for the duration of the call. Since the call is over, return it with dropm. + CMPL DX, $0 + JNE 3(PC) + MOVL $runtime·dropm(SB), AX + CALL AX + + // Done! + RET + +// void setg(G*); set g. for use by needm. +TEXT runtime·setg(SB), NOSPLIT, $0-4 + MOVL gg+0(FP), BX +#ifdef GOOS_windows + CMPL BX, $0 + JNE settls + MOVL $0, 0x14(FS) + RET +settls: + MOVL g_m(BX), AX + LEAL m_tls(AX), AX + MOVL AX, 0x14(FS) +#endif + get_tls(CX) + MOVL BX, g(CX) + RET + +// void setg_gcc(G*); set g. for use by gcc +TEXT setg_gcc<>(SB), NOSPLIT, $0 + get_tls(AX) + MOVL gg+0(FP), DX + MOVL DX, g(AX) + RET + +// check that SP is in range [g->stack.lo, g->stack.hi) +TEXT runtime·stackcheck(SB), NOSPLIT, $0-0 + get_tls(CX) + MOVL g(CX), AX + CMPL (g_stack+stack_hi)(AX), SP + JHI 2(PC) + INT $3 + CMPL SP, (g_stack+stack_lo)(AX) + JHI 2(PC) + INT $3 + RET + +TEXT runtime·getcallerpc(SB),NOSPLIT,$0-8 + MOVL argp+0(FP),AX // addr of first arg + MOVL -4(AX),AX // get calling pc + MOVL AX, ret+4(FP) + RET + +TEXT runtime·gogetcallerpc(SB),NOSPLIT,$0-8 + MOVL p+0(FP),AX // addr of first arg + MOVL -4(AX),AX // get calling pc + MOVL AX, ret+4(FP) + RET + +TEXT runtime·setcallerpc(SB),NOSPLIT,$0-8 + MOVL argp+0(FP),AX // addr of first arg + MOVL pc+4(FP), BX + MOVL BX, -4(AX) // set calling pc + RET + +TEXT runtime·getcallersp(SB), NOSPLIT, $0-8 + MOVL argp+0(FP), AX + MOVL AX, ret+4(FP) + RET + +// func gogetcallersp(p unsafe.Pointer) uintptr +TEXT runtime·gogetcallersp(SB),NOSPLIT,$0-8 + MOVL p+0(FP),AX // addr of first arg + MOVL AX, ret+4(FP) + RET + +// int64 runtime·cputicks(void), so really +// void runtime·cputicks(int64 *ticks) +TEXT runtime·cputicks(SB),NOSPLIT,$0-8 + RDTSC + MOVL AX, ret_lo+0(FP) + MOVL DX, ret_hi+4(FP) + RET + +TEXT runtime·ldt0setup(SB),NOSPLIT,$16-0 + // set up ldt 7 to point at tls0 + // ldt 1 would be fine on Linux, but on OS X, 7 is as low as we can go. + // the entry number is just a hint. 
setldt will set up GS with what it used. + MOVL $7, 0(SP) + LEAL runtime·tls0(SB), AX + MOVL AX, 4(SP) + MOVL $32, 8(SP) // sizeof(tls array) + CALL runtime·setldt(SB) + RET + +TEXT runtime·emptyfunc(SB),0,$0-0 + RET + +TEXT runtime·abort(SB),NOSPLIT,$0-0 + INT $0x3 + +// hash function using AES hardware instructions +TEXT runtime·aeshash(SB),NOSPLIT,$0-16 + MOVL p+0(FP), AX // ptr to data + MOVL s+4(FP), CX // size + JMP runtime·aeshashbody(SB) + +TEXT runtime·aeshashstr(SB),NOSPLIT,$0-16 + MOVL p+0(FP), AX // ptr to string object + // s+4(FP) is ignored, it is always sizeof(String) + MOVL 4(AX), CX // length of string + MOVL (AX), AX // string data + JMP runtime·aeshashbody(SB) + +// AX: data +// CX: length +TEXT runtime·aeshashbody(SB),NOSPLIT,$0-16 + MOVL h+8(FP), X0 // seed to low 32 bits of xmm0 + PINSRD $1, CX, X0 // size to next 32 bits of xmm0 + MOVO runtime·aeskeysched+0(SB), X2 + MOVO runtime·aeskeysched+16(SB), X3 + CMPL CX, $16 + JB aessmall +aesloop: + CMPL CX, $16 + JBE aesloopend + MOVOU (AX), X1 + AESENC X2, X0 + AESENC X1, X0 + SUBL $16, CX + ADDL $16, AX + JMP aesloop +// 1-16 bytes remaining +aesloopend: + // This load may overlap with the previous load above. + // We'll hash some bytes twice, but that's ok. + MOVOU -16(AX)(CX*1), X1 + JMP partial +// 0-15 bytes +aessmall: + TESTL CX, CX + JE finalize // 0 bytes + + CMPB AX, $0xf0 + JA highpartial + + // 16 bytes loaded at this address won't cross + // a page boundary, so we can load it directly. + MOVOU (AX), X1 + ADDL CX, CX + PAND masks<>(SB)(CX*8), X1 + JMP partial +highpartial: + // address ends in 1111xxxx. Might be up against + // a page boundary, so load ending at last byte. + // Then shift bytes down using pshufb. + MOVOU -16(AX)(CX*1), X1 + ADDL CX, CX + PSHUFB shifts<>(SB)(CX*8), X1 +partial: + // incorporate partial block into hash + AESENC X3, X0 + AESENC X1, X0 +finalize: + // finalize hash + AESENC X2, X0 + AESENC X3, X0 + AESENC X2, X0 + MOVL X0, ret+12(FP) + RET + +TEXT runtime·aeshash32(SB),NOSPLIT,$0-16 + MOVL p+0(FP), AX // ptr to data + // s+4(FP) is ignored, it is always sizeof(int32) + MOVL h+8(FP), X0 // seed + PINSRD $1, (AX), X0 // data + AESENC runtime·aeskeysched+0(SB), X0 + AESENC runtime·aeskeysched+16(SB), X0 + AESENC runtime·aeskeysched+0(SB), X0 + MOVL X0, ret+12(FP) + RET + +TEXT runtime·aeshash64(SB),NOSPLIT,$0-16 + MOVL p+0(FP), AX // ptr to data + // s+4(FP) is ignored, it is always sizeof(int64) + MOVQ (AX), X0 // data + PINSRD $2, h+8(FP), X0 // seed + AESENC runtime·aeskeysched+0(SB), X0 + AESENC runtime·aeskeysched+16(SB), X0 + AESENC runtime·aeskeysched+0(SB), X0 + MOVL X0, ret+12(FP) + RET + +// simple mask to get rid of data in the high part of the register. 
+DATA masks<>+0x00(SB)/4, $0x00000000 +DATA masks<>+0x04(SB)/4, $0x00000000 +DATA masks<>+0x08(SB)/4, $0x00000000 +DATA masks<>+0x0c(SB)/4, $0x00000000 + +DATA masks<>+0x10(SB)/4, $0x000000ff +DATA masks<>+0x14(SB)/4, $0x00000000 +DATA masks<>+0x18(SB)/4, $0x00000000 +DATA masks<>+0x1c(SB)/4, $0x00000000 + +DATA masks<>+0x20(SB)/4, $0x0000ffff +DATA masks<>+0x24(SB)/4, $0x00000000 +DATA masks<>+0x28(SB)/4, $0x00000000 +DATA masks<>+0x2c(SB)/4, $0x00000000 + +DATA masks<>+0x30(SB)/4, $0x00ffffff +DATA masks<>+0x34(SB)/4, $0x00000000 +DATA masks<>+0x38(SB)/4, $0x00000000 +DATA masks<>+0x3c(SB)/4, $0x00000000 + +DATA masks<>+0x40(SB)/4, $0xffffffff +DATA masks<>+0x44(SB)/4, $0x00000000 +DATA masks<>+0x48(SB)/4, $0x00000000 +DATA masks<>+0x4c(SB)/4, $0x00000000 + +DATA masks<>+0x50(SB)/4, $0xffffffff +DATA masks<>+0x54(SB)/4, $0x000000ff +DATA masks<>+0x58(SB)/4, $0x00000000 +DATA masks<>+0x5c(SB)/4, $0x00000000 + +DATA masks<>+0x60(SB)/4, $0xffffffff +DATA masks<>+0x64(SB)/4, $0x0000ffff +DATA masks<>+0x68(SB)/4, $0x00000000 +DATA masks<>+0x6c(SB)/4, $0x00000000 + +DATA masks<>+0x70(SB)/4, $0xffffffff +DATA masks<>+0x74(SB)/4, $0x00ffffff +DATA masks<>+0x78(SB)/4, $0x00000000 +DATA masks<>+0x7c(SB)/4, $0x00000000 + +DATA masks<>+0x80(SB)/4, $0xffffffff +DATA masks<>+0x84(SB)/4, $0xffffffff +DATA masks<>+0x88(SB)/4, $0x00000000 +DATA masks<>+0x8c(SB)/4, $0x00000000 + +DATA masks<>+0x90(SB)/4, $0xffffffff +DATA masks<>+0x94(SB)/4, $0xffffffff +DATA masks<>+0x98(SB)/4, $0x000000ff +DATA masks<>+0x9c(SB)/4, $0x00000000 + +DATA masks<>+0xa0(SB)/4, $0xffffffff +DATA masks<>+0xa4(SB)/4, $0xffffffff +DATA masks<>+0xa8(SB)/4, $0x0000ffff +DATA masks<>+0xac(SB)/4, $0x00000000 + +DATA masks<>+0xb0(SB)/4, $0xffffffff +DATA masks<>+0xb4(SB)/4, $0xffffffff +DATA masks<>+0xb8(SB)/4, $0x00ffffff +DATA masks<>+0xbc(SB)/4, $0x00000000 + +DATA masks<>+0xc0(SB)/4, $0xffffffff +DATA masks<>+0xc4(SB)/4, $0xffffffff +DATA masks<>+0xc8(SB)/4, $0xffffffff +DATA masks<>+0xcc(SB)/4, $0x00000000 + +DATA masks<>+0xd0(SB)/4, $0xffffffff +DATA masks<>+0xd4(SB)/4, $0xffffffff +DATA masks<>+0xd8(SB)/4, $0xffffffff +DATA masks<>+0xdc(SB)/4, $0x000000ff + +DATA masks<>+0xe0(SB)/4, $0xffffffff +DATA masks<>+0xe4(SB)/4, $0xffffffff +DATA masks<>+0xe8(SB)/4, $0xffffffff +DATA masks<>+0xec(SB)/4, $0x0000ffff + +DATA masks<>+0xf0(SB)/4, $0xffffffff +DATA masks<>+0xf4(SB)/4, $0xffffffff +DATA masks<>+0xf8(SB)/4, $0xffffffff +DATA masks<>+0xfc(SB)/4, $0x00ffffff + +GLOBL masks<>(SB),RODATA,$256 + +// these are arguments to pshufb. They move data down from +// the high bytes of the register to the low bytes of the register. +// index is how many bytes to move. 
+DATA shifts<>+0x00(SB)/4, $0x00000000 +DATA shifts<>+0x04(SB)/4, $0x00000000 +DATA shifts<>+0x08(SB)/4, $0x00000000 +DATA shifts<>+0x0c(SB)/4, $0x00000000 + +DATA shifts<>+0x10(SB)/4, $0xffffff0f +DATA shifts<>+0x14(SB)/4, $0xffffffff +DATA shifts<>+0x18(SB)/4, $0xffffffff +DATA shifts<>+0x1c(SB)/4, $0xffffffff + +DATA shifts<>+0x20(SB)/4, $0xffff0f0e +DATA shifts<>+0x24(SB)/4, $0xffffffff +DATA shifts<>+0x28(SB)/4, $0xffffffff +DATA shifts<>+0x2c(SB)/4, $0xffffffff + +DATA shifts<>+0x30(SB)/4, $0xff0f0e0d +DATA shifts<>+0x34(SB)/4, $0xffffffff +DATA shifts<>+0x38(SB)/4, $0xffffffff +DATA shifts<>+0x3c(SB)/4, $0xffffffff + +DATA shifts<>+0x40(SB)/4, $0x0f0e0d0c +DATA shifts<>+0x44(SB)/4, $0xffffffff +DATA shifts<>+0x48(SB)/4, $0xffffffff +DATA shifts<>+0x4c(SB)/4, $0xffffffff + +DATA shifts<>+0x50(SB)/4, $0x0e0d0c0b +DATA shifts<>+0x54(SB)/4, $0xffffff0f +DATA shifts<>+0x58(SB)/4, $0xffffffff +DATA shifts<>+0x5c(SB)/4, $0xffffffff + +DATA shifts<>+0x60(SB)/4, $0x0d0c0b0a +DATA shifts<>+0x64(SB)/4, $0xffff0f0e +DATA shifts<>+0x68(SB)/4, $0xffffffff +DATA shifts<>+0x6c(SB)/4, $0xffffffff + +DATA shifts<>+0x70(SB)/4, $0x0c0b0a09 +DATA shifts<>+0x74(SB)/4, $0xff0f0e0d +DATA shifts<>+0x78(SB)/4, $0xffffffff +DATA shifts<>+0x7c(SB)/4, $0xffffffff + +DATA shifts<>+0x80(SB)/4, $0x0b0a0908 +DATA shifts<>+0x84(SB)/4, $0x0f0e0d0c +DATA shifts<>+0x88(SB)/4, $0xffffffff +DATA shifts<>+0x8c(SB)/4, $0xffffffff + +DATA shifts<>+0x90(SB)/4, $0x0a090807 +DATA shifts<>+0x94(SB)/4, $0x0e0d0c0b +DATA shifts<>+0x98(SB)/4, $0xffffff0f +DATA shifts<>+0x9c(SB)/4, $0xffffffff + +DATA shifts<>+0xa0(SB)/4, $0x09080706 +DATA shifts<>+0xa4(SB)/4, $0x0d0c0b0a +DATA shifts<>+0xa8(SB)/4, $0xffff0f0e +DATA shifts<>+0xac(SB)/4, $0xffffffff + +DATA shifts<>+0xb0(SB)/4, $0x08070605 +DATA shifts<>+0xb4(SB)/4, $0x0c0b0a09 +DATA shifts<>+0xb8(SB)/4, $0xff0f0e0d +DATA shifts<>+0xbc(SB)/4, $0xffffffff + +DATA shifts<>+0xc0(SB)/4, $0x07060504 +DATA shifts<>+0xc4(SB)/4, $0x0b0a0908 +DATA shifts<>+0xc8(SB)/4, $0x0f0e0d0c +DATA shifts<>+0xcc(SB)/4, $0xffffffff + +DATA shifts<>+0xd0(SB)/4, $0x06050403 +DATA shifts<>+0xd4(SB)/4, $0x0a090807 +DATA shifts<>+0xd8(SB)/4, $0x0e0d0c0b +DATA shifts<>+0xdc(SB)/4, $0xffffff0f + +DATA shifts<>+0xe0(SB)/4, $0x05040302 +DATA shifts<>+0xe4(SB)/4, $0x09080706 +DATA shifts<>+0xe8(SB)/4, $0x0d0c0b0a +DATA shifts<>+0xec(SB)/4, $0xffff0f0e + +DATA shifts<>+0xf0(SB)/4, $0x04030201 +DATA shifts<>+0xf4(SB)/4, $0x08070605 +DATA shifts<>+0xf8(SB)/4, $0x0c0b0a09 +DATA shifts<>+0xfc(SB)/4, $0xff0f0e0d + +GLOBL shifts<>(SB),RODATA,$256 + +TEXT runtime·memeq(SB),NOSPLIT,$0-13 + MOVL a+0(FP), SI + MOVL b+4(FP), DI + MOVL size+8(FP), BX + CALL runtime·memeqbody(SB) + MOVB AX, ret+12(FP) + RET + +// eqstring tests whether two strings are equal. +// See runtime_test.go:eqstring_generic for +// equivalent Go code. 
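The comment above points at runtime_test.go:eqstring_generic for the Go equivalent of the eqstring routine that follows. For readers who do not want to chase that reference, here is a hedged sketch of such a generic comparison (length check first, then byte by byte); the assembly additionally returns early when both strings share the same data pointer.

```go
package main

import "fmt"

// eqstringGeneric is a sketch of the comparison the assembly implements:
// reject on unequal length, then compare byte by byte.
func eqstringGeneric(s1, s2 string) bool {
	if len(s1) != len(s2) {
		return false
	}
	for i := 0; i < len(s1); i++ {
		if s1[i] != s2[i] {
			return false
		}
	}
	return true
}

func main() {
	fmt.Println(eqstringGeneric("go", "go"), eqstringGeneric("go", "gc")) // true false
}
```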
+TEXT runtime·eqstring(SB),NOSPLIT,$0-17 + MOVL s1len+4(FP), AX + MOVL s2len+12(FP), BX + CMPL AX, BX + JNE different + MOVL s1str+0(FP), SI + MOVL s2str+8(FP), DI + CMPL SI, DI + JEQ same + CALL runtime·memeqbody(SB) + MOVB AX, v+16(FP) + RET +same: + MOVB $1, v+16(FP) + RET +different: + MOVB $0, v+16(FP) + RET + +TEXT bytes·Equal(SB),NOSPLIT,$0-25 + MOVL a_len+4(FP), BX + MOVL b_len+16(FP), CX + XORL AX, AX + CMPL BX, CX + JNE eqret + MOVL a+0(FP), SI + MOVL b+12(FP), DI + CALL runtime·memeqbody(SB) +eqret: + MOVB AX, ret+24(FP) + RET + +// a in SI +// b in DI +// count in BX +TEXT runtime·memeqbody(SB),NOSPLIT,$0-0 + XORL AX, AX + + CMPL BX, $4 + JB small + + // 64 bytes at a time using xmm registers +hugeloop: + CMPL BX, $64 + JB bigloop + TESTL $0x4000000, runtime·cpuid_edx(SB) // check for sse2 + JE bigloop + MOVOU (SI), X0 + MOVOU (DI), X1 + MOVOU 16(SI), X2 + MOVOU 16(DI), X3 + MOVOU 32(SI), X4 + MOVOU 32(DI), X5 + MOVOU 48(SI), X6 + MOVOU 48(DI), X7 + PCMPEQB X1, X0 + PCMPEQB X3, X2 + PCMPEQB X5, X4 + PCMPEQB X7, X6 + PAND X2, X0 + PAND X6, X4 + PAND X4, X0 + PMOVMSKB X0, DX + ADDL $64, SI + ADDL $64, DI + SUBL $64, BX + CMPL DX, $0xffff + JEQ hugeloop + RET + + // 4 bytes at a time using 32-bit register +bigloop: + CMPL BX, $4 + JBE leftover + MOVL (SI), CX + MOVL (DI), DX + ADDL $4, SI + ADDL $4, DI + SUBL $4, BX + CMPL CX, DX + JEQ bigloop + RET + + // remaining 0-4 bytes +leftover: + MOVL -4(SI)(BX*1), CX + MOVL -4(DI)(BX*1), DX + CMPL CX, DX + SETEQ AX + RET + +small: + CMPL BX, $0 + JEQ equal + + LEAL 0(BX*8), CX + NEGL CX + + MOVL SI, DX + CMPB DX, $0xfc + JA si_high + + // load at SI won't cross a page boundary. + MOVL (SI), SI + JMP si_finish +si_high: + // address ends in 111111xx. Load up to bytes we want, move to correct position. + MOVL -4(SI)(BX*1), SI + SHRL CX, SI +si_finish: + + // same for DI. 
+ MOVL DI, DX + CMPB DX, $0xfc + JA di_high + MOVL (DI), DI + JMP di_finish +di_high: + MOVL -4(DI)(BX*1), DI + SHRL CX, DI +di_finish: + + SUBL SI, DI + SHLL CX, DI +equal: + SETEQ AX + RET + +TEXT runtime·cmpstring(SB),NOSPLIT,$0-20 + MOVL s1_base+0(FP), SI + MOVL s1_len+4(FP), BX + MOVL s2_base+8(FP), DI + MOVL s2_len+12(FP), DX + CALL runtime·cmpbody(SB) + MOVL AX, ret+16(FP) + RET + +TEXT runtime·cmpbytes(SB),NOSPLIT,$0-28 + MOVL s1+0(FP), SI + MOVL s1+4(FP), BX + MOVL s2+12(FP), DI + MOVL s2+16(FP), DX + CALL runtime·cmpbody(SB) + MOVL AX, ret+24(FP) + RET + +TEXT bytes·IndexByte(SB),NOSPLIT,$0 + MOVL s+0(FP), SI + MOVL s_len+4(FP), CX + MOVB c+12(FP), AL + MOVL SI, DI + CLD; REPN; SCASB + JZ 3(PC) + MOVL $-1, ret+16(FP) + RET + SUBL SI, DI + SUBL $1, DI + MOVL DI, ret+16(FP) + RET + +TEXT strings·IndexByte(SB),NOSPLIT,$0 + MOVL s+0(FP), SI + MOVL s_len+4(FP), CX + MOVB c+8(FP), AL + MOVL SI, DI + CLD; REPN; SCASB + JZ 3(PC) + MOVL $-1, ret+12(FP) + RET + SUBL SI, DI + SUBL $1, DI + MOVL DI, ret+12(FP) + RET + +// input: +// SI = a +// DI = b +// BX = alen +// DX = blen +// output: +// AX = 1/0/-1 +TEXT runtime·cmpbody(SB),NOSPLIT,$0-0 + CMPL SI, DI + JEQ cmp_allsame + CMPL BX, DX + MOVL DX, BP + CMOVLLT BX, BP // BP = min(alen, blen) + CMPL BP, $4 + JB cmp_small + TESTL $0x4000000, runtime·cpuid_edx(SB) // check for sse2 + JE cmp_mediumloop +cmp_largeloop: + CMPL BP, $16 + JB cmp_mediumloop + MOVOU (SI), X0 + MOVOU (DI), X1 + PCMPEQB X0, X1 + PMOVMSKB X1, AX + XORL $0xffff, AX // convert EQ to NE + JNE cmp_diff16 // branch if at least one byte is not equal + ADDL $16, SI + ADDL $16, DI + SUBL $16, BP + JMP cmp_largeloop + +cmp_diff16: + BSFL AX, BX // index of first byte that differs + XORL AX, AX + MOVB (SI)(BX*1), CX + CMPB CX, (DI)(BX*1) + SETHI AX + LEAL -1(AX*2), AX // convert 1/0 to +1/-1 + RET + +cmp_mediumloop: + CMPL BP, $4 + JBE cmp_0through4 + MOVL (SI), AX + MOVL (DI), CX + CMPL AX, CX + JNE cmp_diff4 + ADDL $4, SI + ADDL $4, DI + SUBL $4, BP + JMP cmp_mediumloop + +cmp_0through4: + MOVL -4(SI)(BP*1), AX + MOVL -4(DI)(BP*1), CX + CMPL AX, CX + JEQ cmp_allsame + +cmp_diff4: + BSWAPL AX // reverse order of bytes + BSWAPL CX + XORL AX, CX // find bit differences + BSRL CX, CX // index of highest bit difference + SHRL CX, AX // move a's bit to bottom + ANDL $1, AX // mask bit + LEAL -1(AX*2), AX // 1/0 => +1/-1 + RET + + // 0-3 bytes in common +cmp_small: + LEAL (BP*8), CX + NEGL CX + JEQ cmp_allsame + + // load si + CMPB SI, $0xfc + JA cmp_si_high + MOVL (SI), SI + JMP cmp_si_finish +cmp_si_high: + MOVL -4(SI)(BP*1), SI + SHRL CX, SI +cmp_si_finish: + SHLL CX, SI + + // same for di + CMPB DI, $0xfc + JA cmp_di_high + MOVL (DI), DI + JMP cmp_di_finish +cmp_di_high: + MOVL -4(DI)(BP*1), DI + SHRL CX, DI +cmp_di_finish: + SHLL CX, DI + + BSWAPL SI // reverse order of bytes + BSWAPL DI + XORL SI, DI // find bit differences + JEQ cmp_allsame + BSRL DI, CX // index of highest bit difference + SHRL CX, SI // move a's bit to bottom + ANDL $1, SI // mask bit + LEAL -1(SI*2), AX // 1/0 => +1/-1 + RET + + // all the bytes in common are the same, so we just need + // to compare the lengths. +cmp_allsame: + XORL AX, AX + XORL CX, CX + CMPL BX, DX + SETGT AX // 1 if alen > blen + SETEQ CX // 1 if alen == blen + LEAL -1(CX)(AX*2), AX // 1,0,-1 result + RET + +// A Duff's device for zeroing memory. +// The compiler jumps to computed addresses within +// this routine to zero chunks of memory. Do not +// change this code without also changing the code +// in ../../cmd/8g/ggen.c:clearfat. 
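runtime·duffzero below is a classic Duff's device: the compiler jumps to a computed offset inside a long run of STOSL instructions so that only the needed number execute. Go code cannot jump into the middle of a loop, but a fallthrough switch gives the same "enter the unrolled body part way" flavor; the sketch below is an analogy only, not how the runtime implements it.

```go
package main

import "fmt"

// zeroWords clears buf Duff-style: enter the unrolled body "part way"
// to handle the leftover 1-3 elements, then zero four words per iteration.
func zeroWords(buf []int) {
	i := 0
	switch len(buf) % 4 {
	case 3:
		buf[i] = 0
		i++
		fallthrough
	case 2:
		buf[i] = 0
		i++
		fallthrough
	case 1:
		buf[i] = 0
		i++
	}
	for ; i < len(buf); i += 4 {
		buf[i], buf[i+1], buf[i+2], buf[i+3] = 0, 0, 0, 0
	}
}

func main() {
	b := []int{1, 2, 3, 4, 5, 6, 7}
	zeroWords(b)
	fmt.Println(b) // [0 0 0 0 0 0 0]
}
```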
+// AX: zero +// DI: ptr to memory to be zeroed +// DI is updated as a side effect. +TEXT runtime·duffzero(SB), NOSPLIT, $0-0 + STOSL + STOSL + STOSL + STOSL + STOSL + STOSL + STOSL + STOSL + STOSL + STOSL + STOSL + STOSL + STOSL + STOSL + STOSL + STOSL + STOSL + STOSL + STOSL + STOSL + STOSL + STOSL + STOSL + STOSL + STOSL + STOSL + STOSL + STOSL + STOSL + STOSL + STOSL + STOSL + STOSL + STOSL + STOSL + STOSL + STOSL + STOSL + STOSL + STOSL + STOSL + STOSL + STOSL + STOSL + STOSL + STOSL + STOSL + STOSL + STOSL + STOSL + STOSL + STOSL + STOSL + STOSL + STOSL + STOSL + STOSL + STOSL + STOSL + STOSL + STOSL + STOSL + STOSL + STOSL + STOSL + STOSL + STOSL + STOSL + STOSL + STOSL + STOSL + STOSL + STOSL + STOSL + STOSL + STOSL + STOSL + STOSL + STOSL + STOSL + STOSL + STOSL + STOSL + STOSL + STOSL + STOSL + STOSL + STOSL + STOSL + STOSL + STOSL + STOSL + STOSL + STOSL + STOSL + STOSL + STOSL + STOSL + STOSL + STOSL + STOSL + STOSL + STOSL + STOSL + STOSL + STOSL + STOSL + STOSL + STOSL + STOSL + STOSL + STOSL + STOSL + STOSL + STOSL + STOSL + STOSL + STOSL + STOSL + STOSL + STOSL + STOSL + STOSL + STOSL + STOSL + STOSL + STOSL + STOSL + RET + +// A Duff's device for copying memory. +// The compiler jumps to computed addresses within +// this routine to copy chunks of memory. Source +// and destination must not overlap. Do not +// change this code without also changing the code +// in ../../cmd/6g/cgen.c:sgen. +// SI: ptr to source memory +// DI: ptr to destination memory +// SI and DI are updated as a side effect. + +// NOTE: this is equivalent to a sequence of MOVSL but +// for some reason MOVSL is really slow. +TEXT runtime·duffcopy(SB), NOSPLIT, $0-0 + MOVL (SI),CX + ADDL $4,SI + MOVL CX,(DI) + ADDL $4,DI + + MOVL (SI),CX + ADDL $4,SI + MOVL CX,(DI) + ADDL $4,DI + + MOVL (SI),CX + ADDL $4,SI + MOVL CX,(DI) + ADDL $4,DI + + MOVL (SI),CX + ADDL $4,SI + MOVL CX,(DI) + ADDL $4,DI + + MOVL (SI),CX + ADDL $4,SI + MOVL CX,(DI) + ADDL $4,DI + + MOVL (SI),CX + ADDL $4,SI + MOVL CX,(DI) + ADDL $4,DI + + MOVL (SI),CX + ADDL $4,SI + MOVL CX,(DI) + ADDL $4,DI + + MOVL (SI),CX + ADDL $4,SI + MOVL CX,(DI) + ADDL $4,DI + + MOVL (SI),CX + ADDL $4,SI + MOVL CX,(DI) + ADDL $4,DI + + MOVL (SI),CX + ADDL $4,SI + MOVL CX,(DI) + ADDL $4,DI + + MOVL (SI),CX + ADDL $4,SI + MOVL CX,(DI) + ADDL $4,DI + + MOVL (SI),CX + ADDL $4,SI + MOVL CX,(DI) + ADDL $4,DI + + MOVL (SI),CX + ADDL $4,SI + MOVL CX,(DI) + ADDL $4,DI + + MOVL (SI),CX + ADDL $4,SI + MOVL CX,(DI) + ADDL $4,DI + + MOVL (SI),CX + ADDL $4,SI + MOVL CX,(DI) + ADDL $4,DI + + MOVL (SI),CX + ADDL $4,SI + MOVL CX,(DI) + ADDL $4,DI + + MOVL (SI),CX + ADDL $4,SI + MOVL CX,(DI) + ADDL $4,DI + + MOVL (SI),CX + ADDL $4,SI + MOVL CX,(DI) + ADDL $4,DI + + MOVL (SI),CX + ADDL $4,SI + MOVL CX,(DI) + ADDL $4,DI + + MOVL (SI),CX + ADDL $4,SI + MOVL CX,(DI) + ADDL $4,DI + + MOVL (SI),CX + ADDL $4,SI + MOVL CX,(DI) + ADDL $4,DI + + MOVL (SI),CX + ADDL $4,SI + MOVL CX,(DI) + ADDL $4,DI + + MOVL (SI),CX + ADDL $4,SI + MOVL CX,(DI) + ADDL $4,DI + + MOVL (SI),CX + ADDL $4,SI + MOVL CX,(DI) + ADDL $4,DI + + MOVL (SI),CX + ADDL $4,SI + MOVL CX,(DI) + ADDL $4,DI + + MOVL (SI),CX + ADDL $4,SI + MOVL CX,(DI) + ADDL $4,DI + + MOVL (SI),CX + ADDL $4,SI + MOVL CX,(DI) + ADDL $4,DI + + MOVL (SI),CX + ADDL $4,SI + MOVL CX,(DI) + ADDL $4,DI + + MOVL (SI),CX + ADDL $4,SI + MOVL CX,(DI) + ADDL $4,DI + + MOVL (SI),CX + ADDL $4,SI + MOVL CX,(DI) + ADDL $4,DI + + MOVL (SI),CX + ADDL $4,SI + MOVL CX,(DI) + ADDL $4,DI + + MOVL (SI),CX + ADDL $4,SI + MOVL CX,(DI) + ADDL $4,DI + + MOVL (SI),CX + 
ADDL $4,SI + MOVL CX,(DI) + ADDL $4,DI + + MOVL (SI),CX + ADDL $4,SI + MOVL CX,(DI) + ADDL $4,DI + + MOVL (SI),CX + ADDL $4,SI + MOVL CX,(DI) + ADDL $4,DI + + MOVL (SI),CX + ADDL $4,SI + MOVL CX,(DI) + ADDL $4,DI + + MOVL (SI),CX + ADDL $4,SI + MOVL CX,(DI) + ADDL $4,DI + + MOVL (SI),CX + ADDL $4,SI + MOVL CX,(DI) + ADDL $4,DI + + MOVL (SI),CX + ADDL $4,SI + MOVL CX,(DI) + ADDL $4,DI + + MOVL (SI),CX + ADDL $4,SI + MOVL CX,(DI) + ADDL $4,DI + + MOVL (SI),CX + ADDL $4,SI + MOVL CX,(DI) + ADDL $4,DI + + MOVL (SI),CX + ADDL $4,SI + MOVL CX,(DI) + ADDL $4,DI + + MOVL (SI),CX + ADDL $4,SI + MOVL CX,(DI) + ADDL $4,DI + + MOVL (SI),CX + ADDL $4,SI + MOVL CX,(DI) + ADDL $4,DI + + MOVL (SI),CX + ADDL $4,SI + MOVL CX,(DI) + ADDL $4,DI + + MOVL (SI),CX + ADDL $4,SI + MOVL CX,(DI) + ADDL $4,DI + + MOVL (SI),CX + ADDL $4,SI + MOVL CX,(DI) + ADDL $4,DI + + MOVL (SI),CX + ADDL $4,SI + MOVL CX,(DI) + ADDL $4,DI + + MOVL (SI),CX + ADDL $4,SI + MOVL CX,(DI) + ADDL $4,DI + + MOVL (SI),CX + ADDL $4,SI + MOVL CX,(DI) + ADDL $4,DI + + MOVL (SI),CX + ADDL $4,SI + MOVL CX,(DI) + ADDL $4,DI + + MOVL (SI),CX + ADDL $4,SI + MOVL CX,(DI) + ADDL $4,DI + + MOVL (SI),CX + ADDL $4,SI + MOVL CX,(DI) + ADDL $4,DI + + MOVL (SI),CX + ADDL $4,SI + MOVL CX,(DI) + ADDL $4,DI + + MOVL (SI),CX + ADDL $4,SI + MOVL CX,(DI) + ADDL $4,DI + + MOVL (SI),CX + ADDL $4,SI + MOVL CX,(DI) + ADDL $4,DI + + MOVL (SI),CX + ADDL $4,SI + MOVL CX,(DI) + ADDL $4,DI + + MOVL (SI),CX + ADDL $4,SI + MOVL CX,(DI) + ADDL $4,DI + + MOVL (SI),CX + ADDL $4,SI + MOVL CX,(DI) + ADDL $4,DI + + MOVL (SI),CX + ADDL $4,SI + MOVL CX,(DI) + ADDL $4,DI + + MOVL (SI),CX + ADDL $4,SI + MOVL CX,(DI) + ADDL $4,DI + + MOVL (SI),CX + ADDL $4,SI + MOVL CX,(DI) + ADDL $4,DI + + MOVL (SI),CX + ADDL $4,SI + MOVL CX,(DI) + ADDL $4,DI + + MOVL (SI),CX + ADDL $4,SI + MOVL CX,(DI) + ADDL $4,DI + + MOVL (SI),CX + ADDL $4,SI + MOVL CX,(DI) + ADDL $4,DI + + MOVL (SI),CX + ADDL $4,SI + MOVL CX,(DI) + ADDL $4,DI + + MOVL (SI),CX + ADDL $4,SI + MOVL CX,(DI) + ADDL $4,DI + + MOVL (SI),CX + ADDL $4,SI + MOVL CX,(DI) + ADDL $4,DI + + MOVL (SI),CX + ADDL $4,SI + MOVL CX,(DI) + ADDL $4,DI + + MOVL (SI),CX + ADDL $4,SI + MOVL CX,(DI) + ADDL $4,DI + + MOVL (SI),CX + ADDL $4,SI + MOVL CX,(DI) + ADDL $4,DI + + MOVL (SI),CX + ADDL $4,SI + MOVL CX,(DI) + ADDL $4,DI + + MOVL (SI),CX + ADDL $4,SI + MOVL CX,(DI) + ADDL $4,DI + + MOVL (SI),CX + ADDL $4,SI + MOVL CX,(DI) + ADDL $4,DI + + MOVL (SI),CX + ADDL $4,SI + MOVL CX,(DI) + ADDL $4,DI + + MOVL (SI),CX + ADDL $4,SI + MOVL CX,(DI) + ADDL $4,DI + + MOVL (SI),CX + ADDL $4,SI + MOVL CX,(DI) + ADDL $4,DI + + MOVL (SI),CX + ADDL $4,SI + MOVL CX,(DI) + ADDL $4,DI + + MOVL (SI),CX + ADDL $4,SI + MOVL CX,(DI) + ADDL $4,DI + + MOVL (SI),CX + ADDL $4,SI + MOVL CX,(DI) + ADDL $4,DI + + MOVL (SI),CX + ADDL $4,SI + MOVL CX,(DI) + ADDL $4,DI + + MOVL (SI),CX + ADDL $4,SI + MOVL CX,(DI) + ADDL $4,DI + + MOVL (SI),CX + ADDL $4,SI + MOVL CX,(DI) + ADDL $4,DI + + MOVL (SI),CX + ADDL $4,SI + MOVL CX,(DI) + ADDL $4,DI + + MOVL (SI),CX + ADDL $4,SI + MOVL CX,(DI) + ADDL $4,DI + + MOVL (SI),CX + ADDL $4,SI + MOVL CX,(DI) + ADDL $4,DI + + MOVL (SI),CX + ADDL $4,SI + MOVL CX,(DI) + ADDL $4,DI + + MOVL (SI),CX + ADDL $4,SI + MOVL CX,(DI) + ADDL $4,DI + + MOVL (SI),CX + ADDL $4,SI + MOVL CX,(DI) + ADDL $4,DI + + MOVL (SI),CX + ADDL $4,SI + MOVL CX,(DI) + ADDL $4,DI + + MOVL (SI),CX + ADDL $4,SI + MOVL CX,(DI) + ADDL $4,DI + + MOVL (SI),CX + ADDL $4,SI + MOVL CX,(DI) + ADDL $4,DI + + MOVL (SI),CX + ADDL $4,SI + MOVL CX,(DI) + ADDL $4,DI + + MOVL (SI),CX + ADDL $4,SI + 
MOVL CX,(DI) + ADDL $4,DI + + MOVL (SI),CX + ADDL $4,SI + MOVL CX,(DI) + ADDL $4,DI + + MOVL (SI),CX + ADDL $4,SI + MOVL CX,(DI) + ADDL $4,DI + + MOVL (SI),CX + ADDL $4,SI + MOVL CX,(DI) + ADDL $4,DI + + MOVL (SI),CX + ADDL $4,SI + MOVL CX,(DI) + ADDL $4,DI + + MOVL (SI),CX + ADDL $4,SI + MOVL CX,(DI) + ADDL $4,DI + + MOVL (SI),CX + ADDL $4,SI + MOVL CX,(DI) + ADDL $4,DI + + MOVL (SI),CX + ADDL $4,SI + MOVL CX,(DI) + ADDL $4,DI + + MOVL (SI),CX + ADDL $4,SI + MOVL CX,(DI) + ADDL $4,DI + + MOVL (SI),CX + ADDL $4,SI + MOVL CX,(DI) + ADDL $4,DI + + MOVL (SI),CX + ADDL $4,SI + MOVL CX,(DI) + ADDL $4,DI + + MOVL (SI),CX + ADDL $4,SI + MOVL CX,(DI) + ADDL $4,DI + + MOVL (SI),CX + ADDL $4,SI + MOVL CX,(DI) + ADDL $4,DI + + MOVL (SI),CX + ADDL $4,SI + MOVL CX,(DI) + ADDL $4,DI + + MOVL (SI),CX + ADDL $4,SI + MOVL CX,(DI) + ADDL $4,DI + + MOVL (SI),CX + ADDL $4,SI + MOVL CX,(DI) + ADDL $4,DI + + MOVL (SI),CX + ADDL $4,SI + MOVL CX,(DI) + ADDL $4,DI + + MOVL (SI),CX + ADDL $4,SI + MOVL CX,(DI) + ADDL $4,DI + + MOVL (SI),CX + ADDL $4,SI + MOVL CX,(DI) + ADDL $4,DI + + MOVL (SI),CX + ADDL $4,SI + MOVL CX,(DI) + ADDL $4,DI + + MOVL (SI),CX + ADDL $4,SI + MOVL CX,(DI) + ADDL $4,DI + + MOVL (SI),CX + ADDL $4,SI + MOVL CX,(DI) + ADDL $4,DI + + MOVL (SI),CX + ADDL $4,SI + MOVL CX,(DI) + ADDL $4,DI + + MOVL (SI),CX + ADDL $4,SI + MOVL CX,(DI) + ADDL $4,DI + + MOVL (SI),CX + ADDL $4,SI + MOVL CX,(DI) + ADDL $4,DI + + MOVL (SI),CX + ADDL $4,SI + MOVL CX,(DI) + ADDL $4,DI + + MOVL (SI),CX + ADDL $4,SI + MOVL CX,(DI) + ADDL $4,DI + + MOVL (SI),CX + ADDL $4,SI + MOVL CX,(DI) + ADDL $4,DI + + MOVL (SI),CX + ADDL $4,SI + MOVL CX,(DI) + ADDL $4,DI + + MOVL (SI),CX + ADDL $4,SI + MOVL CX,(DI) + ADDL $4,DI + + MOVL (SI),CX + ADDL $4,SI + MOVL CX,(DI) + ADDL $4,DI + + MOVL (SI),CX + ADDL $4,SI + MOVL CX,(DI) + ADDL $4,DI + + MOVL (SI),CX + ADDL $4,SI + MOVL CX,(DI) + ADDL $4,DI + + MOVL (SI),CX + ADDL $4,SI + MOVL CX,(DI) + ADDL $4,DI + + MOVL (SI),CX + ADDL $4,SI + MOVL CX,(DI) + ADDL $4,DI + + RET + +TEXT runtime·fastrand1(SB), NOSPLIT, $0-4 + get_tls(CX) + MOVL g(CX), AX + MOVL g_m(AX), AX + MOVL m_fastrand(AX), DX + ADDL DX, DX + MOVL DX, BX + XORL $0x88888eef, DX + CMOVLMI BX, DX + MOVL DX, m_fastrand(AX) + MOVL DX, ret+0(FP) + RET + +TEXT runtime·return0(SB), NOSPLIT, $0 + MOVL $0, AX + RET + +// Called from cgo wrappers, this function returns g->m->curg.stack.hi. +// Must obey the gcc calling convention. +TEXT _cgo_topofstack(SB),NOSPLIT,$0 + get_tls(CX) + MOVL g(CX), AX + MOVL g_m(AX), AX + MOVL m_curg(AX), AX + MOVL (g_stack+stack_hi)(AX), AX + RET + +// The top-most function running on a goroutine +// returns to goexit+PCQuantum. +TEXT runtime·goexit(SB),NOSPLIT,$0-0 + BYTE $0x90 // NOP + CALL runtime·goexit1(SB) // does not return diff --git a/src/runtime/asm_amd64.s b/src/runtime/asm_amd64.s new file mode 100644 index 000000000..39d7c78f2 --- /dev/null +++ b/src/runtime/asm_amd64.s @@ -0,0 +1,2237 @@ +// Copyright 2009 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#include "zasm_GOOS_GOARCH.h" +#include "funcdata.h" +#include "textflag.h" + +TEXT runtime·rt0_go(SB),NOSPLIT,$0 + // copy arguments forward on an even stack + MOVQ DI, AX // argc + MOVQ SI, BX // argv + SUBQ $(4*8+7), SP // 2args 2auto + ANDQ $~15, SP + MOVQ AX, 16(SP) + MOVQ BX, 24(SP) + + // create istack out of the given (operating system) stack. + // _cgo_init may update stackguard. 
+ MOVQ $runtime·g0(SB), DI + LEAQ (-64*1024+104)(SP), BX + MOVQ BX, g_stackguard0(DI) + MOVQ BX, g_stackguard1(DI) + MOVQ BX, (g_stack+stack_lo)(DI) + MOVQ SP, (g_stack+stack_hi)(DI) + + // find out information about the processor we're on + MOVQ $0, AX + CPUID + CMPQ AX, $0 + JE nocpuinfo + MOVQ $1, AX + CPUID + MOVL CX, runtime·cpuid_ecx(SB) + MOVL DX, runtime·cpuid_edx(SB) +nocpuinfo: + + // if there is an _cgo_init, call it. + MOVQ _cgo_init(SB), AX + TESTQ AX, AX + JZ needtls + // g0 already in DI + MOVQ DI, CX // Win64 uses CX for first parameter + MOVQ $setg_gcc<>(SB), SI + CALL AX + + // update stackguard after _cgo_init + MOVQ $runtime·g0(SB), CX + MOVQ (g_stack+stack_lo)(CX), AX + ADDQ $const_StackGuard, AX + MOVQ AX, g_stackguard0(CX) + MOVQ AX, g_stackguard1(CX) + + CMPL runtime·iswindows(SB), $0 + JEQ ok +needtls: + // skip TLS setup on Plan 9 + CMPL runtime·isplan9(SB), $1 + JEQ ok + // skip TLS setup on Solaris + CMPL runtime·issolaris(SB), $1 + JEQ ok + + LEAQ runtime·tls0(SB), DI + CALL runtime·settls(SB) + + // store through it, to make sure it works + get_tls(BX) + MOVQ $0x123, g(BX) + MOVQ runtime·tls0(SB), AX + CMPQ AX, $0x123 + JEQ 2(PC) + MOVL AX, 0 // abort +ok: + // set the per-goroutine and per-mach "registers" + get_tls(BX) + LEAQ runtime·g0(SB), CX + MOVQ CX, g(BX) + LEAQ runtime·m0(SB), AX + + // save m->g0 = g0 + MOVQ CX, m_g0(AX) + // save m0 to g0->m + MOVQ AX, g_m(CX) + + CLD // convention is D is always left cleared + CALL runtime·check(SB) + + MOVL 16(SP), AX // copy argc + MOVL AX, 0(SP) + MOVQ 24(SP), AX // copy argv + MOVQ AX, 8(SP) + CALL runtime·args(SB) + CALL runtime·osinit(SB) + CALL runtime·schedinit(SB) + + // create a new goroutine to start program + MOVQ $runtime·main·f(SB), BP // entry + PUSHQ BP + PUSHQ $0 // arg size + CALL runtime·newproc(SB) + POPQ AX + POPQ AX + + // start this M + CALL runtime·mstart(SB) + + MOVL $0xf1, 0xf1 // crash + RET + +DATA runtime·main·f+0(SB)/8,$runtime·main(SB) +GLOBL runtime·main·f(SB),RODATA,$8 + +TEXT runtime·breakpoint(SB),NOSPLIT,$0-0 + BYTE $0xcc + RET + +TEXT runtime·asminit(SB),NOSPLIT,$0-0 + // No per-thread init. + RET + +/* + * go-routine + */ + +// void gosave(Gobuf*) +// save state in Gobuf; setjmp +TEXT runtime·gosave(SB), NOSPLIT, $0-8 + MOVQ buf+0(FP), AX // gobuf + LEAQ buf+0(FP), BX // caller's SP + MOVQ BX, gobuf_sp(AX) + MOVQ 0(SP), BX // caller's PC + MOVQ BX, gobuf_pc(AX) + MOVQ $0, gobuf_ret(AX) + MOVQ $0, gobuf_ctxt(AX) + get_tls(CX) + MOVQ g(CX), BX + MOVQ BX, gobuf_g(AX) + RET + +// void gogo(Gobuf*) +// restore state from Gobuf; longjmp +TEXT runtime·gogo(SB), NOSPLIT, $0-8 + MOVQ buf+0(FP), BX // gobuf + MOVQ gobuf_g(BX), DX + MOVQ 0(DX), CX // make sure g != nil + get_tls(CX) + MOVQ DX, g(CX) + MOVQ gobuf_sp(BX), SP // restore SP + MOVQ gobuf_ret(BX), AX + MOVQ gobuf_ctxt(BX), DX + MOVQ $0, gobuf_sp(BX) // clear to help garbage collector + MOVQ $0, gobuf_ret(BX) + MOVQ $0, gobuf_ctxt(BX) + MOVQ gobuf_pc(BX), BX + JMP BX + +// func mcall(fn func(*g)) +// Switch to m->g0's stack, call fn(g). +// Fn must never return. It should gogo(&g->sched) +// to keep running g. 
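gosave and gogo above behave like setjmp and longjmp over a Gobuf: gosave records the caller's SP and PC plus the current g and clears the ret/ctxt slots, while gogo restores them and jumps, which is why it never returns. A simplified Go-level sketch of the fields they touch (gobufSketch and gosaveSketch are illustrative stand-ins, not the runtime's real gobuf type or functions):

package main

import (
	"fmt"
	"runtime"
)

// gobufSketch mirrors the fields gosave/gogo read and write above
// (gobuf_sp, gobuf_pc, gobuf_g, gobuf_ret, gobuf_ctxt).
type gobufSketch struct {
	sp, pc, ret uintptr
	g, ctxt     interface{}
}

// gosaveSketch records "where we are", the way gosave stores the caller's
// PC and SP into the Gobuf and zeroes ret/ctxt; only the PC is observable
// from safe Go, via runtime.Caller.
func gosaveSketch(buf *gobufSketch) {
	pc, _, _, _ := runtime.Caller(1)
	buf.pc = pc
	buf.ret = 0
	buf.ctxt = nil
}

func main() {
	var buf gobufSketch
	gosaveSketch(&buf)
	fmt.Printf("saved pc: %#x\n", buf.pc)
	// gogo would restore SP/PC from the buffer and jump there, so the
	// real routine never returns to its caller.
}
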
+TEXT runtime·mcall(SB), NOSPLIT, $0-8 + MOVQ fn+0(FP), DI + + get_tls(CX) + MOVQ g(CX), AX // save state in g->sched + MOVQ 0(SP), BX // caller's PC + MOVQ BX, (g_sched+gobuf_pc)(AX) + LEAQ fn+0(FP), BX // caller's SP + MOVQ BX, (g_sched+gobuf_sp)(AX) + MOVQ AX, (g_sched+gobuf_g)(AX) + + // switch to m->g0 & its stack, call fn + MOVQ g(CX), BX + MOVQ g_m(BX), BX + MOVQ m_g0(BX), SI + CMPQ SI, AX // if g == m->g0 call badmcall + JNE 3(PC) + MOVQ $runtime·badmcall(SB), AX + JMP AX + MOVQ SI, g(CX) // g = m->g0 + MOVQ (g_sched+gobuf_sp)(SI), SP // sp = m->g0->sched.sp + PUSHQ AX + MOVQ DI, DX + MOVQ 0(DI), DI + CALL DI + POPQ AX + MOVQ $runtime·badmcall2(SB), AX + JMP AX + RET + +// switchtoM is a dummy routine that onM leaves at the bottom +// of the G stack. We need to distinguish the routine that +// lives at the bottom of the G stack from the one that lives +// at the top of the M stack because the one at the top of +// the M stack terminates the stack walk (see topofstack()). +TEXT runtime·switchtoM(SB), NOSPLIT, $0-0 + RET + +// func onM_signalok(fn func()) +TEXT runtime·onM_signalok(SB), NOSPLIT, $0-8 + get_tls(CX) + MOVQ g(CX), AX // AX = g + MOVQ g_m(AX), BX // BX = m + MOVQ m_gsignal(BX), DX // DX = gsignal + CMPQ AX, DX + JEQ ongsignal + JMP runtime·onM(SB) + +ongsignal: + MOVQ fn+0(FP), DI // DI = fn + MOVQ DI, DX + MOVQ 0(DI), DI + CALL DI + RET + +// func onM(fn func()) +TEXT runtime·onM(SB), NOSPLIT, $0-8 + MOVQ fn+0(FP), DI // DI = fn + get_tls(CX) + MOVQ g(CX), AX // AX = g + MOVQ g_m(AX), BX // BX = m + + MOVQ m_g0(BX), DX // DX = g0 + CMPQ AX, DX + JEQ onm + + MOVQ m_curg(BX), BP + CMPQ AX, BP + JEQ oncurg + + // Not g0, not curg. Must be gsignal, but that's not allowed. + // Hide call from linker nosplit analysis. + MOVQ $runtime·badonm(SB), AX + CALL AX + +oncurg: + // save our state in g->sched. Pretend to + // be switchtoM if the G stack is scanned. + MOVQ $runtime·switchtoM(SB), BP + MOVQ BP, (g_sched+gobuf_pc)(AX) + MOVQ SP, (g_sched+gobuf_sp)(AX) + MOVQ AX, (g_sched+gobuf_g)(AX) + + // switch to g0 + MOVQ DX, g(CX) + MOVQ (g_sched+gobuf_sp)(DX), BX + // make it look like mstart called onM on g0, to stop traceback + SUBQ $8, BX + MOVQ $runtime·mstart(SB), DX + MOVQ DX, 0(BX) + MOVQ BX, SP + + // call target function + MOVQ DI, DX + MOVQ 0(DI), DI + CALL DI + + // switch back to g + get_tls(CX) + MOVQ g(CX), AX + MOVQ g_m(AX), BX + MOVQ m_curg(BX), AX + MOVQ AX, g(CX) + MOVQ (g_sched+gobuf_sp)(AX), SP + MOVQ $0, (g_sched+gobuf_sp)(AX) + RET + +onm: + // already on m stack, just call directly + MOVQ DI, DX + MOVQ 0(DI), DI + CALL DI + RET + +/* + * support for morestack + */ + +// Called during function prolog when more stack is needed. +// +// The traceback routines see morestack on a g0 as being +// the top of a stack (for example, morestack calling newstack +// calling the scheduler calling newm calling gc), so we must +// record an argument size. For that purpose, it has no arguments. +TEXT runtime·morestack(SB),NOSPLIT,$0-0 + // Cannot grow scheduler stack (m->g0). + get_tls(CX) + MOVQ g(CX), BX + MOVQ g_m(BX), BX + MOVQ m_g0(BX), SI + CMPQ g(CX), SI + JNE 2(PC) + INT $3 + + // Cannot grow signal stack (m->gsignal). + MOVQ m_gsignal(BX), SI + CMPQ g(CX), SI + JNE 2(PC) + INT $3 + + // Called from f. + // Set m->morebuf to f's caller. 
+ MOVQ 8(SP), AX // f's caller's PC + MOVQ AX, (m_morebuf+gobuf_pc)(BX) + LEAQ 16(SP), AX // f's caller's SP + MOVQ AX, (m_morebuf+gobuf_sp)(BX) + get_tls(CX) + MOVQ g(CX), SI + MOVQ SI, (m_morebuf+gobuf_g)(BX) + + // Set g->sched to context in f. + MOVQ 0(SP), AX // f's PC + MOVQ AX, (g_sched+gobuf_pc)(SI) + MOVQ SI, (g_sched+gobuf_g)(SI) + LEAQ 8(SP), AX // f's SP + MOVQ AX, (g_sched+gobuf_sp)(SI) + MOVQ DX, (g_sched+gobuf_ctxt)(SI) + + // Call newstack on m->g0's stack. + MOVQ m_g0(BX), BP + MOVQ BP, g(CX) + MOVQ (g_sched+gobuf_sp)(BP), SP + CALL runtime·newstack(SB) + MOVQ $0, 0x1003 // crash if newstack returns + RET + +// morestack but not preserving ctxt. +TEXT runtime·morestack_noctxt(SB),NOSPLIT,$0 + MOVL $0, DX + JMP runtime·morestack(SB) + +// reflectcall: call a function with the given argument list +// func call(f *FuncVal, arg *byte, argsize, retoffset uint32). +// we don't have variable-sized frames, so we use a small number +// of constant-sized-frame functions to encode a few bits of size in the pc. +// Caution: ugly multiline assembly macros in your future! + +#define DISPATCH(NAME,MAXSIZE) \ + CMPQ CX, $MAXSIZE; \ + JA 3(PC); \ + MOVQ $NAME(SB), AX; \ + JMP AX +// Note: can't just "JMP NAME(SB)" - bad inlining results. + +TEXT ·reflectcall(SB), NOSPLIT, $0-24 + MOVLQZX argsize+16(FP), CX + DISPATCH(runtime·call16, 16) + DISPATCH(runtime·call32, 32) + DISPATCH(runtime·call64, 64) + DISPATCH(runtime·call128, 128) + DISPATCH(runtime·call256, 256) + DISPATCH(runtime·call512, 512) + DISPATCH(runtime·call1024, 1024) + DISPATCH(runtime·call2048, 2048) + DISPATCH(runtime·call4096, 4096) + DISPATCH(runtime·call8192, 8192) + DISPATCH(runtime·call16384, 16384) + DISPATCH(runtime·call32768, 32768) + DISPATCH(runtime·call65536, 65536) + DISPATCH(runtime·call131072, 131072) + DISPATCH(runtime·call262144, 262144) + DISPATCH(runtime·call524288, 524288) + DISPATCH(runtime·call1048576, 1048576) + DISPATCH(runtime·call2097152, 2097152) + DISPATCH(runtime·call4194304, 4194304) + DISPATCH(runtime·call8388608, 8388608) + DISPATCH(runtime·call16777216, 16777216) + DISPATCH(runtime·call33554432, 33554432) + DISPATCH(runtime·call67108864, 67108864) + DISPATCH(runtime·call134217728, 134217728) + DISPATCH(runtime·call268435456, 268435456) + DISPATCH(runtime·call536870912, 536870912) + DISPATCH(runtime·call1073741824, 1073741824) + MOVQ $runtime·badreflectcall(SB), AX + JMP AX + +#define CALLFN(NAME,MAXSIZE) \ +TEXT NAME(SB), WRAPPER, $MAXSIZE-24; \ + NO_LOCAL_POINTERS; \ + /* copy arguments to stack */ \ + MOVQ argptr+8(FP), SI; \ + MOVLQZX argsize+16(FP), CX; \ + MOVQ SP, DI; \ + REP;MOVSB; \ + /* call function */ \ + MOVQ f+0(FP), DX; \ + PCDATA $PCDATA_StackMapIndex, $0; \ + CALL (DX); \ + /* copy return values back */ \ + MOVQ argptr+8(FP), DI; \ + MOVLQZX argsize+16(FP), CX; \ + MOVLQZX retoffset+20(FP), BX; \ + MOVQ SP, SI; \ + ADDQ BX, DI; \ + ADDQ BX, SI; \ + SUBQ BX, CX; \ + REP;MOVSB; \ + RET + +CALLFN(·call16, 16) +CALLFN(·call32, 32) +CALLFN(·call64, 64) +CALLFN(·call128, 128) +CALLFN(·call256, 256) +CALLFN(·call512, 512) +CALLFN(·call1024, 1024) +CALLFN(·call2048, 2048) +CALLFN(·call4096, 4096) +CALLFN(·call8192, 8192) +CALLFN(·call16384, 16384) +CALLFN(·call32768, 32768) +CALLFN(·call65536, 65536) +CALLFN(·call131072, 131072) +CALLFN(·call262144, 262144) +CALLFN(·call524288, 524288) +CALLFN(·call1048576, 1048576) +CALLFN(·call2097152, 2097152) +CALLFN(·call4194304, 4194304) +CALLFN(·call8388608, 8388608) +CALLFN(·call16777216, 16777216) +CALLFN(·call33554432, 33554432) 
+CALLFN(·call67108864, 67108864) +CALLFN(·call134217728, 134217728) +CALLFN(·call268435456, 268435456) +CALLFN(·call536870912, 536870912) +CALLFN(·call1073741824, 1073741824) + +// bool cas(int32 *val, int32 old, int32 new) +// Atomically: +// if(*val == old){ +// *val = new; +// return 1; +// } else +// return 0; +TEXT runtime·cas(SB), NOSPLIT, $0-17 + MOVQ ptr+0(FP), BX + MOVL old+8(FP), AX + MOVL new+12(FP), CX + LOCK + CMPXCHGL CX, 0(BX) + JZ 4(PC) + MOVL $0, AX + MOVB AX, ret+16(FP) + RET + MOVL $1, AX + MOVB AX, ret+16(FP) + RET + +// bool runtime·cas64(uint64 *val, uint64 old, uint64 new) +// Atomically: +// if(*val == *old){ +// *val = new; +// return 1; +// } else { +// return 0; +// } +TEXT runtime·cas64(SB), NOSPLIT, $0-25 + MOVQ ptr+0(FP), BX + MOVQ old+8(FP), AX + MOVQ new+16(FP), CX + LOCK + CMPXCHGQ CX, 0(BX) + JNZ cas64_fail + MOVL $1, AX + MOVB AX, ret+24(FP) + RET +cas64_fail: + MOVL $0, AX + MOVB AX, ret+24(FP) + RET + +TEXT runtime·casuintptr(SB), NOSPLIT, $0-25 + JMP runtime·cas64(SB) + +TEXT runtime·atomicloaduintptr(SB), NOSPLIT, $0-16 + JMP runtime·atomicload64(SB) + +TEXT runtime·atomicloaduint(SB), NOSPLIT, $0-16 + JMP runtime·atomicload64(SB) + +TEXT runtime·atomicstoreuintptr(SB), NOSPLIT, $0-16 + JMP runtime·atomicstore64(SB) + +// bool casp(void **val, void *old, void *new) +// Atomically: +// if(*val == old){ +// *val = new; +// return 1; +// } else +// return 0; +TEXT runtime·casp(SB), NOSPLIT, $0-25 + MOVQ ptr+0(FP), BX + MOVQ old+8(FP), AX + MOVQ new+16(FP), CX + LOCK + CMPXCHGQ CX, 0(BX) + JZ 4(PC) + MOVL $0, AX + MOVB AX, ret+24(FP) + RET + MOVL $1, AX + MOVB AX, ret+24(FP) + RET + +// uint32 xadd(uint32 volatile *val, int32 delta) +// Atomically: +// *val += delta; +// return *val; +TEXT runtime·xadd(SB), NOSPLIT, $0-20 + MOVQ ptr+0(FP), BX + MOVL delta+8(FP), AX + MOVL AX, CX + LOCK + XADDL AX, 0(BX) + ADDL CX, AX + MOVL AX, ret+16(FP) + RET + +TEXT runtime·xadd64(SB), NOSPLIT, $0-24 + MOVQ ptr+0(FP), BX + MOVQ delta+8(FP), AX + MOVQ AX, CX + LOCK + XADDQ AX, 0(BX) + ADDQ CX, AX + MOVQ AX, ret+16(FP) + RET + +TEXT runtime·xchg(SB), NOSPLIT, $0-20 + MOVQ ptr+0(FP), BX + MOVL new+8(FP), AX + XCHGL AX, 0(BX) + MOVL AX, ret+16(FP) + RET + +TEXT runtime·xchg64(SB), NOSPLIT, $0-24 + MOVQ ptr+0(FP), BX + MOVQ new+8(FP), AX + XCHGQ AX, 0(BX) + MOVQ AX, ret+16(FP) + RET + +TEXT runtime·xchgp(SB), NOSPLIT, $0-24 + MOVQ ptr+0(FP), BX + MOVQ new+8(FP), AX + XCHGQ AX, 0(BX) + MOVQ AX, ret+16(FP) + RET + +TEXT runtime·xchguintptr(SB), NOSPLIT, $0-24 + JMP runtime·xchg64(SB) + +TEXT runtime·procyield(SB),NOSPLIT,$0-0 + MOVL cycles+0(FP), AX +again: + PAUSE + SUBL $1, AX + JNZ again + RET + +TEXT runtime·atomicstorep(SB), NOSPLIT, $0-16 + MOVQ ptr+0(FP), BX + MOVQ val+8(FP), AX + XCHGQ AX, 0(BX) + RET + +TEXT runtime·atomicstore(SB), NOSPLIT, $0-12 + MOVQ ptr+0(FP), BX + MOVL val+8(FP), AX + XCHGL AX, 0(BX) + RET + +TEXT runtime·atomicstore64(SB), NOSPLIT, $0-16 + MOVQ ptr+0(FP), BX + MOVQ val+8(FP), AX + XCHGQ AX, 0(BX) + RET + +// void runtime·atomicor8(byte volatile*, byte); +TEXT runtime·atomicor8(SB), NOSPLIT, $0-9 + MOVQ ptr+0(FP), AX + MOVB val+8(FP), BX + LOCK + ORB BX, (AX) + RET + +// void jmpdefer(fn, sp); +// called from deferreturn. +// 1. pop the caller +// 2. sub 5 bytes from the callers return +// 3. 
jmp to the argument +TEXT runtime·jmpdefer(SB), NOSPLIT, $0-16 + MOVQ fv+0(FP), DX // fn + MOVQ argp+8(FP), BX // caller sp + LEAQ -8(BX), SP // caller sp after CALL + SUBQ $5, (SP) // return to CALL again + MOVQ 0(DX), BX + JMP BX // but first run the deferred function + +// Save state of caller into g->sched. Smashes R8, R9. +TEXT gosave<>(SB),NOSPLIT,$0 + get_tls(R8) + MOVQ g(R8), R8 + MOVQ 0(SP), R9 + MOVQ R9, (g_sched+gobuf_pc)(R8) + LEAQ 8(SP), R9 + MOVQ R9, (g_sched+gobuf_sp)(R8) + MOVQ $0, (g_sched+gobuf_ret)(R8) + MOVQ $0, (g_sched+gobuf_ctxt)(R8) + RET + +// asmcgocall(void(*fn)(void*), void *arg) +// Call fn(arg) on the scheduler stack, +// aligned appropriately for the gcc ABI. +// See cgocall.c for more details. +TEXT ·asmcgocall(SB),NOSPLIT,$0-16 + MOVQ fn+0(FP), AX + MOVQ arg+8(FP), BX + CALL asmcgocall<>(SB) + RET + +TEXT ·asmcgocall_errno(SB),NOSPLIT,$0-20 + MOVQ fn+0(FP), AX + MOVQ arg+8(FP), BX + CALL asmcgocall<>(SB) + MOVL AX, ret+16(FP) + RET + +// asmcgocall common code. fn in AX, arg in BX. returns errno in AX. +TEXT asmcgocall<>(SB),NOSPLIT,$0-0 + MOVQ SP, DX + + // Figure out if we need to switch to m->g0 stack. + // We get called to create new OS threads too, and those + // come in on the m->g0 stack already. + get_tls(CX) + MOVQ g(CX), BP + MOVQ g_m(BP), BP + MOVQ m_g0(BP), SI + MOVQ g(CX), DI + CMPQ SI, DI + JEQ nosave + MOVQ m_gsignal(BP), SI + CMPQ SI, DI + JEQ nosave + + MOVQ m_g0(BP), SI + CALL gosave<>(SB) + MOVQ SI, g(CX) + MOVQ (g_sched+gobuf_sp)(SI), SP +nosave: + + // Now on a scheduling stack (a pthread-created stack). + // Make sure we have enough room for 4 stack-backed fast-call + // registers as per windows amd64 calling convention. + SUBQ $64, SP + ANDQ $~15, SP // alignment for gcc ABI + MOVQ DI, 48(SP) // save g + MOVQ (g_stack+stack_hi)(DI), DI + SUBQ DX, DI + MOVQ DI, 40(SP) // save depth in stack (can't just save SP, as stack might be copied during a callback) + MOVQ BX, DI // DI = first argument in AMD64 ABI + MOVQ BX, CX // CX = first argument in Win64 + CALL AX + + // Restore registers, g, stack pointer. + get_tls(CX) + MOVQ 48(SP), DI + MOVQ (g_stack+stack_hi)(DI), SI + SUBQ 40(SP), SI + MOVQ DI, g(CX) + MOVQ SI, SP + RET + +// cgocallback(void (*fn)(void*), void *frame, uintptr framesize) +// Turn the fn into a Go func (by taking its address) and call +// cgocallback_gofunc. +TEXT runtime·cgocallback(SB),NOSPLIT,$24-24 + LEAQ fn+0(FP), AX + MOVQ AX, 0(SP) + MOVQ frame+8(FP), AX + MOVQ AX, 8(SP) + MOVQ framesize+16(FP), AX + MOVQ AX, 16(SP) + MOVQ $runtime·cgocallback_gofunc(SB), AX + CALL AX + RET + +// cgocallback_gofunc(FuncVal*, void *frame, uintptr framesize) +// See cgocall.c for more details. +TEXT ·cgocallback_gofunc(SB),NOSPLIT,$8-24 + NO_LOCAL_POINTERS + + // If g is nil, Go did not create the current thread. + // Call needm to obtain one m for temporary use. + // In this case, we're running on the thread stack, so there's + // lots of space, but the linker doesn't know. Hide the call from + // the linker analysis by using an indirect call through AX. 
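The cas, cas64, casp, xadd and xchg routines earlier in this file are the assembly implementations behind the runtime's atomics, with their contracts spelled out in the C-style comments next to them. A rough Go-level sketch of those contracts using sync/atomic (illustrative only; the runtime itself cannot be written this way, since sync/atomic ultimately bottoms out in these routines):

package main

import (
	"fmt"
	"sync/atomic"
)

// casSketch matches the documented contract of runtime·cas:
// if *val == old { *val = new; return true } else { return false }.
func casSketch(val *int32, old, new int32) bool {
	return atomic.CompareAndSwapInt32(val, old, new)
}

// xaddSketch matches runtime·xadd: *val += delta; return *val.
func xaddSketch(val *uint32, delta int32) uint32 {
	return atomic.AddUint32(val, uint32(delta))
}

func main() {
	var v int32 = 1
	fmt.Println(casSketch(&v, 1, 2), v) // true 2
	fmt.Println(casSketch(&v, 1, 3), v) // false 2

	var u uint32 = 10
	fmt.Println(xaddSketch(&u, 5))  // 15
	fmt.Println(xaddSketch(&u, -1)) // 14 (negative deltas rely on unsigned wraparound)
}
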
+ get_tls(CX) +#ifdef GOOS_windows + MOVL $0, BP + CMPQ CX, $0 + JEQ 2(PC) +#endif + MOVQ g(CX), BP + CMPQ BP, $0 + JEQ needm + MOVQ g_m(BP), BP + MOVQ BP, R8 // holds oldm until end of function + JMP havem +needm: + MOVQ $0, 0(SP) + MOVQ $runtime·needm(SB), AX + CALL AX + MOVQ 0(SP), R8 + get_tls(CX) + MOVQ g(CX), BP + MOVQ g_m(BP), BP + + // Set m->sched.sp = SP, so that if a panic happens + // during the function we are about to execute, it will + // have a valid SP to run on the g0 stack. + // The next few lines (after the havem label) + // will save this SP onto the stack and then write + // the same SP back to m->sched.sp. That seems redundant, + // but if an unrecovered panic happens, unwindm will + // restore the g->sched.sp from the stack location + // and then onM will try to use it. If we don't set it here, + // that restored SP will be uninitialized (typically 0) and + // will not be usable. + MOVQ m_g0(BP), SI + MOVQ SP, (g_sched+gobuf_sp)(SI) + +havem: + // Now there's a valid m, and we're running on its m->g0. + // Save current m->g0->sched.sp on stack and then set it to SP. + // Save current sp in m->g0->sched.sp in preparation for + // switch back to m->curg stack. + // NOTE: unwindm knows that the saved g->sched.sp is at 0(SP). + MOVQ m_g0(BP), SI + MOVQ (g_sched+gobuf_sp)(SI), AX + MOVQ AX, 0(SP) + MOVQ SP, (g_sched+gobuf_sp)(SI) + + // Switch to m->curg stack and call runtime.cgocallbackg. + // Because we are taking over the execution of m->curg + // but *not* resuming what had been running, we need to + // save that information (m->curg->sched) so we can restore it. + // We can restore m->curg->sched.sp easily, because calling + // runtime.cgocallbackg leaves SP unchanged upon return. + // To save m->curg->sched.pc, we push it onto the stack. + // This has the added benefit that it looks to the traceback + // routine like cgocallbackg is going to return to that + // PC (because the frame we allocate below has the same + // size as cgocallback_gofunc's frame declared above) + // so that the traceback will seamlessly trace back into + // the earlier calls. + // + // In the new goroutine, 0(SP) holds the saved R8. + MOVQ m_curg(BP), SI + MOVQ SI, g(CX) + MOVQ (g_sched+gobuf_sp)(SI), DI // prepare stack as DI + MOVQ (g_sched+gobuf_pc)(SI), BP + MOVQ BP, -8(DI) + LEAQ -(8+8)(DI), SP + MOVQ R8, 0(SP) + CALL runtime·cgocallbackg(SB) + MOVQ 0(SP), R8 + + // Restore g->sched (== m->curg->sched) from saved values. + get_tls(CX) + MOVQ g(CX), SI + MOVQ 8(SP), BP + MOVQ BP, (g_sched+gobuf_pc)(SI) + LEAQ (8+8)(SP), DI + MOVQ DI, (g_sched+gobuf_sp)(SI) + + // Switch back to m->g0's stack and restore m->g0->sched.sp. + // (Unlike m->curg, the g0 goroutine never uses sched.pc, + // so we do not have to restore it.) + MOVQ g(CX), BP + MOVQ g_m(BP), BP + MOVQ m_g0(BP), SI + MOVQ SI, g(CX) + MOVQ (g_sched+gobuf_sp)(SI), SP + MOVQ 0(SP), AX + MOVQ AX, (g_sched+gobuf_sp)(SI) + + // If the m on entry was nil, we called needm above to borrow an m + // for the duration of the call. Since the call is over, return it with dropm. + CMPQ R8, $0 + JNE 3(PC) + MOVQ $runtime·dropm(SB), AX + CALL AX + + // Done! + RET + +// void setg(G*); set g. for use by needm. +TEXT runtime·setg(SB), NOSPLIT, $0-8 + MOVQ gg+0(FP), BX +#ifdef GOOS_windows + CMPQ BX, $0 + JNE settls + MOVQ $0, 0x28(GS) + RET +settls: + MOVQ g_m(BX), AX + LEAQ m_tls(AX), AX + MOVQ AX, 0x28(GS) +#endif + get_tls(CX) + MOVQ BX, g(CX) + RET + +// void setg_gcc(G*); set g called from gcc. 
+TEXT setg_gcc<>(SB),NOSPLIT,$0 + get_tls(AX) + MOVQ DI, g(AX) + RET + +// check that SP is in range [g->stack.lo, g->stack.hi) +TEXT runtime·stackcheck(SB), NOSPLIT, $0-0 + get_tls(CX) + MOVQ g(CX), AX + CMPQ (g_stack+stack_hi)(AX), SP + JHI 2(PC) + INT $3 + CMPQ SP, (g_stack+stack_lo)(AX) + JHI 2(PC) + INT $3 + RET + +TEXT runtime·getcallerpc(SB),NOSPLIT,$0-16 + MOVQ argp+0(FP),AX // addr of first arg + MOVQ -8(AX),AX // get calling pc + MOVQ AX, ret+8(FP) + RET + +TEXT runtime·gogetcallerpc(SB),NOSPLIT,$0-16 + MOVQ p+0(FP),AX // addr of first arg + MOVQ -8(AX),AX // get calling pc + MOVQ AX,ret+8(FP) + RET + +TEXT runtime·setcallerpc(SB),NOSPLIT,$0-16 + MOVQ argp+0(FP),AX // addr of first arg + MOVQ pc+8(FP), BX + MOVQ BX, -8(AX) // set calling pc + RET + +TEXT runtime·getcallersp(SB),NOSPLIT,$0-16 + MOVQ argp+0(FP), AX + MOVQ AX, ret+8(FP) + RET + +// func gogetcallersp(p unsafe.Pointer) uintptr +TEXT runtime·gogetcallersp(SB),NOSPLIT,$0-16 + MOVQ p+0(FP),AX // addr of first arg + MOVQ AX, ret+8(FP) + RET + +// int64 runtime·cputicks(void) +TEXT runtime·cputicks(SB),NOSPLIT,$0-0 + RDTSC + SHLQ $32, DX + ADDQ DX, AX + MOVQ AX, ret+0(FP) + RET + +// hash function using AES hardware instructions +TEXT runtime·aeshash(SB),NOSPLIT,$0-32 + MOVQ p+0(FP), AX // ptr to data + MOVQ s+8(FP), CX // size + JMP runtime·aeshashbody(SB) + +TEXT runtime·aeshashstr(SB),NOSPLIT,$0-32 + MOVQ p+0(FP), AX // ptr to string struct + // s+8(FP) is ignored, it is always sizeof(String) + MOVQ 8(AX), CX // length of string + MOVQ (AX), AX // string data + JMP runtime·aeshashbody(SB) + +// AX: data +// CX: length +TEXT runtime·aeshashbody(SB),NOSPLIT,$0-32 + MOVQ h+16(FP), X0 // seed to low 64 bits of xmm0 + PINSRQ $1, CX, X0 // size to high 64 bits of xmm0 + MOVO runtime·aeskeysched+0(SB), X2 + MOVO runtime·aeskeysched+16(SB), X3 + CMPQ CX, $16 + JB aessmall +aesloop: + CMPQ CX, $16 + JBE aesloopend + MOVOU (AX), X1 + AESENC X2, X0 + AESENC X1, X0 + SUBQ $16, CX + ADDQ $16, AX + JMP aesloop +// 1-16 bytes remaining +aesloopend: + // This load may overlap with the previous load above. + // We'll hash some bytes twice, but that's ok. + MOVOU -16(AX)(CX*1), X1 + JMP partial +// 0-15 bytes +aessmall: + TESTQ CX, CX + JE finalize // 0 bytes + + CMPB AX, $0xf0 + JA highpartial + + // 16 bytes loaded at this address won't cross + // a page boundary, so we can load it directly. + MOVOU (AX), X1 + ADDQ CX, CX + MOVQ $masks<>(SB), BP + PAND (BP)(CX*8), X1 + JMP partial +highpartial: + // address ends in 1111xxxx. Might be up against + // a page boundary, so load ending at last byte. + // Then shift bytes down using pshufb. 
+ MOVOU -16(AX)(CX*1), X1 + ADDQ CX, CX + MOVQ $shifts<>(SB), BP + PSHUFB (BP)(CX*8), X1 +partial: + // incorporate partial block into hash + AESENC X3, X0 + AESENC X1, X0 +finalize: + // finalize hash + AESENC X2, X0 + AESENC X3, X0 + AESENC X2, X0 + MOVQ X0, res+24(FP) + RET + +TEXT runtime·aeshash32(SB),NOSPLIT,$0-32 + MOVQ p+0(FP), AX // ptr to data + // s+8(FP) is ignored, it is always sizeof(int32) + MOVQ h+16(FP), X0 // seed + PINSRD $2, (AX), X0 // data + AESENC runtime·aeskeysched+0(SB), X0 + AESENC runtime·aeskeysched+16(SB), X0 + AESENC runtime·aeskeysched+0(SB), X0 + MOVQ X0, ret+24(FP) + RET + +TEXT runtime·aeshash64(SB),NOSPLIT,$0-32 + MOVQ p+0(FP), AX // ptr to data + // s+8(FP) is ignored, it is always sizeof(int64) + MOVQ h+16(FP), X0 // seed + PINSRQ $1, (AX), X0 // data + AESENC runtime·aeskeysched+0(SB), X0 + AESENC runtime·aeskeysched+16(SB), X0 + AESENC runtime·aeskeysched+0(SB), X0 + MOVQ X0, ret+24(FP) + RET + +// simple mask to get rid of data in the high part of the register. +DATA masks<>+0x00(SB)/8, $0x0000000000000000 +DATA masks<>+0x08(SB)/8, $0x0000000000000000 +DATA masks<>+0x10(SB)/8, $0x00000000000000ff +DATA masks<>+0x18(SB)/8, $0x0000000000000000 +DATA masks<>+0x20(SB)/8, $0x000000000000ffff +DATA masks<>+0x28(SB)/8, $0x0000000000000000 +DATA masks<>+0x30(SB)/8, $0x0000000000ffffff +DATA masks<>+0x38(SB)/8, $0x0000000000000000 +DATA masks<>+0x40(SB)/8, $0x00000000ffffffff +DATA masks<>+0x48(SB)/8, $0x0000000000000000 +DATA masks<>+0x50(SB)/8, $0x000000ffffffffff +DATA masks<>+0x58(SB)/8, $0x0000000000000000 +DATA masks<>+0x60(SB)/8, $0x0000ffffffffffff +DATA masks<>+0x68(SB)/8, $0x0000000000000000 +DATA masks<>+0x70(SB)/8, $0x00ffffffffffffff +DATA masks<>+0x78(SB)/8, $0x0000000000000000 +DATA masks<>+0x80(SB)/8, $0xffffffffffffffff +DATA masks<>+0x88(SB)/8, $0x0000000000000000 +DATA masks<>+0x90(SB)/8, $0xffffffffffffffff +DATA masks<>+0x98(SB)/8, $0x00000000000000ff +DATA masks<>+0xa0(SB)/8, $0xffffffffffffffff +DATA masks<>+0xa8(SB)/8, $0x000000000000ffff +DATA masks<>+0xb0(SB)/8, $0xffffffffffffffff +DATA masks<>+0xb8(SB)/8, $0x0000000000ffffff +DATA masks<>+0xc0(SB)/8, $0xffffffffffffffff +DATA masks<>+0xc8(SB)/8, $0x00000000ffffffff +DATA masks<>+0xd0(SB)/8, $0xffffffffffffffff +DATA masks<>+0xd8(SB)/8, $0x000000ffffffffff +DATA masks<>+0xe0(SB)/8, $0xffffffffffffffff +DATA masks<>+0xe8(SB)/8, $0x0000ffffffffffff +DATA masks<>+0xf0(SB)/8, $0xffffffffffffffff +DATA masks<>+0xf8(SB)/8, $0x00ffffffffffffff +GLOBL masks<>(SB),RODATA,$256 + +// these are arguments to pshufb. They move data down from +// the high bytes of the register to the low bytes of the register. +// index is how many bytes to move. 
+DATA shifts<>+0x00(SB)/8, $0x0000000000000000 +DATA shifts<>+0x08(SB)/8, $0x0000000000000000 +DATA shifts<>+0x10(SB)/8, $0xffffffffffffff0f +DATA shifts<>+0x18(SB)/8, $0xffffffffffffffff +DATA shifts<>+0x20(SB)/8, $0xffffffffffff0f0e +DATA shifts<>+0x28(SB)/8, $0xffffffffffffffff +DATA shifts<>+0x30(SB)/8, $0xffffffffff0f0e0d +DATA shifts<>+0x38(SB)/8, $0xffffffffffffffff +DATA shifts<>+0x40(SB)/8, $0xffffffff0f0e0d0c +DATA shifts<>+0x48(SB)/8, $0xffffffffffffffff +DATA shifts<>+0x50(SB)/8, $0xffffff0f0e0d0c0b +DATA shifts<>+0x58(SB)/8, $0xffffffffffffffff +DATA shifts<>+0x60(SB)/8, $0xffff0f0e0d0c0b0a +DATA shifts<>+0x68(SB)/8, $0xffffffffffffffff +DATA shifts<>+0x70(SB)/8, $0xff0f0e0d0c0b0a09 +DATA shifts<>+0x78(SB)/8, $0xffffffffffffffff +DATA shifts<>+0x80(SB)/8, $0x0f0e0d0c0b0a0908 +DATA shifts<>+0x88(SB)/8, $0xffffffffffffffff +DATA shifts<>+0x90(SB)/8, $0x0e0d0c0b0a090807 +DATA shifts<>+0x98(SB)/8, $0xffffffffffffff0f +DATA shifts<>+0xa0(SB)/8, $0x0d0c0b0a09080706 +DATA shifts<>+0xa8(SB)/8, $0xffffffffffff0f0e +DATA shifts<>+0xb0(SB)/8, $0x0c0b0a0908070605 +DATA shifts<>+0xb8(SB)/8, $0xffffffffff0f0e0d +DATA shifts<>+0xc0(SB)/8, $0x0b0a090807060504 +DATA shifts<>+0xc8(SB)/8, $0xffffffff0f0e0d0c +DATA shifts<>+0xd0(SB)/8, $0x0a09080706050403 +DATA shifts<>+0xd8(SB)/8, $0xffffff0f0e0d0c0b +DATA shifts<>+0xe0(SB)/8, $0x0908070605040302 +DATA shifts<>+0xe8(SB)/8, $0xffff0f0e0d0c0b0a +DATA shifts<>+0xf0(SB)/8, $0x0807060504030201 +DATA shifts<>+0xf8(SB)/8, $0xff0f0e0d0c0b0a09 +GLOBL shifts<>(SB),RODATA,$256 + +TEXT runtime·memeq(SB),NOSPLIT,$0-25 + MOVQ a+0(FP), SI + MOVQ b+8(FP), DI + MOVQ size+16(FP), BX + CALL runtime·memeqbody(SB) + MOVB AX, ret+24(FP) + RET + +// eqstring tests whether two strings are equal. +// See runtime_test.go:eqstring_generic for +// equivalent Go code. +TEXT runtime·eqstring(SB),NOSPLIT,$0-33 + MOVQ s1len+8(FP), AX + MOVQ s2len+24(FP), BX + CMPQ AX, BX + JNE different + MOVQ s1str+0(FP), SI + MOVQ s2str+16(FP), DI + CMPQ SI, DI + JEQ same + CALL runtime·memeqbody(SB) + MOVB AX, v+32(FP) + RET +same: + MOVB $1, v+32(FP) + RET +different: + MOVB $0, v+32(FP) + RET + +// a in SI +// b in DI +// count in BX +TEXT runtime·memeqbody(SB),NOSPLIT,$0-0 + XORQ AX, AX + + CMPQ BX, $8 + JB small + + // 64 bytes at a time using xmm registers +hugeloop: + CMPQ BX, $64 + JB bigloop + MOVOU (SI), X0 + MOVOU (DI), X1 + MOVOU 16(SI), X2 + MOVOU 16(DI), X3 + MOVOU 32(SI), X4 + MOVOU 32(DI), X5 + MOVOU 48(SI), X6 + MOVOU 48(DI), X7 + PCMPEQB X1, X0 + PCMPEQB X3, X2 + PCMPEQB X5, X4 + PCMPEQB X7, X6 + PAND X2, X0 + PAND X6, X4 + PAND X4, X0 + PMOVMSKB X0, DX + ADDQ $64, SI + ADDQ $64, DI + SUBQ $64, BX + CMPL DX, $0xffff + JEQ hugeloop + RET + + // 8 bytes at a time using 64-bit register +bigloop: + CMPQ BX, $8 + JBE leftover + MOVQ (SI), CX + MOVQ (DI), DX + ADDQ $8, SI + ADDQ $8, DI + SUBQ $8, BX + CMPQ CX, DX + JEQ bigloop + RET + + // remaining 0-8 bytes +leftover: + MOVQ -8(SI)(BX*1), CX + MOVQ -8(DI)(BX*1), DX + CMPQ CX, DX + SETEQ AX + RET + +small: + CMPQ BX, $0 + JEQ equal + + LEAQ 0(BX*8), CX + NEGQ CX + + CMPB SI, $0xf8 + JA si_high + + // load at SI won't cross a page boundary. + MOVQ (SI), SI + JMP si_finish +si_high: + // address ends in 11111xxx. Load up to bytes we want, move to correct position. + MOVQ -8(SI)(BX*1), SI + SHRQ CX, SI +si_finish: + + // same for DI. 
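runtime·eqstring above points at runtime_test.go:eqstring_generic for equivalent Go code; the structure visible in the assembly is: different lengths mean not equal, identical data pointers mean equal, otherwise fall through to memeqbody. A sketch of that logic in plain Go (the pointer short-circuit is omitted, since ordinary Go code has no access to the string header):

package main

import "fmt"

// eqstringGeneric mirrors the structure of runtime·eqstring: length check
// first, then a byte-by-byte comparison (the assembly additionally
// short-circuits when both strings share the same backing pointer).
func eqstringGeneric(s1, s2 string) bool {
	if len(s1) != len(s2) {
		return false
	}
	for i := 0; i < len(s1); i++ {
		if s1[i] != s2[i] {
			return false
		}
	}
	return true
}

func main() {
	fmt.Println(eqstringGeneric("go", "go"), eqstringGeneric("go", "gc")) // true false
}
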
+ CMPB DI, $0xf8 + JA di_high + MOVQ (DI), DI + JMP di_finish +di_high: + MOVQ -8(DI)(BX*1), DI + SHRQ CX, DI +di_finish: + + SUBQ SI, DI + SHLQ CX, DI +equal: + SETEQ AX + RET + +TEXT runtime·cmpstring(SB),NOSPLIT,$0-40 + MOVQ s1_base+0(FP), SI + MOVQ s1_len+8(FP), BX + MOVQ s2_base+16(FP), DI + MOVQ s2_len+24(FP), DX + CALL runtime·cmpbody(SB) + MOVQ AX, ret+32(FP) + RET + +TEXT runtime·cmpbytes(SB),NOSPLIT,$0-56 + MOVQ s1+0(FP), SI + MOVQ s1+8(FP), BX + MOVQ s2+24(FP), DI + MOVQ s2+32(FP), DX + CALL runtime·cmpbody(SB) + MOVQ AX, res+48(FP) + RET + +// input: +// SI = a +// DI = b +// BX = alen +// DX = blen +// output: +// AX = 1/0/-1 +TEXT runtime·cmpbody(SB),NOSPLIT,$0-0 + CMPQ SI, DI + JEQ cmp_allsame + CMPQ BX, DX + MOVQ DX, BP + CMOVQLT BX, BP // BP = min(alen, blen) = # of bytes to compare + CMPQ BP, $8 + JB cmp_small + +cmp_loop: + CMPQ BP, $16 + JBE cmp_0through16 + MOVOU (SI), X0 + MOVOU (DI), X1 + PCMPEQB X0, X1 + PMOVMSKB X1, AX + XORQ $0xffff, AX // convert EQ to NE + JNE cmp_diff16 // branch if at least one byte is not equal + ADDQ $16, SI + ADDQ $16, DI + SUBQ $16, BP + JMP cmp_loop + + // AX = bit mask of differences +cmp_diff16: + BSFQ AX, BX // index of first byte that differs + XORQ AX, AX + MOVB (SI)(BX*1), CX + CMPB CX, (DI)(BX*1) + SETHI AX + LEAQ -1(AX*2), AX // convert 1/0 to +1/-1 + RET + + // 0 through 16 bytes left, alen>=8, blen>=8 +cmp_0through16: + CMPQ BP, $8 + JBE cmp_0through8 + MOVQ (SI), AX + MOVQ (DI), CX + CMPQ AX, CX + JNE cmp_diff8 +cmp_0through8: + MOVQ -8(SI)(BP*1), AX + MOVQ -8(DI)(BP*1), CX + CMPQ AX, CX + JEQ cmp_allsame + + // AX and CX contain parts of a and b that differ. +cmp_diff8: + BSWAPQ AX // reverse order of bytes + BSWAPQ CX + XORQ AX, CX + BSRQ CX, CX // index of highest bit difference + SHRQ CX, AX // move a's bit to bottom + ANDQ $1, AX // mask bit + LEAQ -1(AX*2), AX // 1/0 => +1/-1 + RET + + // 0-7 bytes in common +cmp_small: + LEAQ (BP*8), CX // bytes left -> bits left + NEGQ CX // - bits lift (== 64 - bits left mod 64) + JEQ cmp_allsame + + // load bytes of a into high bytes of AX + CMPB SI, $0xf8 + JA cmp_si_high + MOVQ (SI), SI + JMP cmp_si_finish +cmp_si_high: + MOVQ -8(SI)(BP*1), SI + SHRQ CX, SI +cmp_si_finish: + SHLQ CX, SI + + // load bytes of b in to high bytes of BX + CMPB DI, $0xf8 + JA cmp_di_high + MOVQ (DI), DI + JMP cmp_di_finish +cmp_di_high: + MOVQ -8(DI)(BP*1), DI + SHRQ CX, DI +cmp_di_finish: + SHLQ CX, DI + + BSWAPQ SI // reverse order of bytes + BSWAPQ DI + XORQ SI, DI // find bit differences + JEQ cmp_allsame + BSRQ DI, CX // index of highest bit difference + SHRQ CX, SI // move a's bit to bottom + ANDQ $1, SI // mask bit + LEAQ -1(SI*2), AX // 1/0 => +1/-1 + RET + +cmp_allsame: + XORQ AX, AX + XORQ CX, CX + CMPQ BX, DX + SETGT AX // 1 if alen > blen + SETEQ CX // 1 if alen == blen + LEAQ -1(CX)(AX*2), AX // 1,0,-1 result + RET + +TEXT bytes·IndexByte(SB),NOSPLIT,$0 + MOVQ s+0(FP), SI + MOVQ s_len+8(FP), BX + MOVB c+24(FP), AL + CALL runtime·indexbytebody(SB) + MOVQ AX, ret+32(FP) + RET + +TEXT strings·IndexByte(SB),NOSPLIT,$0 + MOVQ s+0(FP), SI + MOVQ s_len+8(FP), BX + MOVB c+16(FP), AL + CALL runtime·indexbytebody(SB) + MOVQ AX, ret+24(FP) + RET + +// input: +// SI: data +// BX: data len +// AL: byte sought +// output: +// AX +TEXT runtime·indexbytebody(SB),NOSPLIT,$0 + MOVQ SI, DI + + CMPQ BX, $16 + JLT indexbyte_small + + // round up to first 16-byte boundary + TESTQ $15, SI + JZ aligned + MOVQ SI, CX + ANDQ $~15, CX + ADDQ $16, CX + + // search the beginning + SUBQ SI, CX + REPN; SCASB + JZ success + 
+// DI is 16-byte aligned; get ready to search using SSE instructions +aligned: + // round down to last 16-byte boundary + MOVQ BX, R11 + ADDQ SI, R11 + ANDQ $~15, R11 + + // shuffle X0 around so that each byte contains c + MOVD AX, X0 + PUNPCKLBW X0, X0 + PUNPCKLBW X0, X0 + PSHUFL $0, X0, X0 + JMP condition + +sse: + // move the next 16-byte chunk of the buffer into X1 + MOVO (DI), X1 + // compare bytes in X0 to X1 + PCMPEQB X0, X1 + // take the top bit of each byte in X1 and put the result in DX + PMOVMSKB X1, DX + TESTL DX, DX + JNZ ssesuccess + ADDQ $16, DI + +condition: + CMPQ DI, R11 + JLT sse + + // search the end + MOVQ SI, CX + ADDQ BX, CX + SUBQ R11, CX + // if CX == 0, the zero flag will be set and we'll end up + // returning a false success + JZ failure + REPN; SCASB + JZ success + +failure: + MOVQ $-1, AX + RET + +// handle for lengths < 16 +indexbyte_small: + MOVQ BX, CX + REPN; SCASB + JZ success + MOVQ $-1, AX + RET + +// we've found the chunk containing the byte +// now just figure out which specific byte it is +ssesuccess: + // get the index of the least significant set bit + BSFW DX, DX + SUBQ SI, DI + ADDQ DI, DX + MOVQ DX, AX + RET + +success: + SUBQ SI, DI + SUBL $1, DI + MOVQ DI, AX + RET + +TEXT bytes·Equal(SB),NOSPLIT,$0-49 + MOVQ a_len+8(FP), BX + MOVQ b_len+32(FP), CX + XORQ AX, AX + CMPQ BX, CX + JNE eqret + MOVQ a+0(FP), SI + MOVQ b+24(FP), DI + CALL runtime·memeqbody(SB) +eqret: + MOVB AX, ret+48(FP) + RET + +// A Duff's device for zeroing memory. +// The compiler jumps to computed addresses within +// this routine to zero chunks of memory. Do not +// change this code without also changing the code +// in ../../cmd/6g/ggen.c:clearfat. +// AX: zero +// DI: ptr to memory to be zeroed +// DI is updated as a side effect. +TEXT runtime·duffzero(SB), NOSPLIT, $0-0 + STOSQ + STOSQ + STOSQ + STOSQ + STOSQ + STOSQ + STOSQ + STOSQ + STOSQ + STOSQ + STOSQ + STOSQ + STOSQ + STOSQ + STOSQ + STOSQ + STOSQ + STOSQ + STOSQ + STOSQ + STOSQ + STOSQ + STOSQ + STOSQ + STOSQ + STOSQ + STOSQ + STOSQ + STOSQ + STOSQ + STOSQ + STOSQ + STOSQ + STOSQ + STOSQ + STOSQ + STOSQ + STOSQ + STOSQ + STOSQ + STOSQ + STOSQ + STOSQ + STOSQ + STOSQ + STOSQ + STOSQ + STOSQ + STOSQ + STOSQ + STOSQ + STOSQ + STOSQ + STOSQ + STOSQ + STOSQ + STOSQ + STOSQ + STOSQ + STOSQ + STOSQ + STOSQ + STOSQ + STOSQ + STOSQ + STOSQ + STOSQ + STOSQ + STOSQ + STOSQ + STOSQ + STOSQ + STOSQ + STOSQ + STOSQ + STOSQ + STOSQ + STOSQ + STOSQ + STOSQ + STOSQ + STOSQ + STOSQ + STOSQ + STOSQ + STOSQ + STOSQ + STOSQ + STOSQ + STOSQ + STOSQ + STOSQ + STOSQ + STOSQ + STOSQ + STOSQ + STOSQ + STOSQ + STOSQ + STOSQ + STOSQ + STOSQ + STOSQ + STOSQ + STOSQ + STOSQ + STOSQ + STOSQ + STOSQ + STOSQ + STOSQ + STOSQ + STOSQ + STOSQ + STOSQ + STOSQ + STOSQ + STOSQ + STOSQ + STOSQ + STOSQ + STOSQ + STOSQ + STOSQ + STOSQ + STOSQ + STOSQ + STOSQ + RET + +// A Duff's device for copying memory. +// The compiler jumps to computed addresses within +// this routine to copy chunks of memory. Source +// and destination must not overlap. Do not +// change this code without also changing the code +// in ../../cmd/6g/cgen.c:sgen. +// SI: ptr to source memory +// DI: ptr to destination memory +// SI and DI are updated as a side effect. + +// NOTE: this is equivalent to a sequence of MOVSQ but +// for some reason that is 3.5x slower than this code. +// The STOSQ above seem fine, though. 
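duffzero and duffcopy are unrolled zero/copy routines: the compiler jumps to a computed offset inside them so that exactly the requested number of 8-byte words is processed with no loop counter. Go has no computed goto, so the sketch below shows only the effect of such a call, not the mechanism (duffzeroEffect and duffcopyEffect are illustrative names):

package main

import "fmt"

// duffzeroEffect shows what entering runtime·duffzero at the right offset
// accomplishes: the first n words of dst are set to zero (one STOSQ per
// word in the real routine).
func duffzeroEffect(dst []uint64, n int) {
	for i := 0; i < n; i++ {
		dst[i] = 0
	}
}

// duffcopyEffect shows what entering runtime·duffcopy accomplishes:
// n non-overlapping words are copied from src to dst (one load/store
// pair per word in the real routine).
func duffcopyEffect(dst, src []uint64, n int) {
	for i := 0; i < n; i++ {
		dst[i] = src[i]
	}
}

func main() {
	src := []uint64{1, 2, 3, 4}
	dst := make([]uint64, 4)
	duffcopyEffect(dst, src, 3)
	duffzeroEffect(dst, 2)
	fmt.Println(dst) // [0 0 3 0]
}
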
+TEXT runtime·duffcopy(SB), NOSPLIT, $0-0 + MOVQ (SI),CX + ADDQ $8,SI + MOVQ CX,(DI) + ADDQ $8,DI + + MOVQ (SI),CX + ADDQ $8,SI + MOVQ CX,(DI) + ADDQ $8,DI + + MOVQ (SI),CX + ADDQ $8,SI + MOVQ CX,(DI) + ADDQ $8,DI + + MOVQ (SI),CX + ADDQ $8,SI + MOVQ CX,(DI) + ADDQ $8,DI + + MOVQ (SI),CX + ADDQ $8,SI + MOVQ CX,(DI) + ADDQ $8,DI + + MOVQ (SI),CX + ADDQ $8,SI + MOVQ CX,(DI) + ADDQ $8,DI + + MOVQ (SI),CX + ADDQ $8,SI + MOVQ CX,(DI) + ADDQ $8,DI + + MOVQ (SI),CX + ADDQ $8,SI + MOVQ CX,(DI) + ADDQ $8,DI + + MOVQ (SI),CX + ADDQ $8,SI + MOVQ CX,(DI) + ADDQ $8,DI + + MOVQ (SI),CX + ADDQ $8,SI + MOVQ CX,(DI) + ADDQ $8,DI + + MOVQ (SI),CX + ADDQ $8,SI + MOVQ CX,(DI) + ADDQ $8,DI + + MOVQ (SI),CX + ADDQ $8,SI + MOVQ CX,(DI) + ADDQ $8,DI + + MOVQ (SI),CX + ADDQ $8,SI + MOVQ CX,(DI) + ADDQ $8,DI + + MOVQ (SI),CX + ADDQ $8,SI + MOVQ CX,(DI) + ADDQ $8,DI + + MOVQ (SI),CX + ADDQ $8,SI + MOVQ CX,(DI) + ADDQ $8,DI + + MOVQ (SI),CX + ADDQ $8,SI + MOVQ CX,(DI) + ADDQ $8,DI + + MOVQ (SI),CX + ADDQ $8,SI + MOVQ CX,(DI) + ADDQ $8,DI + + MOVQ (SI),CX + ADDQ $8,SI + MOVQ CX,(DI) + ADDQ $8,DI + + MOVQ (SI),CX + ADDQ $8,SI + MOVQ CX,(DI) + ADDQ $8,DI + + MOVQ (SI),CX + ADDQ $8,SI + MOVQ CX,(DI) + ADDQ $8,DI + + MOVQ (SI),CX + ADDQ $8,SI + MOVQ CX,(DI) + ADDQ $8,DI + + MOVQ (SI),CX + ADDQ $8,SI + MOVQ CX,(DI) + ADDQ $8,DI + + MOVQ (SI),CX + ADDQ $8,SI + MOVQ CX,(DI) + ADDQ $8,DI + + MOVQ (SI),CX + ADDQ $8,SI + MOVQ CX,(DI) + ADDQ $8,DI + + MOVQ (SI),CX + ADDQ $8,SI + MOVQ CX,(DI) + ADDQ $8,DI + + MOVQ (SI),CX + ADDQ $8,SI + MOVQ CX,(DI) + ADDQ $8,DI + + MOVQ (SI),CX + ADDQ $8,SI + MOVQ CX,(DI) + ADDQ $8,DI + + MOVQ (SI),CX + ADDQ $8,SI + MOVQ CX,(DI) + ADDQ $8,DI + + MOVQ (SI),CX + ADDQ $8,SI + MOVQ CX,(DI) + ADDQ $8,DI + + MOVQ (SI),CX + ADDQ $8,SI + MOVQ CX,(DI) + ADDQ $8,DI + + MOVQ (SI),CX + ADDQ $8,SI + MOVQ CX,(DI) + ADDQ $8,DI + + MOVQ (SI),CX + ADDQ $8,SI + MOVQ CX,(DI) + ADDQ $8,DI + + MOVQ (SI),CX + ADDQ $8,SI + MOVQ CX,(DI) + ADDQ $8,DI + + MOVQ (SI),CX + ADDQ $8,SI + MOVQ CX,(DI) + ADDQ $8,DI + + MOVQ (SI),CX + ADDQ $8,SI + MOVQ CX,(DI) + ADDQ $8,DI + + MOVQ (SI),CX + ADDQ $8,SI + MOVQ CX,(DI) + ADDQ $8,DI + + MOVQ (SI),CX + ADDQ $8,SI + MOVQ CX,(DI) + ADDQ $8,DI + + MOVQ (SI),CX + ADDQ $8,SI + MOVQ CX,(DI) + ADDQ $8,DI + + MOVQ (SI),CX + ADDQ $8,SI + MOVQ CX,(DI) + ADDQ $8,DI + + MOVQ (SI),CX + ADDQ $8,SI + MOVQ CX,(DI) + ADDQ $8,DI + + MOVQ (SI),CX + ADDQ $8,SI + MOVQ CX,(DI) + ADDQ $8,DI + + MOVQ (SI),CX + ADDQ $8,SI + MOVQ CX,(DI) + ADDQ $8,DI + + MOVQ (SI),CX + ADDQ $8,SI + MOVQ CX,(DI) + ADDQ $8,DI + + MOVQ (SI),CX + ADDQ $8,SI + MOVQ CX,(DI) + ADDQ $8,DI + + MOVQ (SI),CX + ADDQ $8,SI + MOVQ CX,(DI) + ADDQ $8,DI + + MOVQ (SI),CX + ADDQ $8,SI + MOVQ CX,(DI) + ADDQ $8,DI + + MOVQ (SI),CX + ADDQ $8,SI + MOVQ CX,(DI) + ADDQ $8,DI + + MOVQ (SI),CX + ADDQ $8,SI + MOVQ CX,(DI) + ADDQ $8,DI + + MOVQ (SI),CX + ADDQ $8,SI + MOVQ CX,(DI) + ADDQ $8,DI + + MOVQ (SI),CX + ADDQ $8,SI + MOVQ CX,(DI) + ADDQ $8,DI + + MOVQ (SI),CX + ADDQ $8,SI + MOVQ CX,(DI) + ADDQ $8,DI + + MOVQ (SI),CX + ADDQ $8,SI + MOVQ CX,(DI) + ADDQ $8,DI + + MOVQ (SI),CX + ADDQ $8,SI + MOVQ CX,(DI) + ADDQ $8,DI + + MOVQ (SI),CX + ADDQ $8,SI + MOVQ CX,(DI) + ADDQ $8,DI + + MOVQ (SI),CX + ADDQ $8,SI + MOVQ CX,(DI) + ADDQ $8,DI + + MOVQ (SI),CX + ADDQ $8,SI + MOVQ CX,(DI) + ADDQ $8,DI + + MOVQ (SI),CX + ADDQ $8,SI + MOVQ CX,(DI) + ADDQ $8,DI + + MOVQ (SI),CX + ADDQ $8,SI + MOVQ CX,(DI) + ADDQ $8,DI + + MOVQ (SI),CX + ADDQ $8,SI + MOVQ CX,(DI) + ADDQ $8,DI + + MOVQ (SI),CX + ADDQ $8,SI + MOVQ CX,(DI) + ADDQ $8,DI + + MOVQ (SI),CX + ADDQ $8,SI + 
MOVQ CX,(DI) + ADDQ $8,DI + + MOVQ (SI),CX + ADDQ $8,SI + MOVQ CX,(DI) + ADDQ $8,DI + + MOVQ (SI),CX + ADDQ $8,SI + MOVQ CX,(DI) + ADDQ $8,DI + + MOVQ (SI),CX + ADDQ $8,SI + MOVQ CX,(DI) + ADDQ $8,DI + + MOVQ (SI),CX + ADDQ $8,SI + MOVQ CX,(DI) + ADDQ $8,DI + + MOVQ (SI),CX + ADDQ $8,SI + MOVQ CX,(DI) + ADDQ $8,DI + + MOVQ (SI),CX + ADDQ $8,SI + MOVQ CX,(DI) + ADDQ $8,DI + + MOVQ (SI),CX + ADDQ $8,SI + MOVQ CX,(DI) + ADDQ $8,DI + + MOVQ (SI),CX + ADDQ $8,SI + MOVQ CX,(DI) + ADDQ $8,DI + + MOVQ (SI),CX + ADDQ $8,SI + MOVQ CX,(DI) + ADDQ $8,DI + + MOVQ (SI),CX + ADDQ $8,SI + MOVQ CX,(DI) + ADDQ $8,DI + + MOVQ (SI),CX + ADDQ $8,SI + MOVQ CX,(DI) + ADDQ $8,DI + + MOVQ (SI),CX + ADDQ $8,SI + MOVQ CX,(DI) + ADDQ $8,DI + + MOVQ (SI),CX + ADDQ $8,SI + MOVQ CX,(DI) + ADDQ $8,DI + + MOVQ (SI),CX + ADDQ $8,SI + MOVQ CX,(DI) + ADDQ $8,DI + + MOVQ (SI),CX + ADDQ $8,SI + MOVQ CX,(DI) + ADDQ $8,DI + + MOVQ (SI),CX + ADDQ $8,SI + MOVQ CX,(DI) + ADDQ $8,DI + + MOVQ (SI),CX + ADDQ $8,SI + MOVQ CX,(DI) + ADDQ $8,DI + + MOVQ (SI),CX + ADDQ $8,SI + MOVQ CX,(DI) + ADDQ $8,DI + + MOVQ (SI),CX + ADDQ $8,SI + MOVQ CX,(DI) + ADDQ $8,DI + + MOVQ (SI),CX + ADDQ $8,SI + MOVQ CX,(DI) + ADDQ $8,DI + + MOVQ (SI),CX + ADDQ $8,SI + MOVQ CX,(DI) + ADDQ $8,DI + + MOVQ (SI),CX + ADDQ $8,SI + MOVQ CX,(DI) + ADDQ $8,DI + + MOVQ (SI),CX + ADDQ $8,SI + MOVQ CX,(DI) + ADDQ $8,DI + + MOVQ (SI),CX + ADDQ $8,SI + MOVQ CX,(DI) + ADDQ $8,DI + + MOVQ (SI),CX + ADDQ $8,SI + MOVQ CX,(DI) + ADDQ $8,DI + + MOVQ (SI),CX + ADDQ $8,SI + MOVQ CX,(DI) + ADDQ $8,DI + + MOVQ (SI),CX + ADDQ $8,SI + MOVQ CX,(DI) + ADDQ $8,DI + + MOVQ (SI),CX + ADDQ $8,SI + MOVQ CX,(DI) + ADDQ $8,DI + + MOVQ (SI),CX + ADDQ $8,SI + MOVQ CX,(DI) + ADDQ $8,DI + + MOVQ (SI),CX + ADDQ $8,SI + MOVQ CX,(DI) + ADDQ $8,DI + + MOVQ (SI),CX + ADDQ $8,SI + MOVQ CX,(DI) + ADDQ $8,DI + + MOVQ (SI),CX + ADDQ $8,SI + MOVQ CX,(DI) + ADDQ $8,DI + + MOVQ (SI),CX + ADDQ $8,SI + MOVQ CX,(DI) + ADDQ $8,DI + + MOVQ (SI),CX + ADDQ $8,SI + MOVQ CX,(DI) + ADDQ $8,DI + + MOVQ (SI),CX + ADDQ $8,SI + MOVQ CX,(DI) + ADDQ $8,DI + + MOVQ (SI),CX + ADDQ $8,SI + MOVQ CX,(DI) + ADDQ $8,DI + + MOVQ (SI),CX + ADDQ $8,SI + MOVQ CX,(DI) + ADDQ $8,DI + + MOVQ (SI),CX + ADDQ $8,SI + MOVQ CX,(DI) + ADDQ $8,DI + + MOVQ (SI),CX + ADDQ $8,SI + MOVQ CX,(DI) + ADDQ $8,DI + + MOVQ (SI),CX + ADDQ $8,SI + MOVQ CX,(DI) + ADDQ $8,DI + + MOVQ (SI),CX + ADDQ $8,SI + MOVQ CX,(DI) + ADDQ $8,DI + + MOVQ (SI),CX + ADDQ $8,SI + MOVQ CX,(DI) + ADDQ $8,DI + + MOVQ (SI),CX + ADDQ $8,SI + MOVQ CX,(DI) + ADDQ $8,DI + + MOVQ (SI),CX + ADDQ $8,SI + MOVQ CX,(DI) + ADDQ $8,DI + + MOVQ (SI),CX + ADDQ $8,SI + MOVQ CX,(DI) + ADDQ $8,DI + + MOVQ (SI),CX + ADDQ $8,SI + MOVQ CX,(DI) + ADDQ $8,DI + + MOVQ (SI),CX + ADDQ $8,SI + MOVQ CX,(DI) + ADDQ $8,DI + + MOVQ (SI),CX + ADDQ $8,SI + MOVQ CX,(DI) + ADDQ $8,DI + + MOVQ (SI),CX + ADDQ $8,SI + MOVQ CX,(DI) + ADDQ $8,DI + + MOVQ (SI),CX + ADDQ $8,SI + MOVQ CX,(DI) + ADDQ $8,DI + + MOVQ (SI),CX + ADDQ $8,SI + MOVQ CX,(DI) + ADDQ $8,DI + + MOVQ (SI),CX + ADDQ $8,SI + MOVQ CX,(DI) + ADDQ $8,DI + + MOVQ (SI),CX + ADDQ $8,SI + MOVQ CX,(DI) + ADDQ $8,DI + + MOVQ (SI),CX + ADDQ $8,SI + MOVQ CX,(DI) + ADDQ $8,DI + + MOVQ (SI),CX + ADDQ $8,SI + MOVQ CX,(DI) + ADDQ $8,DI + + MOVQ (SI),CX + ADDQ $8,SI + MOVQ CX,(DI) + ADDQ $8,DI + + MOVQ (SI),CX + ADDQ $8,SI + MOVQ CX,(DI) + ADDQ $8,DI + + MOVQ (SI),CX + ADDQ $8,SI + MOVQ CX,(DI) + ADDQ $8,DI + + MOVQ (SI),CX + ADDQ $8,SI + MOVQ CX,(DI) + ADDQ $8,DI + + MOVQ (SI),CX + ADDQ $8,SI + MOVQ CX,(DI) + ADDQ $8,DI + + MOVQ (SI),CX + ADDQ $8,SI + MOVQ CX,(DI) + 
ADDQ $8,DI + + MOVQ (SI),CX + ADDQ $8,SI + MOVQ CX,(DI) + ADDQ $8,DI + + MOVQ (SI),CX + ADDQ $8,SI + MOVQ CX,(DI) + ADDQ $8,DI + + MOVQ (SI),CX + ADDQ $8,SI + MOVQ CX,(DI) + ADDQ $8,DI + + MOVQ (SI),CX + ADDQ $8,SI + MOVQ CX,(DI) + ADDQ $8,DI + + MOVQ (SI),CX + ADDQ $8,SI + MOVQ CX,(DI) + ADDQ $8,DI + + MOVQ (SI),CX + ADDQ $8,SI + MOVQ CX,(DI) + ADDQ $8,DI + + RET + +TEXT runtime·fastrand1(SB), NOSPLIT, $0-4 + get_tls(CX) + MOVQ g(CX), AX + MOVQ g_m(AX), AX + MOVL m_fastrand(AX), DX + ADDL DX, DX + MOVL DX, BX + XORL $0x88888eef, DX + CMOVLMI BX, DX + MOVL DX, m_fastrand(AX) + MOVL DX, ret+0(FP) + RET + +TEXT runtime·return0(SB), NOSPLIT, $0 + MOVL $0, AX + RET + + +// Called from cgo wrappers, this function returns g->m->curg.stack.hi. +// Must obey the gcc calling convention. +TEXT _cgo_topofstack(SB),NOSPLIT,$0 + get_tls(CX) + MOVQ g(CX), AX + MOVQ g_m(AX), AX + MOVQ m_curg(AX), AX + MOVQ (g_stack+stack_hi)(AX), AX + RET + +// The top-most function running on a goroutine +// returns to goexit+PCQuantum. +TEXT runtime·goexit(SB),NOSPLIT,$0-0 + BYTE $0x90 // NOP + CALL runtime·goexit1(SB) // does not return diff --git a/src/runtime/asm_amd64p32.s b/src/runtime/asm_amd64p32.s new file mode 100644 index 000000000..a1116b5d4 --- /dev/null +++ b/src/runtime/asm_amd64p32.s @@ -0,0 +1,1087 @@ +// Copyright 2009 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#include "zasm_GOOS_GOARCH.h" +#include "funcdata.h" +#include "textflag.h" + +TEXT runtime·rt0_go(SB),NOSPLIT,$0 + // copy arguments forward on an even stack + MOVL argc+0(FP), AX + MOVL argv+4(FP), BX + MOVL SP, CX + SUBL $128, SP // plenty of scratch + ANDL $~15, CX + MOVL CX, SP + + MOVL AX, 16(SP) + MOVL BX, 24(SP) + + // create istack out of the given (operating system) stack. + MOVL $runtime·g0(SB), DI + LEAL (-64*1024+104)(SP), BX + MOVL BX, g_stackguard0(DI) + MOVL BX, g_stackguard1(DI) + MOVL BX, (g_stack+stack_lo)(DI) + MOVL SP, (g_stack+stack_hi)(DI) + + // find out information about the processor we're on + MOVQ $0, AX + CPUID + CMPQ AX, $0 + JE nocpuinfo + MOVQ $1, AX + CPUID + MOVL CX, runtime·cpuid_ecx(SB) + MOVL DX, runtime·cpuid_edx(SB) +nocpuinfo: + +needtls: + LEAL runtime·tls0(SB), DI + CALL runtime·settls(SB) + + // store through it, to make sure it works + get_tls(BX) + MOVQ $0x123, g(BX) + MOVQ runtime·tls0(SB), AX + CMPQ AX, $0x123 + JEQ 2(PC) + MOVL AX, 0 // abort +ok: + // set the per-goroutine and per-mach "registers" + get_tls(BX) + LEAL runtime·g0(SB), CX + MOVL CX, g(BX) + LEAL runtime·m0(SB), AX + + // save m->g0 = g0 + MOVL CX, m_g0(AX) + // save m0 to g0->m + MOVL AX, g_m(CX) + + CLD // convention is D is always left cleared + CALL runtime·check(SB) + + MOVL 16(SP), AX // copy argc + MOVL AX, 0(SP) + MOVL 24(SP), AX // copy argv + MOVL AX, 4(SP) + CALL runtime·args(SB) + CALL runtime·osinit(SB) + CALL runtime·schedinit(SB) + + // create a new goroutine to start program + MOVL $runtime·main·f(SB), AX // entry + MOVL $0, 0(SP) + MOVL AX, 4(SP) + CALL runtime·newproc(SB) + + // start this M + CALL runtime·mstart(SB) + + MOVL $0xf1, 0xf1 // crash + RET + +DATA runtime·main·f+0(SB)/4,$runtime·main(SB) +GLOBL runtime·main·f(SB),RODATA,$4 + +TEXT runtime·breakpoint(SB),NOSPLIT,$0-0 + INT $3 + RET + +TEXT runtime·asminit(SB),NOSPLIT,$0-0 + // No per-thread init. 
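runtime·fastrand1 (this amd64 version and the 386 one earlier) advances m->fastrand with a cheap doubling/xor step: the CMOVLMI keeps the un-xored doubled value exactly when the xored one would be negative, which is the same as xoring only when the doubled value has its sign bit set. A Go sketch of that update, with the state passed explicitly instead of living in m.fastrand (fastrandStep is an illustrative name):

package main

import "fmt"

// fastrandStep performs the update implemented by runtime·fastrand1:
// x += x; if the sign bit is now set, x ^= 0x88888eef.
func fastrandStep(x uint32) uint32 {
	x += x
	if int32(x) < 0 {
		x ^= 0x88888eef
	}
	return x
}

func main() {
	x := uint32(0x49f6428a) // arbitrary nonzero seed for illustration
	for i := 0; i < 4; i++ {
		x = fastrandStep(x)
		fmt.Printf("%#08x\n", x)
	}
}
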
+ RET + +/* + * go-routine + */ + +// void gosave(Gobuf*) +// save state in Gobuf; setjmp +TEXT runtime·gosave(SB), NOSPLIT, $0-4 + MOVL buf+0(FP), AX // gobuf + LEAL buf+0(FP), BX // caller's SP + MOVL BX, gobuf_sp(AX) + MOVL 0(SP), BX // caller's PC + MOVL BX, gobuf_pc(AX) + MOVL $0, gobuf_ctxt(AX) + MOVQ $0, gobuf_ret(AX) + get_tls(CX) + MOVL g(CX), BX + MOVL BX, gobuf_g(AX) + RET + +// void gogo(Gobuf*) +// restore state from Gobuf; longjmp +TEXT runtime·gogo(SB), NOSPLIT, $0-4 + MOVL buf+0(FP), BX // gobuf + MOVL gobuf_g(BX), DX + MOVL 0(DX), CX // make sure g != nil + get_tls(CX) + MOVL DX, g(CX) + MOVL gobuf_sp(BX), SP // restore SP + MOVL gobuf_ctxt(BX), DX + MOVQ gobuf_ret(BX), AX + MOVL $0, gobuf_sp(BX) // clear to help garbage collector + MOVQ $0, gobuf_ret(BX) + MOVL $0, gobuf_ctxt(BX) + MOVL gobuf_pc(BX), BX + JMP BX + +// func mcall(fn func(*g)) +// Switch to m->g0's stack, call fn(g). +// Fn must never return. It should gogo(&g->sched) +// to keep running g. +TEXT runtime·mcall(SB), NOSPLIT, $0-4 + MOVL fn+0(FP), DI + + get_tls(CX) + MOVL g(CX), AX // save state in g->sched + MOVL 0(SP), BX // caller's PC + MOVL BX, (g_sched+gobuf_pc)(AX) + LEAL fn+0(FP), BX // caller's SP + MOVL BX, (g_sched+gobuf_sp)(AX) + MOVL AX, (g_sched+gobuf_g)(AX) + + // switch to m->g0 & its stack, call fn + MOVL g(CX), BX + MOVL g_m(BX), BX + MOVL m_g0(BX), SI + CMPL SI, AX // if g == m->g0 call badmcall + JNE 3(PC) + MOVL $runtime·badmcall(SB), AX + JMP AX + MOVL SI, g(CX) // g = m->g0 + MOVL (g_sched+gobuf_sp)(SI), SP // sp = m->g0->sched.sp + PUSHQ AX + MOVL DI, DX + MOVL 0(DI), DI + CALL DI + POPQ AX + MOVL $runtime·badmcall2(SB), AX + JMP AX + RET + +// switchtoM is a dummy routine that onM leaves at the bottom +// of the G stack. We need to distinguish the routine that +// lives at the bottom of the G stack from the one that lives +// at the top of the M stack because the one at the top of +// the M stack terminates the stack walk (see topofstack()). +TEXT runtime·switchtoM(SB), NOSPLIT, $0-0 + RET + +// func onM_signalok(fn func()) +TEXT runtime·onM_signalok(SB), NOSPLIT, $0-4 + get_tls(CX) + MOVL g(CX), AX // AX = g + MOVL g_m(AX), BX // BX = m + MOVL m_gsignal(BX), DX // DX = gsignal + CMPL AX, DX + JEQ ongsignal + JMP runtime·onM(SB) + +ongsignal: + MOVL fn+0(FP), DI // DI = fn + MOVL DI, DX + MOVL 0(DI), DI + CALL DI + RET + +// func onM(fn func()) +TEXT runtime·onM(SB), NOSPLIT, $0-4 + MOVL fn+0(FP), DI // DI = fn + get_tls(CX) + MOVL g(CX), AX // AX = g + MOVL g_m(AX), BX // BX = m + + MOVL m_g0(BX), DX // DX = g0 + CMPL AX, DX + JEQ onm + + MOVL m_curg(BX), R8 + CMPL AX, R8 + JEQ oncurg + + // Not g0, not curg. Must be gsignal, but that's not allowed. + // Hide call from linker nosplit analysis. + MOVL $runtime·badonm(SB), AX + CALL AX + +oncurg: + // save our state in g->sched. Pretend to + // be switchtoM if the G stack is scanned. + MOVL $runtime·switchtoM(SB), SI + MOVL SI, (g_sched+gobuf_pc)(AX) + MOVL SP, (g_sched+gobuf_sp)(AX) + MOVL AX, (g_sched+gobuf_g)(AX) + + // switch to g0 + MOVL DX, g(CX) + MOVL (g_sched+gobuf_sp)(DX), SP + + // call target function + MOVL DI, DX + MOVL 0(DI), DI + CALL DI + + // switch back to g + get_tls(CX) + MOVL g(CX), AX + MOVL g_m(AX), BX + MOVL m_curg(BX), AX + MOVL AX, g(CX) + MOVL (g_sched+gobuf_sp)(AX), SP + MOVL $0, (g_sched+gobuf_sp)(AX) + RET + +onm: + // already on m stack, just call directly + MOVL DI, DX + MOVL 0(DI), DI + CALL DI + RET + +/* + * support for morestack + */ + +// Called during function prolog when more stack is needed. 
+// +// The traceback routines see morestack on a g0 as being +// the top of a stack (for example, morestack calling newstack +// calling the scheduler calling newm calling gc), so we must +// record an argument size. For that purpose, it has no arguments. +TEXT runtime·morestack(SB),NOSPLIT,$0-0 + get_tls(CX) + MOVL g(CX), BX + MOVL g_m(BX), BX + + // Cannot grow scheduler stack (m->g0). + MOVL m_g0(BX), SI + CMPL g(CX), SI + JNE 2(PC) + MOVL 0, AX + + // Cannot grow signal stack (m->gsignal). + MOVL m_gsignal(BX), SI + CMPL g(CX), SI + JNE 2(PC) + MOVL 0, AX + + // Called from f. + // Set m->morebuf to f's caller. + MOVL 8(SP), AX // f's caller's PC + MOVL AX, (m_morebuf+gobuf_pc)(BX) + LEAL 16(SP), AX // f's caller's SP + MOVL AX, (m_morebuf+gobuf_sp)(BX) + get_tls(CX) + MOVL g(CX), SI + MOVL SI, (m_morebuf+gobuf_g)(BX) + + // Set g->sched to context in f. + MOVL 0(SP), AX // f's PC + MOVL AX, (g_sched+gobuf_pc)(SI) + MOVL SI, (g_sched+gobuf_g)(SI) + LEAL 8(SP), AX // f's SP + MOVL AX, (g_sched+gobuf_sp)(SI) + MOVL DX, (g_sched+gobuf_ctxt)(SI) + + // Call newstack on m->g0's stack. + MOVL m_g0(BX), BX + MOVL BX, g(CX) + MOVL (g_sched+gobuf_sp)(BX), SP + CALL runtime·newstack(SB) + MOVL $0, 0x1003 // crash if newstack returns + RET + +// morestack trampolines +TEXT runtime·morestack_noctxt(SB),NOSPLIT,$0 + MOVL $0, DX + JMP runtime·morestack(SB) + +// reflectcall: call a function with the given argument list +// func call(f *FuncVal, arg *byte, argsize, retoffset uint32). +// we don't have variable-sized frames, so we use a small number +// of constant-sized-frame functions to encode a few bits of size in the pc. +// Caution: ugly multiline assembly macros in your future! + +#define DISPATCH(NAME,MAXSIZE) \ + CMPL CX, $MAXSIZE; \ + JA 3(PC); \ + MOVL $NAME(SB), AX; \ + JMP AX +// Note: can't just "JMP NAME(SB)" - bad inlining results. 
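reflectcall cannot use variable-sized frames, so the DISPATCH macro above routes each call to the smallest callNNN stub whose constant frame size (powers of two from 16 bytes up to 1073741824) can hold the argument block; anything larger falls through to badreflectcall. A Go sketch of that selection (dispatchFrameSize is an illustrative name):

package main

import "fmt"

// dispatchFrameSize picks the smallest constant-frame call stub that can
// hold argsize bytes, mirroring the DISPATCH chain in reflectcall.
// It returns 0 when no stub is large enough (the badreflectcall case).
func dispatchFrameSize(argsize uint32) uint32 {
	for size := uint32(16); size <= 1<<30; size <<= 1 {
		if argsize <= size {
			return size
		}
	}
	return 0
}

func main() {
	fmt.Println(dispatchFrameSize(16))      // 16
	fmt.Println(dispatchFrameSize(24))      // 32
	fmt.Println(dispatchFrameSize(5000))    // 8192
	fmt.Println(dispatchFrameSize(2 << 30)) // 0 (too large)
}
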
+ +TEXT ·reflectcall(SB), NOSPLIT, $0-16 + MOVLQZX argsize+8(FP), CX + DISPATCH(runtime·call16, 16) + DISPATCH(runtime·call32, 32) + DISPATCH(runtime·call64, 64) + DISPATCH(runtime·call128, 128) + DISPATCH(runtime·call256, 256) + DISPATCH(runtime·call512, 512) + DISPATCH(runtime·call1024, 1024) + DISPATCH(runtime·call2048, 2048) + DISPATCH(runtime·call4096, 4096) + DISPATCH(runtime·call8192, 8192) + DISPATCH(runtime·call16384, 16384) + DISPATCH(runtime·call32768, 32768) + DISPATCH(runtime·call65536, 65536) + DISPATCH(runtime·call131072, 131072) + DISPATCH(runtime·call262144, 262144) + DISPATCH(runtime·call524288, 524288) + DISPATCH(runtime·call1048576, 1048576) + DISPATCH(runtime·call2097152, 2097152) + DISPATCH(runtime·call4194304, 4194304) + DISPATCH(runtime·call8388608, 8388608) + DISPATCH(runtime·call16777216, 16777216) + DISPATCH(runtime·call33554432, 33554432) + DISPATCH(runtime·call67108864, 67108864) + DISPATCH(runtime·call134217728, 134217728) + DISPATCH(runtime·call268435456, 268435456) + DISPATCH(runtime·call536870912, 536870912) + DISPATCH(runtime·call1073741824, 1073741824) + MOVL $runtime·badreflectcall(SB), AX + JMP AX + +#define CALLFN(NAME,MAXSIZE) \ +TEXT NAME(SB), WRAPPER, $MAXSIZE-16; \ + NO_LOCAL_POINTERS; \ + /* copy arguments to stack */ \ + MOVL argptr+4(FP), SI; \ + MOVL argsize+8(FP), CX; \ + MOVL SP, DI; \ + REP;MOVSB; \ + /* call function */ \ + MOVL f+0(FP), DX; \ + MOVL (DX), AX; \ + CALL AX; \ + /* copy return values back */ \ + MOVL argptr+4(FP), DI; \ + MOVL argsize+8(FP), CX; \ + MOVL retoffset+12(FP), BX; \ + MOVL SP, SI; \ + ADDL BX, DI; \ + ADDL BX, SI; \ + SUBL BX, CX; \ + REP;MOVSB; \ + RET + +CALLFN(·call16, 16) +CALLFN(·call32, 32) +CALLFN(·call64, 64) +CALLFN(·call128, 128) +CALLFN(·call256, 256) +CALLFN(·call512, 512) +CALLFN(·call1024, 1024) +CALLFN(·call2048, 2048) +CALLFN(·call4096, 4096) +CALLFN(·call8192, 8192) +CALLFN(·call16384, 16384) +CALLFN(·call32768, 32768) +CALLFN(·call65536, 65536) +CALLFN(·call131072, 131072) +CALLFN(·call262144, 262144) +CALLFN(·call524288, 524288) +CALLFN(·call1048576, 1048576) +CALLFN(·call2097152, 2097152) +CALLFN(·call4194304, 4194304) +CALLFN(·call8388608, 8388608) +CALLFN(·call16777216, 16777216) +CALLFN(·call33554432, 33554432) +CALLFN(·call67108864, 67108864) +CALLFN(·call134217728, 134217728) +CALLFN(·call268435456, 268435456) +CALLFN(·call536870912, 536870912) +CALLFN(·call1073741824, 1073741824) + +// bool cas(int32 *val, int32 old, int32 new) +// Atomically: +// if(*val == old){ +// *val = new; +// return 1; +// } else +// return 0; +TEXT runtime·cas(SB), NOSPLIT, $0-17 + MOVL ptr+0(FP), BX + MOVL old+4(FP), AX + MOVL new+8(FP), CX + LOCK + CMPXCHGL CX, 0(BX) + JZ 4(PC) + MOVL $0, AX + MOVB AX, ret+16(FP) + RET + MOVL $1, AX + MOVB AX, ret+16(FP) + RET + +TEXT runtime·casuintptr(SB), NOSPLIT, $0-17 + JMP runtime·cas(SB) + +TEXT runtime·atomicloaduintptr(SB), NOSPLIT, $0-12 + JMP runtime·atomicload(SB) + +TEXT runtime·atomicloaduint(SB), NOSPLIT, $0-12 + JMP runtime·atomicload(SB) + +TEXT runtime·atomicstoreuintptr(SB), NOSPLIT, $0-12 + JMP runtime·atomicstore(SB) + +// bool runtime·cas64(uint64 *val, uint64 old, uint64 new) +// Atomically: +// if(*val == *old){ +// *val = new; +// return 1; +// } else { +// return 0; +// } +TEXT runtime·cas64(SB), NOSPLIT, $0-25 + MOVL ptr+0(FP), BX + MOVQ old+8(FP), AX + MOVQ new+16(FP), CX + LOCK + CMPXCHGQ CX, 0(BX) + JNZ cas64_fail + MOVL $1, AX + MOVB AX, ret+24(FP) + RET +cas64_fail: + MOVL $0, AX + MOVB AX, ret+24(FP) + RET + +// bool casp(void **val, void *old, 
void *new) +// Atomically: +// if(*val == old){ +// *val = new; +// return 1; +// } else +// return 0; +TEXT runtime·casp(SB), NOSPLIT, $0-17 + MOVL ptr+0(FP), BX + MOVL old+4(FP), AX + MOVL new+8(FP), CX + LOCK + CMPXCHGL CX, 0(BX) + JZ 4(PC) + MOVL $0, AX + MOVB AX, ret+16(FP) + RET + MOVL $1, AX + MOVB AX, ret+16(FP) + RET + +// uint32 xadd(uint32 volatile *val, int32 delta) +// Atomically: +// *val += delta; +// return *val; +TEXT runtime·xadd(SB), NOSPLIT, $0-12 + MOVL ptr+0(FP), BX + MOVL delta+4(FP), AX + MOVL AX, CX + LOCK + XADDL AX, 0(BX) + ADDL CX, AX + MOVL AX, ret+8(FP) + RET + +TEXT runtime·xadd64(SB), NOSPLIT, $0-24 + MOVL ptr+0(FP), BX + MOVQ delta+8(FP), AX + MOVQ AX, CX + LOCK + XADDQ AX, 0(BX) + ADDQ CX, AX + MOVQ AX, ret+16(FP) + RET + +TEXT runtime·xchg(SB), NOSPLIT, $0-12 + MOVL ptr+0(FP), BX + MOVL new+4(FP), AX + XCHGL AX, 0(BX) + MOVL AX, ret+8(FP) + RET + +TEXT runtime·xchg64(SB), NOSPLIT, $0-24 + MOVL ptr+0(FP), BX + MOVQ new+8(FP), AX + XCHGQ AX, 0(BX) + MOVQ AX, ret+16(FP) + RET + +TEXT runtime·xchgp(SB), NOSPLIT, $0-12 + MOVL ptr+0(FP), BX + MOVL new+4(FP), AX + XCHGL AX, 0(BX) + MOVL AX, ret+8(FP) + RET + +TEXT runtime·xchguintptr(SB), NOSPLIT, $0-12 + JMP runtime·xchg(SB) + +TEXT runtime·procyield(SB),NOSPLIT,$0-0 + MOVL cycles+0(FP), AX +again: + PAUSE + SUBL $1, AX + JNZ again + RET + +TEXT runtime·atomicstorep(SB), NOSPLIT, $0-8 + MOVL ptr+0(FP), BX + MOVL val+4(FP), AX + XCHGL AX, 0(BX) + RET + +TEXT runtime·atomicstore(SB), NOSPLIT, $0-8 + MOVL ptr+0(FP), BX + MOVL val+4(FP), AX + XCHGL AX, 0(BX) + RET + +TEXT runtime·atomicstore64(SB), NOSPLIT, $0-16 + MOVL ptr+0(FP), BX + MOVQ val+8(FP), AX + XCHGQ AX, 0(BX) + RET + +// void runtime·atomicor8(byte volatile*, byte); +TEXT runtime·atomicor8(SB), NOSPLIT, $0-5 + MOVL ptr+0(FP), BX + MOVB val+4(FP), AX + LOCK + ORB AX, 0(BX) + RET + +// void jmpdefer(fn, sp); +// called from deferreturn. +// 1. pop the caller +// 2. sub 5 bytes from the callers return +// 3. jmp to the argument +TEXT runtime·jmpdefer(SB), NOSPLIT, $0-8 + MOVL fv+0(FP), DX + MOVL argp+4(FP), BX + LEAL -8(BX), SP // caller sp after CALL + SUBL $5, (SP) // return to CALL again + MOVL 0(DX), BX + JMP BX // but first run the deferred function + +// asmcgocall(void(*fn)(void*), void *arg) +// Not implemented. +TEXT runtime·asmcgocall(SB),NOSPLIT,$0-8 + MOVL 0, AX + RET + +// asmcgocall(void(*fn)(void*), void *arg) +// Not implemented. +TEXT runtime·asmcgocall_errno(SB),NOSPLIT,$0-12 + MOVL 0, AX + RET + +// cgocallback(void (*fn)(void*), void *frame, uintptr framesize) +// Not implemented. +TEXT runtime·cgocallback(SB),NOSPLIT,$0-12 + MOVL 0, AX + RET + +// void setg(G*); set g. for use by needm. +// Not implemented. 
+TEXT runtime·setg(SB), NOSPLIT, $0-4 + MOVL 0, AX + RET + +// check that SP is in range [g->stack.lo, g->stack.hi) +TEXT runtime·stackcheck(SB), NOSPLIT, $0-0 + get_tls(CX) + MOVL g(CX), AX + CMPL (g_stack+stack_hi)(AX), SP + JHI 2(PC) + MOVL 0, AX + CMPL SP, (g_stack+stack_lo)(AX) + JHI 2(PC) + MOVL 0, AX + RET + +TEXT runtime·memclr(SB),NOSPLIT,$0-8 + MOVL ptr+0(FP), DI + MOVL n+4(FP), CX + MOVQ CX, BX + ANDQ $7, BX + SHRQ $3, CX + MOVQ $0, AX + CLD + REP + STOSQ + MOVQ BX, CX + REP + STOSB + RET + +TEXT runtime·getcallerpc(SB),NOSPLIT,$0-12 + MOVL argp+0(FP),AX // addr of first arg + MOVL -8(AX),AX // get calling pc + MOVL AX, ret+8(FP) + RET + +TEXT runtime·gogetcallerpc(SB),NOSPLIT,$0-12 + MOVL p+0(FP),AX // addr of first arg + MOVL -8(AX),AX // get calling pc + MOVL AX, ret+8(FP) + RET + +TEXT runtime·setcallerpc(SB),NOSPLIT,$0-8 + MOVL argp+0(FP),AX // addr of first arg + MOVL pc+4(FP), BX // pc to set + MOVQ BX, -8(AX) // set calling pc + RET + +TEXT runtime·getcallersp(SB),NOSPLIT,$0-12 + MOVL argp+0(FP), AX + MOVL AX, ret+8(FP) + RET + +// func gogetcallersp(p unsafe.Pointer) uintptr +TEXT runtime·gogetcallersp(SB),NOSPLIT,$0-12 + MOVL p+0(FP),AX // addr of first arg + MOVL AX, ret+8(FP) + RET + +// int64 runtime·cputicks(void) +TEXT runtime·cputicks(SB),NOSPLIT,$0-0 + RDTSC + SHLQ $32, DX + ADDQ DX, AX + MOVQ AX, ret+0(FP) + RET + +// hash function using AES hardware instructions +// For now, our one amd64p32 system (NaCl) does not +// support using AES instructions, so have not bothered to +// write the implementations. Can copy and adjust the ones +// in asm_amd64.s when the time comes. + +TEXT runtime·aeshash(SB),NOSPLIT,$0-20 + MOVL AX, ret+16(FP) + RET + +TEXT runtime·aeshashstr(SB),NOSPLIT,$0-20 + MOVL AX, ret+16(FP) + RET + +TEXT runtime·aeshash32(SB),NOSPLIT,$0-20 + MOVL AX, ret+16(FP) + RET + +TEXT runtime·aeshash64(SB),NOSPLIT,$0-20 + MOVL AX, ret+16(FP) + RET + +TEXT runtime·memeq(SB),NOSPLIT,$0-17 + MOVL a+0(FP), SI + MOVL b+4(FP), DI + MOVL size+8(FP), BX + CALL runtime·memeqbody(SB) + MOVB AX, ret+16(FP) + RET + +// eqstring tests whether two strings are equal. +// See runtime_test.go:eqstring_generic for +// equivalent Go code. +TEXT runtime·eqstring(SB),NOSPLIT,$0-17 + MOVL s1len+4(FP), AX + MOVL s2len+12(FP), BX + CMPL AX, BX + JNE different + MOVL s1str+0(FP), SI + MOVL s2str+8(FP), DI + CMPL SI, DI + JEQ same + CALL runtime·memeqbody(SB) + MOVB AX, v+16(FP) + RET +same: + MOVB $1, v+16(FP) + RET +different: + MOVB $0, v+16(FP) + RET + +// a in SI +// b in DI +// count in BX +TEXT runtime·memeqbody(SB),NOSPLIT,$0-0 + XORQ AX, AX + + CMPQ BX, $8 + JB small + + // 64 bytes at a time using xmm registers +hugeloop: + CMPQ BX, $64 + JB bigloop + MOVOU (SI), X0 + MOVOU (DI), X1 + MOVOU 16(SI), X2 + MOVOU 16(DI), X3 + MOVOU 32(SI), X4 + MOVOU 32(DI), X5 + MOVOU 48(SI), X6 + MOVOU 48(DI), X7 + PCMPEQB X1, X0 + PCMPEQB X3, X2 + PCMPEQB X5, X4 + PCMPEQB X7, X6 + PAND X2, X0 + PAND X6, X4 + PAND X4, X0 + PMOVMSKB X0, DX + ADDQ $64, SI + ADDQ $64, DI + SUBQ $64, BX + CMPL DX, $0xffff + JEQ hugeloop + RET + + // 8 bytes at a time using 64-bit register +bigloop: + CMPQ BX, $8 + JBE leftover + MOVQ (SI), CX + MOVQ (DI), DX + ADDQ $8, SI + ADDQ $8, DI + SUBQ $8, BX + CMPQ CX, DX + JEQ bigloop + RET + + // remaining 0-8 bytes +leftover: + ADDQ BX, SI + ADDQ BX, DI + MOVQ -8(SI), CX + MOVQ -8(DI), DX + CMPQ CX, DX + SETEQ AX + RET + +small: + CMPQ BX, $0 + JEQ equal + + LEAQ 0(BX*8), CX + NEGQ CX + + CMPB SI, $0xf8 + JA si_high + + // load at SI won't cross a page boundary. 
+ MOVQ (SI), SI + JMP si_finish +si_high: + // address ends in 11111xxx. Load up to bytes we want, move to correct position. + MOVQ BX, DX + ADDQ SI, DX + MOVQ -8(DX), SI + SHRQ CX, SI +si_finish: + + // same for DI. + CMPB DI, $0xf8 + JA di_high + MOVQ (DI), DI + JMP di_finish +di_high: + MOVQ BX, DX + ADDQ DI, DX + MOVQ -8(DX), DI + SHRQ CX, DI +di_finish: + + SUBQ SI, DI + SHLQ CX, DI +equal: + SETEQ AX + RET + +TEXT runtime·cmpstring(SB),NOSPLIT,$0-20 + MOVL s1_base+0(FP), SI + MOVL s1_len+4(FP), BX + MOVL s2_base+8(FP), DI + MOVL s2_len+12(FP), DX + CALL runtime·cmpbody(SB) + MOVL AX, ret+16(FP) + RET + +TEXT runtime·cmpbytes(SB),NOSPLIT,$0-28 + MOVL s1+0(FP), SI + MOVL s1+4(FP), BX + MOVL s2+12(FP), DI + MOVL s2+16(FP), DX + CALL runtime·cmpbody(SB) + MOVQ AX, res+24(FP) + RET + +// input: +// SI = a +// DI = b +// BX = alen +// DX = blen +// output: +// AX = 1/0/-1 +TEXT runtime·cmpbody(SB),NOSPLIT,$0-0 + CMPQ SI, DI + JEQ cmp_allsame + CMPQ BX, DX + MOVQ DX, R8 + CMOVQLT BX, R8 // R8 = min(alen, blen) = # of bytes to compare + CMPQ R8, $8 + JB cmp_small + +cmp_loop: + CMPQ R8, $16 + JBE cmp_0through16 + MOVOU (SI), X0 + MOVOU (DI), X1 + PCMPEQB X0, X1 + PMOVMSKB X1, AX + XORQ $0xffff, AX // convert EQ to NE + JNE cmp_diff16 // branch if at least one byte is not equal + ADDQ $16, SI + ADDQ $16, DI + SUBQ $16, R8 + JMP cmp_loop + + // AX = bit mask of differences +cmp_diff16: + BSFQ AX, BX // index of first byte that differs + XORQ AX, AX + ADDQ BX, SI + MOVB (SI), CX + ADDQ BX, DI + CMPB CX, (DI) + SETHI AX + LEAQ -1(AX*2), AX // convert 1/0 to +1/-1 + RET + + // 0 through 16 bytes left, alen>=8, blen>=8 +cmp_0through16: + CMPQ R8, $8 + JBE cmp_0through8 + MOVQ (SI), AX + MOVQ (DI), CX + CMPQ AX, CX + JNE cmp_diff8 +cmp_0through8: + ADDQ R8, SI + ADDQ R8, DI + MOVQ -8(SI), AX + MOVQ -8(DI), CX + CMPQ AX, CX + JEQ cmp_allsame + + // AX and CX contain parts of a and b that differ. 
+cmp_diff8: + BSWAPQ AX // reverse order of bytes + BSWAPQ CX + XORQ AX, CX + BSRQ CX, CX // index of highest bit difference + SHRQ CX, AX // move a's bit to bottom + ANDQ $1, AX // mask bit + LEAQ -1(AX*2), AX // 1/0 => +1/-1 + RET + + // 0-7 bytes in common +cmp_small: + LEAQ (R8*8), CX // bytes left -> bits left + NEGQ CX // - bits lift (== 64 - bits left mod 64) + JEQ cmp_allsame + + // load bytes of a into high bytes of AX + CMPB SI, $0xf8 + JA cmp_si_high + MOVQ (SI), SI + JMP cmp_si_finish +cmp_si_high: + ADDQ R8, SI + MOVQ -8(SI), SI + SHRQ CX, SI +cmp_si_finish: + SHLQ CX, SI + + // load bytes of b in to high bytes of BX + CMPB DI, $0xf8 + JA cmp_di_high + MOVQ (DI), DI + JMP cmp_di_finish +cmp_di_high: + ADDQ R8, DI + MOVQ -8(DI), DI + SHRQ CX, DI +cmp_di_finish: + SHLQ CX, DI + + BSWAPQ SI // reverse order of bytes + BSWAPQ DI + XORQ SI, DI // find bit differences + JEQ cmp_allsame + BSRQ DI, CX // index of highest bit difference + SHRQ CX, SI // move a's bit to bottom + ANDQ $1, SI // mask bit + LEAQ -1(SI*2), AX // 1/0 => +1/-1 + RET + +cmp_allsame: + XORQ AX, AX + XORQ CX, CX + CMPQ BX, DX + SETGT AX // 1 if alen > blen + SETEQ CX // 1 if alen == blen + LEAQ -1(CX)(AX*2), AX // 1,0,-1 result + RET + +TEXT bytes·IndexByte(SB),NOSPLIT,$0 + MOVL s+0(FP), SI + MOVL s_len+4(FP), BX + MOVB c+12(FP), AL + CALL runtime·indexbytebody(SB) + MOVL AX, ret+16(FP) + RET + +TEXT strings·IndexByte(SB),NOSPLIT,$0 + MOVL s+0(FP), SI + MOVL s_len+4(FP), BX + MOVB c+8(FP), AL + CALL runtime·indexbytebody(SB) + MOVL AX, ret+16(FP) + RET + +// input: +// SI: data +// BX: data len +// AL: byte sought +// output: +// AX +TEXT runtime·indexbytebody(SB),NOSPLIT,$0 + MOVL SI, DI + + CMPL BX, $16 + JLT indexbyte_small + + // round up to first 16-byte boundary + TESTL $15, SI + JZ aligned + MOVL SI, CX + ANDL $~15, CX + ADDL $16, CX + + // search the beginning + SUBL SI, CX + REPN; SCASB + JZ success + +// DI is 16-byte aligned; get ready to search using SSE instructions +aligned: + // round down to last 16-byte boundary + MOVL BX, R11 + ADDL SI, R11 + ANDL $~15, R11 + + // shuffle X0 around so that each byte contains c + MOVD AX, X0 + PUNPCKLBW X0, X0 + PUNPCKLBW X0, X0 + PSHUFL $0, X0, X0 + JMP condition + +sse: + // move the next 16-byte chunk of the buffer into X1 + MOVO (DI), X1 + // compare bytes in X0 to X1 + PCMPEQB X0, X1 + // take the top bit of each byte in X1 and put the result in DX + PMOVMSKB X1, DX + TESTL DX, DX + JNZ ssesuccess + ADDL $16, DI + +condition: + CMPL DI, R11 + JLT sse + + // search the end + MOVL SI, CX + ADDL BX, CX + SUBL R11, CX + // if CX == 0, the zero flag will be set and we'll end up + // returning a false success + JZ failure + REPN; SCASB + JZ success + +failure: + MOVL $-1, AX + RET + +// handle for lengths < 16 +indexbyte_small: + MOVL BX, CX + REPN; SCASB + JZ success + MOVL $-1, AX + RET + +// we've found the chunk containing the byte +// now just figure out which specific byte it is +ssesuccess: + // get the index of the least significant set bit + BSFW DX, DX + SUBL SI, DI + ADDL DI, DX + MOVL DX, AX + RET + +success: + SUBL SI, DI + SUBL $1, DI + MOVL DI, AX + RET + +TEXT bytes·Equal(SB),NOSPLIT,$0-25 + MOVL a_len+4(FP), BX + MOVL b_len+16(FP), CX + XORL AX, AX + CMPL BX, CX + JNE eqret + MOVL a+0(FP), SI + MOVL b+12(FP), DI + CALL runtime·memeqbody(SB) +eqret: + MOVB AX, ret+24(FP) + RET + +TEXT runtime·fastrand1(SB), NOSPLIT, $0-4 + get_tls(CX) + MOVL g(CX), AX + MOVL g_m(AX), AX + MOVL m_fastrand(AX), DX + ADDL DX, DX + MOVL DX, BX + XORL $0x88888eef, DX + 
CMOVLMI BX, DX + MOVL DX, m_fastrand(AX) + MOVL DX, ret+0(FP) + RET + +TEXT runtime·return0(SB), NOSPLIT, $0 + MOVL $0, AX + RET + +// The top-most function running on a goroutine +// returns to goexit+PCQuantum. +TEXT runtime·goexit(SB),NOSPLIT,$0-0 + BYTE $0x90 // NOP + CALL runtime·goexit1(SB) // does not return diff --git a/src/runtime/asm_arm.s b/src/runtime/asm_arm.s new file mode 100644 index 000000000..0f3b5eeb8 --- /dev/null +++ b/src/runtime/asm_arm.s @@ -0,0 +1,1328 @@ +// Copyright 2009 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#include "zasm_GOOS_GOARCH.h" +#include "funcdata.h" +#include "textflag.h" + +// using frame size $-4 means do not save LR on stack. +TEXT runtime·rt0_go(SB),NOSPLIT,$-4 + MOVW $0xcafebabe, R12 + + // copy arguments forward on an even stack + // use R13 instead of SP to avoid linker rewriting the offsets + MOVW 0(R13), R0 // argc + MOVW 4(R13), R1 // argv + SUB $64, R13 // plenty of scratch + AND $~7, R13 + MOVW R0, 60(R13) // save argc, argv away + MOVW R1, 64(R13) + + // set up g register + // g is R10 + MOVW $runtime·g0(SB), g + MOVW $runtime·m0(SB), R8 + + // save m->g0 = g0 + MOVW g, m_g0(R8) + // save g->m = m0 + MOVW R8, g_m(g) + + // create istack out of the OS stack + MOVW $(-8192+104)(R13), R0 + MOVW R0, g_stackguard0(g) + MOVW R0, g_stackguard1(g) + MOVW R0, (g_stack+stack_lo)(g) + MOVW R13, (g_stack+stack_hi)(g) + + BL runtime·emptyfunc(SB) // fault if stack check is wrong + +#ifndef GOOS_nacl + // if there is an _cgo_init, call it. + MOVW _cgo_init(SB), R4 + CMP $0, R4 + B.EQ nocgo + MRC 15, 0, R0, C13, C0, 3 // load TLS base pointer + MOVW R0, R3 // arg 3: TLS base pointer + MOVW $runtime·tlsg(SB), R2 // arg 2: tlsg + MOVW $setg_gcc<>(SB), R1 // arg 1: setg + MOVW g, R0 // arg 0: G + BL (R4) // will clobber R0-R3 +#endif + +nocgo: + // update stackguard after _cgo_init + MOVW (g_stack+stack_lo)(g), R0 + ADD $const_StackGuard, R0 + MOVW R0, g_stackguard0(g) + MOVW R0, g_stackguard1(g) + + BL runtime·checkgoarm(SB) + BL runtime·check(SB) + + // saved argc, argv + MOVW 60(R13), R0 + MOVW R0, 4(R13) + MOVW 64(R13), R1 + MOVW R1, 8(R13) + BL runtime·args(SB) + BL runtime·osinit(SB) + BL runtime·schedinit(SB) + + // create a new goroutine to start program + MOVW $runtime·main·f(SB), R0 + MOVW.W R0, -4(R13) + MOVW $8, R0 + MOVW.W R0, -4(R13) + MOVW $0, R0 + MOVW.W R0, -4(R13) // push $0 as guard + BL runtime·newproc(SB) + MOVW $12(R13), R13 // pop args and LR + + // start this M + BL runtime·mstart(SB) + + MOVW $1234, R0 + MOVW $1000, R1 + MOVW R0, (R1) // fail hard + +DATA runtime·main·f+0(SB)/4,$runtime·main(SB) +GLOBL runtime·main·f(SB),RODATA,$4 + +TEXT runtime·breakpoint(SB),NOSPLIT,$0-0 + // gdb won't skip this breakpoint instruction automatically, + // so you must manually "set $pc+=4" to skip it and continue. 
+#ifdef GOOS_nacl + WORD $0xe125be7f // BKPT 0x5bef, NACL_INSTR_ARM_BREAKPOINT +#else + WORD $0xe7f001f0 // undefined instruction that gdb understands is a software breakpoint +#endif + RET + +TEXT runtime·asminit(SB),NOSPLIT,$0-0 + // disable runfast (flush-to-zero) mode of vfp if runtime.goarm > 5 + MOVB runtime·goarm(SB), R11 + CMP $5, R11 + BLE 4(PC) + WORD $0xeef1ba10 // vmrs r11, fpscr + BIC $(1<<24), R11 + WORD $0xeee1ba10 // vmsr fpscr, r11 + RET + +/* + * go-routine + */ + +// void gosave(Gobuf*) +// save state in Gobuf; setjmp +TEXT runtime·gosave(SB),NOSPLIT,$-4-4 + MOVW 0(FP), R0 // gobuf + MOVW SP, gobuf_sp(R0) + MOVW LR, gobuf_pc(R0) + MOVW g, gobuf_g(R0) + MOVW $0, R11 + MOVW R11, gobuf_lr(R0) + MOVW R11, gobuf_ret(R0) + MOVW R11, gobuf_ctxt(R0) + RET + +// void gogo(Gobuf*) +// restore state from Gobuf; longjmp +TEXT runtime·gogo(SB),NOSPLIT,$-4-4 + MOVW 0(FP), R1 // gobuf + MOVW gobuf_g(R1), R0 + BL setg<>(SB) + + // NOTE: We updated g above, and we are about to update SP. + // Until LR and PC are also updated, the g/SP/LR/PC quadruple + // are out of sync and must not be used as the basis of a traceback. + // Sigprof skips the traceback when SP is not within g's bounds, + // and when the PC is inside this function, runtime.gogo. + // Since we are about to update SP, until we complete runtime.gogo + // we must not leave this function. In particular, no calls + // after this point: it must be straight-line code until the + // final B instruction. + // See large comment in sigprof for more details. + MOVW gobuf_sp(R1), SP // restore SP + MOVW gobuf_lr(R1), LR + MOVW gobuf_ret(R1), R0 + MOVW gobuf_ctxt(R1), R7 + MOVW $0, R11 + MOVW R11, gobuf_sp(R1) // clear to help garbage collector + MOVW R11, gobuf_ret(R1) + MOVW R11, gobuf_lr(R1) + MOVW R11, gobuf_ctxt(R1) + MOVW gobuf_pc(R1), R11 + CMP R11, R11 // set condition codes for == test, needed by stack split + B (R11) + +// func mcall(fn func(*g)) +// Switch to m->g0's stack, call fn(g). +// Fn must never return. It should gogo(&g->sched) +// to keep running g. +TEXT runtime·mcall(SB),NOSPLIT,$-4-4 + // Save caller state in g->sched. + MOVW SP, (g_sched+gobuf_sp)(g) + MOVW LR, (g_sched+gobuf_pc)(g) + MOVW $0, R11 + MOVW R11, (g_sched+gobuf_lr)(g) + MOVW g, (g_sched+gobuf_g)(g) + + // Switch to m->g0 & its stack, call fn. + MOVW g, R1 + MOVW g_m(g), R8 + MOVW m_g0(R8), R0 + BL setg<>(SB) + CMP g, R1 + B.NE 2(PC) + B runtime·badmcall(SB) + MOVB runtime·iscgo(SB), R11 + CMP $0, R11 + BL.NE runtime·save_g(SB) + MOVW fn+0(FP), R0 + MOVW (g_sched+gobuf_sp)(g), SP + SUB $8, SP + MOVW R1, 4(SP) + MOVW R0, R7 + MOVW 0(R0), R0 + BL (R0) + B runtime·badmcall2(SB) + RET + +// switchtoM is a dummy routine that onM leaves at the bottom +// of the G stack. We need to distinguish the routine that +// lives at the bottom of the G stack from the one that lives +// at the top of the M stack because the one at the top of +// the M stack terminates the stack walk (see topofstack()). 
+TEXT runtime·switchtoM(SB),NOSPLIT,$0-0 + MOVW $0, R0 + BL (R0) // clobber lr to ensure push {lr} is kept + RET + +// func onM_signalok(fn func()) +TEXT runtime·onM_signalok(SB), NOSPLIT, $-4-4 + MOVW g_m(g), R1 + MOVW m_gsignal(R1), R2 + CMP g, R2 + B.EQ ongsignal + B runtime·onM(SB) + +ongsignal: + MOVW fn+0(FP), R0 + MOVW R0, R7 + MOVW 0(R0), R0 + BL (R0) + RET + +// func onM(fn func()) +TEXT runtime·onM(SB),NOSPLIT,$0-4 + MOVW fn+0(FP), R0 // R0 = fn + MOVW g_m(g), R1 // R1 = m + + MOVW m_g0(R1), R2 // R2 = g0 + CMP g, R2 + B.EQ onm + + MOVW m_curg(R1), R3 + CMP g, R3 + B.EQ oncurg + + // Not g0, not curg. Must be gsignal, but that's not allowed. + // Hide call from linker nosplit analysis. + MOVW $runtime·badonm(SB), R0 + BL (R0) + +oncurg: + // save our state in g->sched. Pretend to + // be switchtoM if the G stack is scanned. + MOVW $runtime·switchtoM(SB), R3 + ADD $4, R3, R3 // get past push {lr} + MOVW R3, (g_sched+gobuf_pc)(g) + MOVW SP, (g_sched+gobuf_sp)(g) + MOVW LR, (g_sched+gobuf_lr)(g) + MOVW g, (g_sched+gobuf_g)(g) + + // switch to g0 + MOVW R0, R5 + MOVW R2, R0 + BL setg<>(SB) + MOVW R5, R0 + MOVW (g_sched+gobuf_sp)(R2), R3 + // make it look like mstart called onM on g0, to stop traceback + SUB $4, R3, R3 + MOVW $runtime·mstart(SB), R4 + MOVW R4, 0(R3) + MOVW R3, SP + + // call target function + MOVW R0, R7 + MOVW 0(R0), R0 + BL (R0) + + // switch back to g + MOVW g_m(g), R1 + MOVW m_curg(R1), R0 + BL setg<>(SB) + MOVW (g_sched+gobuf_sp)(g), SP + MOVW $0, R3 + MOVW R3, (g_sched+gobuf_sp)(g) + RET + +onm: + MOVW R0, R7 + MOVW 0(R0), R0 + BL (R0) + RET + +/* + * support for morestack + */ + +// Called during function prolog when more stack is needed. +// R1 frame size +// R2 arg size +// R3 prolog's LR +// NB. we do not save R0 because we've forced 5c to pass all arguments +// on the stack. +// using frame size $-4 means do not save LR on stack. +// +// The traceback routines see morestack on a g0 as being +// the top of a stack (for example, morestack calling newstack +// calling the scheduler calling newm calling gc), so we must +// record an argument size. For that purpose, it has no arguments. +TEXT runtime·morestack(SB),NOSPLIT,$-4-0 + // Cannot grow scheduler stack (m->g0). + MOVW g_m(g), R8 + MOVW m_g0(R8), R4 + CMP g, R4 + BL.EQ runtime·abort(SB) + + // Cannot grow signal stack (m->gsignal). + MOVW m_gsignal(R8), R4 + CMP g, R4 + BL.EQ runtime·abort(SB) + + // Called from f. + // Set g->sched to context in f. + MOVW R7, (g_sched+gobuf_ctxt)(g) + MOVW SP, (g_sched+gobuf_sp)(g) + MOVW LR, (g_sched+gobuf_pc)(g) + MOVW R3, (g_sched+gobuf_lr)(g) + + // Called from f. + // Set m->morebuf to f's caller. + MOVW R3, (m_morebuf+gobuf_pc)(R8) // f's caller's PC + MOVW SP, (m_morebuf+gobuf_sp)(R8) // f's caller's SP + MOVW $4(SP), R3 // f's argument pointer + MOVW g, (m_morebuf+gobuf_g)(R8) + + // Call newstack on m->g0's stack. + MOVW m_g0(R8), R0 + BL setg<>(SB) + MOVW (g_sched+gobuf_sp)(g), SP + BL runtime·newstack(SB) + + // Not reached, but make sure the return PC from the call to newstack + // is still in this function, and not the beginning of the next. + RET + +TEXT runtime·morestack_noctxt(SB),NOSPLIT,$-4-0 + MOVW $0, R7 + B runtime·morestack(SB) + +// reflectcall: call a function with the given argument list +// func call(f *FuncVal, arg *byte, argsize, retoffset uint32). +// we don't have variable-sized frames, so we use a small number +// of constant-sized-frame functions to encode a few bits of size in the pc. 
+// Caution: ugly multiline assembly macros in your future! + +#define DISPATCH(NAME,MAXSIZE) \ + CMP $MAXSIZE, R0; \ + B.HI 3(PC); \ + MOVW $NAME(SB), R1; \ + B (R1) + +TEXT ·reflectcall(SB),NOSPLIT,$-4-16 + MOVW argsize+8(FP), R0 + DISPATCH(runtime·call16, 16) + DISPATCH(runtime·call32, 32) + DISPATCH(runtime·call64, 64) + DISPATCH(runtime·call128, 128) + DISPATCH(runtime·call256, 256) + DISPATCH(runtime·call512, 512) + DISPATCH(runtime·call1024, 1024) + DISPATCH(runtime·call2048, 2048) + DISPATCH(runtime·call4096, 4096) + DISPATCH(runtime·call8192, 8192) + DISPATCH(runtime·call16384, 16384) + DISPATCH(runtime·call32768, 32768) + DISPATCH(runtime·call65536, 65536) + DISPATCH(runtime·call131072, 131072) + DISPATCH(runtime·call262144, 262144) + DISPATCH(runtime·call524288, 524288) + DISPATCH(runtime·call1048576, 1048576) + DISPATCH(runtime·call2097152, 2097152) + DISPATCH(runtime·call4194304, 4194304) + DISPATCH(runtime·call8388608, 8388608) + DISPATCH(runtime·call16777216, 16777216) + DISPATCH(runtime·call33554432, 33554432) + DISPATCH(runtime·call67108864, 67108864) + DISPATCH(runtime·call134217728, 134217728) + DISPATCH(runtime·call268435456, 268435456) + DISPATCH(runtime·call536870912, 536870912) + DISPATCH(runtime·call1073741824, 1073741824) + MOVW $runtime·badreflectcall(SB), R1 + B (R1) + +#define CALLFN(NAME,MAXSIZE) \ +TEXT NAME(SB), WRAPPER, $MAXSIZE-16; \ + NO_LOCAL_POINTERS; \ + /* copy arguments to stack */ \ + MOVW argptr+4(FP), R0; \ + MOVW argsize+8(FP), R2; \ + ADD $4, SP, R1; \ + CMP $0, R2; \ + B.EQ 5(PC); \ + MOVBU.P 1(R0), R5; \ + MOVBU.P R5, 1(R1); \ + SUB $1, R2, R2; \ + B -5(PC); \ + /* call function */ \ + MOVW f+0(FP), R7; \ + MOVW (R7), R0; \ + PCDATA $PCDATA_StackMapIndex, $0; \ + BL (R0); \ + /* copy return values back */ \ + MOVW argptr+4(FP), R0; \ + MOVW argsize+8(FP), R2; \ + MOVW retoffset+12(FP), R3; \ + ADD $4, SP, R1; \ + ADD R3, R1; \ + ADD R3, R0; \ + SUB R3, R2; \ + CMP $0, R2; \ + RET.EQ ; \ + MOVBU.P 1(R1), R5; \ + MOVBU.P R5, 1(R0); \ + SUB $1, R2, R2; \ + B -5(PC) \ + +CALLFN(·call16, 16) +CALLFN(·call32, 32) +CALLFN(·call64, 64) +CALLFN(·call128, 128) +CALLFN(·call256, 256) +CALLFN(·call512, 512) +CALLFN(·call1024, 1024) +CALLFN(·call2048, 2048) +CALLFN(·call4096, 4096) +CALLFN(·call8192, 8192) +CALLFN(·call16384, 16384) +CALLFN(·call32768, 32768) +CALLFN(·call65536, 65536) +CALLFN(·call131072, 131072) +CALLFN(·call262144, 262144) +CALLFN(·call524288, 524288) +CALLFN(·call1048576, 1048576) +CALLFN(·call2097152, 2097152) +CALLFN(·call4194304, 4194304) +CALLFN(·call8388608, 8388608) +CALLFN(·call16777216, 16777216) +CALLFN(·call33554432, 33554432) +CALLFN(·call67108864, 67108864) +CALLFN(·call134217728, 134217728) +CALLFN(·call268435456, 268435456) +CALLFN(·call536870912, 536870912) +CALLFN(·call1073741824, 1073741824) + +// void jmpdefer(fn, sp); +// called from deferreturn. +// 1. grab stored LR for caller +// 2. sub 4 bytes to get back to BL deferreturn +// 3. B to fn +// TODO(rsc): Push things on stack and then use pop +// to load all registers simultaneously, so that a profiling +// interrupt can never see mismatched SP/LR/PC. +// (And double-check that pop is atomic in that way.) +TEXT runtime·jmpdefer(SB),NOSPLIT,$0-8 + MOVW 0(SP), LR + MOVW $-4(LR), LR // BL deferreturn + MOVW fv+0(FP), R7 + MOVW argp+4(FP), SP + MOVW $-4(SP), SP // SP is 4 below argp, due to saved LR + MOVW 0(R7), R1 + B (R1) + +// Save state of caller into g->sched. Smashes R11. 
+TEXT gosave<>(SB),NOSPLIT,$0 + MOVW LR, (g_sched+gobuf_pc)(g) + MOVW R13, (g_sched+gobuf_sp)(g) + MOVW $0, R11 + MOVW R11, (g_sched+gobuf_lr)(g) + MOVW R11, (g_sched+gobuf_ret)(g) + MOVW R11, (g_sched+gobuf_ctxt)(g) + RET + +// asmcgocall(void(*fn)(void*), void *arg) +// Call fn(arg) on the scheduler stack, +// aligned appropriately for the gcc ABI. +// See cgocall.c for more details. +TEXT ·asmcgocall(SB),NOSPLIT,$0-8 + MOVW fn+0(FP), R1 + MOVW arg+4(FP), R0 + BL asmcgocall<>(SB) + RET + +TEXT ·asmcgocall_errno(SB),NOSPLIT,$0-12 + MOVW fn+0(FP), R1 + MOVW arg+4(FP), R0 + BL asmcgocall<>(SB) + MOVW R0, ret+8(FP) + RET + +TEXT asmcgocall<>(SB),NOSPLIT,$0-0 + // fn in R1, arg in R0. + MOVW R13, R2 + MOVW g, R4 + + // Figure out if we need to switch to m->g0 stack. + // We get called to create new OS threads too, and those + // come in on the m->g0 stack already. + MOVW g_m(g), R8 + MOVW m_g0(R8), R3 + CMP R3, g + BEQ asmcgocall_g0 + BL gosave<>(SB) + MOVW R0, R5 + MOVW R3, R0 + BL setg<>(SB) + MOVW R5, R0 + MOVW (g_sched+gobuf_sp)(g), R13 + + // Now on a scheduling stack (a pthread-created stack). +asmcgocall_g0: + SUB $24, R13 + BIC $0x7, R13 // alignment for gcc ABI + MOVW R4, 20(R13) // save old g + MOVW (g_stack+stack_hi)(R4), R4 + SUB R2, R4 + MOVW R4, 16(R13) // save depth in stack (can't just save SP, as stack might be copied during a callback) + BL (R1) + + // Restore registers, g, stack pointer. + MOVW R0, R5 + MOVW 20(R13), R0 + BL setg<>(SB) + MOVW (g_stack+stack_hi)(g), R1 + MOVW 16(R13), R2 + SUB R2, R1 + MOVW R5, R0 + MOVW R1, R13 + RET + +// cgocallback(void (*fn)(void*), void *frame, uintptr framesize) +// Turn the fn into a Go func (by taking its address) and call +// cgocallback_gofunc. +TEXT runtime·cgocallback(SB),NOSPLIT,$12-12 + MOVW $fn+0(FP), R0 + MOVW R0, 4(R13) + MOVW frame+4(FP), R0 + MOVW R0, 8(R13) + MOVW framesize+8(FP), R0 + MOVW R0, 12(R13) + MOVW $runtime·cgocallback_gofunc(SB), R0 + BL (R0) + RET + +// cgocallback_gofunc(void (*fn)(void*), void *frame, uintptr framesize) +// See cgocall.c for more details. +TEXT ·cgocallback_gofunc(SB),NOSPLIT,$8-12 + NO_LOCAL_POINTERS + + // Load m and g from thread-local storage. + MOVB runtime·iscgo(SB), R0 + CMP $0, R0 + BL.NE runtime·load_g(SB) + + // If g is nil, Go did not create the current thread. + // Call needm to obtain one for temporary use. + // In this case, we're running on the thread stack, so there's + // lots of space, but the linker doesn't know. Hide the call from + // the linker analysis by using an indirect call. + CMP $0, g + B.NE havem + MOVW g, savedm-4(SP) // g is zero, so is m. + MOVW $runtime·needm(SB), R0 + BL (R0) + + // Set m->sched.sp = SP, so that if a panic happens + // during the function we are about to execute, it will + // have a valid SP to run on the g0 stack. + // The next few lines (after the havem label) + // will save this SP onto the stack and then write + // the same SP back to m->sched.sp. That seems redundant, + // but if an unrecovered panic happens, unwindm will + // restore the g->sched.sp from the stack location + // and then onM will try to use it. If we don't set it here, + // that restored SP will be uninitialized (typically 0) and + // will not be usable. + MOVW g_m(g), R8 + MOVW m_g0(R8), R3 + MOVW R13, (g_sched+gobuf_sp)(R3) + +havem: + MOVW g_m(g), R8 + MOVW R8, savedm-4(SP) + // Now there's a valid m, and we're running on its m->g0. + // Save current m->g0->sched.sp on stack and then set it to SP. 
+ // Save current sp in m->g0->sched.sp in preparation for + // switch back to m->curg stack. + // NOTE: unwindm knows that the saved g->sched.sp is at 4(R13) aka savedsp-8(SP). + MOVW m_g0(R8), R3 + MOVW (g_sched+gobuf_sp)(R3), R4 + MOVW R4, savedsp-8(SP) + MOVW R13, (g_sched+gobuf_sp)(R3) + + // Switch to m->curg stack and call runtime.cgocallbackg. + // Because we are taking over the execution of m->curg + // but *not* resuming what had been running, we need to + // save that information (m->curg->sched) so we can restore it. + // We can restore m->curg->sched.sp easily, because calling + // runtime.cgocallbackg leaves SP unchanged upon return. + // To save m->curg->sched.pc, we push it onto the stack. + // This has the added benefit that it looks to the traceback + // routine like cgocallbackg is going to return to that + // PC (because the frame we allocate below has the same + // size as cgocallback_gofunc's frame declared above) + // so that the traceback will seamlessly trace back into + // the earlier calls. + // + // In the new goroutine, -8(SP) and -4(SP) are unused. + MOVW m_curg(R8), R0 + BL setg<>(SB) + MOVW (g_sched+gobuf_sp)(g), R4 // prepare stack as R4 + MOVW (g_sched+gobuf_pc)(g), R5 + MOVW R5, -12(R4) + MOVW $-12(R4), R13 + BL runtime·cgocallbackg(SB) + + // Restore g->sched (== m->curg->sched) from saved values. + MOVW 0(R13), R5 + MOVW R5, (g_sched+gobuf_pc)(g) + MOVW $12(R13), R4 + MOVW R4, (g_sched+gobuf_sp)(g) + + // Switch back to m->g0's stack and restore m->g0->sched.sp. + // (Unlike m->curg, the g0 goroutine never uses sched.pc, + // so we do not have to restore it.) + MOVW g_m(g), R8 + MOVW m_g0(R8), R0 + BL setg<>(SB) + MOVW (g_sched+gobuf_sp)(g), R13 + MOVW savedsp-8(SP), R4 + MOVW R4, (g_sched+gobuf_sp)(g) + + // If the m on entry was nil, we called needm above to borrow an m + // for the duration of the call. Since the call is over, return it with dropm. + MOVW savedm-4(SP), R6 + CMP $0, R6 + B.NE 3(PC) + MOVW $runtime·dropm(SB), R0 + BL (R0) + + // Done! + RET + +// void setg(G*); set g. for use by needm. +TEXT runtime·setg(SB),NOSPLIT,$-4-4 + MOVW gg+0(FP), R0 + B setg<>(SB) + +TEXT setg<>(SB),NOSPLIT,$-4-0 + MOVW R0, g + + // Save g to thread-local storage. 
+ MOVB runtime·iscgo(SB), R0 + CMP $0, R0 + B.EQ 2(PC) + B runtime·save_g(SB) + + MOVW g, R0 + RET + +TEXT runtime·getcallerpc(SB),NOSPLIT,$-4-4 + MOVW 0(SP), R0 + MOVW R0, ret+4(FP) + RET + +TEXT runtime·gogetcallerpc(SB),NOSPLIT,$-4-8 + MOVW R14, ret+4(FP) + RET + +TEXT runtime·setcallerpc(SB),NOSPLIT,$-4-8 + MOVW pc+4(FP), R0 + MOVW R0, 0(SP) + RET + +TEXT runtime·getcallersp(SB),NOSPLIT,$-4-4 + MOVW 0(FP), R0 + MOVW $-4(R0), R0 + MOVW R0, ret+4(FP) + RET + +// func gogetcallersp(p unsafe.Pointer) uintptr +TEXT runtime·gogetcallersp(SB),NOSPLIT,$-4-8 + MOVW 0(FP), R0 + MOVW $-4(R0), R0 + MOVW R0, ret+4(FP) + RET + +TEXT runtime·emptyfunc(SB),0,$0-0 + RET + +TEXT runtime·abort(SB),NOSPLIT,$-4-0 + MOVW $0, R0 + MOVW (R0), R1 + +// bool armcas(int32 *val, int32 old, int32 new) +// Atomically: +// if(*val == old){ +// *val = new; +// return 1; +// }else +// return 0; +// +// To implement runtime·cas in sys_$GOOS_arm.s +// using the native instructions, use: +// +// TEXT runtime·cas(SB),NOSPLIT,$0 +// B runtime·armcas(SB) +// +TEXT runtime·armcas(SB),NOSPLIT,$0-13 + MOVW valptr+0(FP), R1 + MOVW old+4(FP), R2 + MOVW new+8(FP), R3 +casl: + LDREX (R1), R0 + CMP R0, R2 + BNE casfail + STREX R3, (R1), R0 + CMP $0, R0 + BNE casl + MOVW $1, R0 + MOVB R0, ret+12(FP) + RET +casfail: + MOVW $0, R0 + MOVB R0, ret+12(FP) + RET + +TEXT runtime·casuintptr(SB),NOSPLIT,$0-13 + B runtime·cas(SB) + +TEXT runtime·atomicloaduintptr(SB),NOSPLIT,$0-8 + B runtime·atomicload(SB) + +TEXT runtime·atomicloaduint(SB),NOSPLIT,$0-8 + B runtime·atomicload(SB) + +TEXT runtime·atomicstoreuintptr(SB),NOSPLIT,$0-8 + B runtime·atomicstore(SB) + +// AES hashing not implemented for ARM +TEXT runtime·aeshash(SB),NOSPLIT,$-4-0 + MOVW $0, R0 + MOVW (R0), R1 +TEXT runtime·aeshash32(SB),NOSPLIT,$-4-0 + MOVW $0, R0 + MOVW (R0), R1 +TEXT runtime·aeshash64(SB),NOSPLIT,$-4-0 + MOVW $0, R0 + MOVW (R0), R1 +TEXT runtime·aeshashstr(SB),NOSPLIT,$-4-0 + MOVW $0, R0 + MOVW (R0), R1 + +TEXT runtime·memeq(SB),NOSPLIT,$-4-13 + MOVW a+0(FP), R1 + MOVW b+4(FP), R2 + MOVW size+8(FP), R3 + ADD R1, R3, R6 + MOVW $1, R0 + MOVB R0, ret+12(FP) +_next2: + CMP R1, R6 + RET.EQ + MOVBU.P 1(R1), R4 + MOVBU.P 1(R2), R5 + CMP R4, R5 + BEQ _next2 + + MOVW $0, R0 + MOVB R0, ret+12(FP) + RET + +// eqstring tests whether two strings are equal. +// See runtime_test.go:eqstring_generic for +// equivalent Go code. +TEXT runtime·eqstring(SB),NOSPLIT,$-4-17 + MOVW s1len+4(FP), R0 + MOVW s2len+12(FP), R1 + MOVW $0, R7 + CMP R0, R1 + MOVB.NE R7, v+16(FP) + RET.NE + MOVW s1str+0(FP), R2 + MOVW s2str+8(FP), R3 + MOVW $1, R8 + MOVB R8, v+16(FP) + CMP R2, R3 + RET.EQ + ADD R2, R0, R6 +_eqnext: + CMP R2, R6 + RET.EQ + MOVBU.P 1(R2), R4 + MOVBU.P 1(R3), R5 + CMP R4, R5 + BEQ _eqnext + MOVB R7, v+16(FP) + RET + +// void setg_gcc(G*); set g called from gcc. +TEXT setg_gcc<>(SB),NOSPLIT,$0 + MOVW R0, g + B runtime·save_g(SB) + +// TODO: share code with memeq? 
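Before the bytes·Equal and IndexByte routines that follow, it may help to see the semantics that memeq, eqstring and bytes·Equal all implement. The Go sketch below follows the shape of the eqstring_generic helper that the eqstring comment above points to in runtime_test.go; it is illustrative only and not part of the imported files.

package main

import "fmt"

// eqstringGeneric is the plain byte-by-byte comparison that the assembly
// fast paths replace: lengths must match, then every byte must match.
func eqstringGeneric(s1, s2 string) bool {
	if len(s1) != len(s2) {
		return false
	}
	for i := 0; i < len(s1); i++ {
		if s1[i] != s2[i] {
			return false
		}
	}
	return true
}

func main() {
	fmt.Println(eqstringGeneric("gopher", "gopher")) // true
	fmt.Println(eqstringGeneric("gopher", "gofer"))  // false
}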
+TEXT bytes·Equal(SB),NOSPLIT,$0 + MOVW a_len+4(FP), R1 + MOVW b_len+16(FP), R3 + + CMP R1, R3 // unequal lengths are not equal + B.NE _notequal + + MOVW a+0(FP), R0 + MOVW b+12(FP), R2 + ADD R0, R1 // end + +_byteseq_next: + CMP R0, R1 + B.EQ _equal // reached the end + MOVBU.P 1(R0), R4 + MOVBU.P 1(R2), R5 + CMP R4, R5 + B.EQ _byteseq_next + +_notequal: + MOVW $0, R0 + MOVBU R0, ret+24(FP) + RET + +_equal: + MOVW $1, R0 + MOVBU R0, ret+24(FP) + RET + +TEXT bytes·IndexByte(SB),NOSPLIT,$0 + MOVW s+0(FP), R0 + MOVW s_len+4(FP), R1 + MOVBU c+12(FP), R2 // byte to find + MOVW R0, R4 // store base for later + ADD R0, R1 // end + +_loop: + CMP R0, R1 + B.EQ _notfound + MOVBU.P 1(R0), R3 + CMP R2, R3 + B.NE _loop + + SUB $1, R0 // R0 will be one beyond the position we want + SUB R4, R0 // remove base + MOVW R0, ret+16(FP) + RET + +_notfound: + MOVW $-1, R0 + MOVW R0, ret+16(FP) + RET + +TEXT strings·IndexByte(SB),NOSPLIT,$0 + MOVW s+0(FP), R0 + MOVW s_len+4(FP), R1 + MOVBU c+8(FP), R2 // byte to find + MOVW R0, R4 // store base for later + ADD R0, R1 // end + +_sib_loop: + CMP R0, R1 + B.EQ _sib_notfound + MOVBU.P 1(R0), R3 + CMP R2, R3 + B.NE _sib_loop + + SUB $1, R0 // R0 will be one beyond the position we want + SUB R4, R0 // remove base + MOVW R0, ret+12(FP) + RET + +_sib_notfound: + MOVW $-1, R0 + MOVW R0, ret+12(FP) + RET + +// A Duff's device for zeroing memory. +// The compiler jumps to computed addresses within +// this routine to zero chunks of memory. Do not +// change this code without also changing the code +// in ../../cmd/5g/ggen.c:clearfat. +// R0: zero +// R1: ptr to memory to be zeroed +// R1 is updated as a side effect. +TEXT runtime·duffzero(SB),NOSPLIT,$0-0 + MOVW.P R0, 4(R1) + MOVW.P R0, 4(R1) + MOVW.P R0, 4(R1) + MOVW.P R0, 4(R1) + MOVW.P R0, 4(R1) + MOVW.P R0, 4(R1) + MOVW.P R0, 4(R1) + MOVW.P R0, 4(R1) + MOVW.P R0, 4(R1) + MOVW.P R0, 4(R1) + MOVW.P R0, 4(R1) + MOVW.P R0, 4(R1) + MOVW.P R0, 4(R1) + MOVW.P R0, 4(R1) + MOVW.P R0, 4(R1) + MOVW.P R0, 4(R1) + MOVW.P R0, 4(R1) + MOVW.P R0, 4(R1) + MOVW.P R0, 4(R1) + MOVW.P R0, 4(R1) + MOVW.P R0, 4(R1) + MOVW.P R0, 4(R1) + MOVW.P R0, 4(R1) + MOVW.P R0, 4(R1) + MOVW.P R0, 4(R1) + MOVW.P R0, 4(R1) + MOVW.P R0, 4(R1) + MOVW.P R0, 4(R1) + MOVW.P R0, 4(R1) + MOVW.P R0, 4(R1) + MOVW.P R0, 4(R1) + MOVW.P R0, 4(R1) + MOVW.P R0, 4(R1) + MOVW.P R0, 4(R1) + MOVW.P R0, 4(R1) + MOVW.P R0, 4(R1) + MOVW.P R0, 4(R1) + MOVW.P R0, 4(R1) + MOVW.P R0, 4(R1) + MOVW.P R0, 4(R1) + MOVW.P R0, 4(R1) + MOVW.P R0, 4(R1) + MOVW.P R0, 4(R1) + MOVW.P R0, 4(R1) + MOVW.P R0, 4(R1) + MOVW.P R0, 4(R1) + MOVW.P R0, 4(R1) + MOVW.P R0, 4(R1) + MOVW.P R0, 4(R1) + MOVW.P R0, 4(R1) + MOVW.P R0, 4(R1) + MOVW.P R0, 4(R1) + MOVW.P R0, 4(R1) + MOVW.P R0, 4(R1) + MOVW.P R0, 4(R1) + MOVW.P R0, 4(R1) + MOVW.P R0, 4(R1) + MOVW.P R0, 4(R1) + MOVW.P R0, 4(R1) + MOVW.P R0, 4(R1) + MOVW.P R0, 4(R1) + MOVW.P R0, 4(R1) + MOVW.P R0, 4(R1) + MOVW.P R0, 4(R1) + MOVW.P R0, 4(R1) + MOVW.P R0, 4(R1) + MOVW.P R0, 4(R1) + MOVW.P R0, 4(R1) + MOVW.P R0, 4(R1) + MOVW.P R0, 4(R1) + MOVW.P R0, 4(R1) + MOVW.P R0, 4(R1) + MOVW.P R0, 4(R1) + MOVW.P R0, 4(R1) + MOVW.P R0, 4(R1) + MOVW.P R0, 4(R1) + MOVW.P R0, 4(R1) + MOVW.P R0, 4(R1) + MOVW.P R0, 4(R1) + MOVW.P R0, 4(R1) + MOVW.P R0, 4(R1) + MOVW.P R0, 4(R1) + MOVW.P R0, 4(R1) + MOVW.P R0, 4(R1) + MOVW.P R0, 4(R1) + MOVW.P R0, 4(R1) + MOVW.P R0, 4(R1) + MOVW.P R0, 4(R1) + MOVW.P R0, 4(R1) + MOVW.P R0, 4(R1) + MOVW.P R0, 4(R1) + MOVW.P R0, 4(R1) + MOVW.P R0, 4(R1) + MOVW.P R0, 4(R1) + MOVW.P R0, 4(R1) + MOVW.P R0, 4(R1) + MOVW.P R0, 4(R1) + MOVW.P 
R0, 4(R1) + MOVW.P R0, 4(R1) + MOVW.P R0, 4(R1) + MOVW.P R0, 4(R1) + MOVW.P R0, 4(R1) + MOVW.P R0, 4(R1) + MOVW.P R0, 4(R1) + MOVW.P R0, 4(R1) + MOVW.P R0, 4(R1) + MOVW.P R0, 4(R1) + MOVW.P R0, 4(R1) + MOVW.P R0, 4(R1) + MOVW.P R0, 4(R1) + MOVW.P R0, 4(R1) + MOVW.P R0, 4(R1) + MOVW.P R0, 4(R1) + MOVW.P R0, 4(R1) + MOVW.P R0, 4(R1) + MOVW.P R0, 4(R1) + MOVW.P R0, 4(R1) + MOVW.P R0, 4(R1) + MOVW.P R0, 4(R1) + MOVW.P R0, 4(R1) + MOVW.P R0, 4(R1) + MOVW.P R0, 4(R1) + MOVW.P R0, 4(R1) + MOVW.P R0, 4(R1) + MOVW.P R0, 4(R1) + MOVW.P R0, 4(R1) + MOVW.P R0, 4(R1) + MOVW.P R0, 4(R1) + RET + +// A Duff's device for copying memory. +// The compiler jumps to computed addresses within +// this routine to copy chunks of memory. Source +// and destination must not overlap. Do not +// change this code without also changing the code +// in ../../cmd/5g/cgen.c:sgen. +// R0: scratch space +// R1: ptr to source memory +// R2: ptr to destination memory +// R1 and R2 are updated as a side effect +TEXT runtime·duffcopy(SB),NOSPLIT,$0-0 + MOVW.P 4(R1), R0 + MOVW.P R0, 4(R2) + MOVW.P 4(R1), R0 + MOVW.P R0, 4(R2) + MOVW.P 4(R1), R0 + MOVW.P R0, 4(R2) + MOVW.P 4(R1), R0 + MOVW.P R0, 4(R2) + MOVW.P 4(R1), R0 + MOVW.P R0, 4(R2) + MOVW.P 4(R1), R0 + MOVW.P R0, 4(R2) + MOVW.P 4(R1), R0 + MOVW.P R0, 4(R2) + MOVW.P 4(R1), R0 + MOVW.P R0, 4(R2) + MOVW.P 4(R1), R0 + MOVW.P R0, 4(R2) + MOVW.P 4(R1), R0 + MOVW.P R0, 4(R2) + MOVW.P 4(R1), R0 + MOVW.P R0, 4(R2) + MOVW.P 4(R1), R0 + MOVW.P R0, 4(R2) + MOVW.P 4(R1), R0 + MOVW.P R0, 4(R2) + MOVW.P 4(R1), R0 + MOVW.P R0, 4(R2) + MOVW.P 4(R1), R0 + MOVW.P R0, 4(R2) + MOVW.P 4(R1), R0 + MOVW.P R0, 4(R2) + MOVW.P 4(R1), R0 + MOVW.P R0, 4(R2) + MOVW.P 4(R1), R0 + MOVW.P R0, 4(R2) + MOVW.P 4(R1), R0 + MOVW.P R0, 4(R2) + MOVW.P 4(R1), R0 + MOVW.P R0, 4(R2) + MOVW.P 4(R1), R0 + MOVW.P R0, 4(R2) + MOVW.P 4(R1), R0 + MOVW.P R0, 4(R2) + MOVW.P 4(R1), R0 + MOVW.P R0, 4(R2) + MOVW.P 4(R1), R0 + MOVW.P R0, 4(R2) + MOVW.P 4(R1), R0 + MOVW.P R0, 4(R2) + MOVW.P 4(R1), R0 + MOVW.P R0, 4(R2) + MOVW.P 4(R1), R0 + MOVW.P R0, 4(R2) + MOVW.P 4(R1), R0 + MOVW.P R0, 4(R2) + MOVW.P 4(R1), R0 + MOVW.P R0, 4(R2) + MOVW.P 4(R1), R0 + MOVW.P R0, 4(R2) + MOVW.P 4(R1), R0 + MOVW.P R0, 4(R2) + MOVW.P 4(R1), R0 + MOVW.P R0, 4(R2) + MOVW.P 4(R1), R0 + MOVW.P R0, 4(R2) + MOVW.P 4(R1), R0 + MOVW.P R0, 4(R2) + MOVW.P 4(R1), R0 + MOVW.P R0, 4(R2) + MOVW.P 4(R1), R0 + MOVW.P R0, 4(R2) + MOVW.P 4(R1), R0 + MOVW.P R0, 4(R2) + MOVW.P 4(R1), R0 + MOVW.P R0, 4(R2) + MOVW.P 4(R1), R0 + MOVW.P R0, 4(R2) + MOVW.P 4(R1), R0 + MOVW.P R0, 4(R2) + MOVW.P 4(R1), R0 + MOVW.P R0, 4(R2) + MOVW.P 4(R1), R0 + MOVW.P R0, 4(R2) + MOVW.P 4(R1), R0 + MOVW.P R0, 4(R2) + MOVW.P 4(R1), R0 + MOVW.P R0, 4(R2) + MOVW.P 4(R1), R0 + MOVW.P R0, 4(R2) + MOVW.P 4(R1), R0 + MOVW.P R0, 4(R2) + MOVW.P 4(R1), R0 + MOVW.P R0, 4(R2) + MOVW.P 4(R1), R0 + MOVW.P R0, 4(R2) + MOVW.P 4(R1), R0 + MOVW.P R0, 4(R2) + MOVW.P 4(R1), R0 + MOVW.P R0, 4(R2) + MOVW.P 4(R1), R0 + MOVW.P R0, 4(R2) + MOVW.P 4(R1), R0 + MOVW.P R0, 4(R2) + MOVW.P 4(R1), R0 + MOVW.P R0, 4(R2) + MOVW.P 4(R1), R0 + MOVW.P R0, 4(R2) + MOVW.P 4(R1), R0 + MOVW.P R0, 4(R2) + MOVW.P 4(R1), R0 + MOVW.P R0, 4(R2) + MOVW.P 4(R1), R0 + MOVW.P R0, 4(R2) + MOVW.P 4(R1), R0 + MOVW.P R0, 4(R2) + MOVW.P 4(R1), R0 + MOVW.P R0, 4(R2) + MOVW.P 4(R1), R0 + MOVW.P R0, 4(R2) + MOVW.P 4(R1), R0 + MOVW.P R0, 4(R2) + MOVW.P 4(R1), R0 + MOVW.P R0, 4(R2) + MOVW.P 4(R1), R0 + MOVW.P R0, 4(R2) + MOVW.P 4(R1), R0 + MOVW.P R0, 4(R2) + MOVW.P 4(R1), R0 + MOVW.P R0, 4(R2) + MOVW.P 4(R1), R0 + MOVW.P R0, 4(R2) + MOVW.P 4(R1), R0 
+ MOVW.P R0, 4(R2) + MOVW.P 4(R1), R0 + MOVW.P R0, 4(R2) + MOVW.P 4(R1), R0 + MOVW.P R0, 4(R2) + MOVW.P 4(R1), R0 + MOVW.P R0, 4(R2) + MOVW.P 4(R1), R0 + MOVW.P R0, 4(R2) + MOVW.P 4(R1), R0 + MOVW.P R0, 4(R2) + MOVW.P 4(R1), R0 + MOVW.P R0, 4(R2) + MOVW.P 4(R1), R0 + MOVW.P R0, 4(R2) + MOVW.P 4(R1), R0 + MOVW.P R0, 4(R2) + MOVW.P 4(R1), R0 + MOVW.P R0, 4(R2) + MOVW.P 4(R1), R0 + MOVW.P R0, 4(R2) + MOVW.P 4(R1), R0 + MOVW.P R0, 4(R2) + MOVW.P 4(R1), R0 + MOVW.P R0, 4(R2) + MOVW.P 4(R1), R0 + MOVW.P R0, 4(R2) + MOVW.P 4(R1), R0 + MOVW.P R0, 4(R2) + MOVW.P 4(R1), R0 + MOVW.P R0, 4(R2) + MOVW.P 4(R1), R0 + MOVW.P R0, 4(R2) + MOVW.P 4(R1), R0 + MOVW.P R0, 4(R2) + MOVW.P 4(R1), R0 + MOVW.P R0, 4(R2) + MOVW.P 4(R1), R0 + MOVW.P R0, 4(R2) + MOVW.P 4(R1), R0 + MOVW.P R0, 4(R2) + MOVW.P 4(R1), R0 + MOVW.P R0, 4(R2) + MOVW.P 4(R1), R0 + MOVW.P R0, 4(R2) + MOVW.P 4(R1), R0 + MOVW.P R0, 4(R2) + MOVW.P 4(R1), R0 + MOVW.P R0, 4(R2) + MOVW.P 4(R1), R0 + MOVW.P R0, 4(R2) + MOVW.P 4(R1), R0 + MOVW.P R0, 4(R2) + MOVW.P 4(R1), R0 + MOVW.P R0, 4(R2) + MOVW.P 4(R1), R0 + MOVW.P R0, 4(R2) + MOVW.P 4(R1), R0 + MOVW.P R0, 4(R2) + MOVW.P 4(R1), R0 + MOVW.P R0, 4(R2) + MOVW.P 4(R1), R0 + MOVW.P R0, 4(R2) + MOVW.P 4(R1), R0 + MOVW.P R0, 4(R2) + MOVW.P 4(R1), R0 + MOVW.P R0, 4(R2) + MOVW.P 4(R1), R0 + MOVW.P R0, 4(R2) + MOVW.P 4(R1), R0 + MOVW.P R0, 4(R2) + MOVW.P 4(R1), R0 + MOVW.P R0, 4(R2) + MOVW.P 4(R1), R0 + MOVW.P R0, 4(R2) + MOVW.P 4(R1), R0 + MOVW.P R0, 4(R2) + MOVW.P 4(R1), R0 + MOVW.P R0, 4(R2) + MOVW.P 4(R1), R0 + MOVW.P R0, 4(R2) + MOVW.P 4(R1), R0 + MOVW.P R0, 4(R2) + MOVW.P 4(R1), R0 + MOVW.P R0, 4(R2) + MOVW.P 4(R1), R0 + MOVW.P R0, 4(R2) + MOVW.P 4(R1), R0 + MOVW.P R0, 4(R2) + MOVW.P 4(R1), R0 + MOVW.P R0, 4(R2) + MOVW.P 4(R1), R0 + MOVW.P R0, 4(R2) + MOVW.P 4(R1), R0 + MOVW.P R0, 4(R2) + MOVW.P 4(R1), R0 + MOVW.P R0, 4(R2) + MOVW.P 4(R1), R0 + MOVW.P R0, 4(R2) + MOVW.P 4(R1), R0 + MOVW.P R0, 4(R2) + MOVW.P 4(R1), R0 + MOVW.P R0, 4(R2) + MOVW.P 4(R1), R0 + MOVW.P R0, 4(R2) + MOVW.P 4(R1), R0 + MOVW.P R0, 4(R2) + MOVW.P 4(R1), R0 + MOVW.P R0, 4(R2) + MOVW.P 4(R1), R0 + MOVW.P R0, 4(R2) + MOVW.P 4(R1), R0 + MOVW.P R0, 4(R2) + MOVW.P 4(R1), R0 + MOVW.P R0, 4(R2) + MOVW.P 4(R1), R0 + MOVW.P R0, 4(R2) + MOVW.P 4(R1), R0 + MOVW.P R0, 4(R2) + MOVW.P 4(R1), R0 + MOVW.P R0, 4(R2) + MOVW.P 4(R1), R0 + MOVW.P R0, 4(R2) + RET + +TEXT runtime·fastrand1(SB),NOSPLIT,$-4-4 + MOVW g_m(g), R1 + MOVW m_fastrand(R1), R0 + ADD.S R0, R0 + EOR.MI $0x88888eef, R0 + MOVW R0, m_fastrand(R1) + MOVW R0, ret+0(FP) + RET + +TEXT runtime·return0(SB),NOSPLIT,$0 + MOVW $0, R0 + RET + +TEXT runtime·procyield(SB),NOSPLIT,$-4 + MOVW cycles+0(FP), R1 + MOVW $0, R0 +yieldloop: + CMP R0, R1 + B.NE 2(PC) + RET + SUB $1, R1 + B yieldloop + +// Called from cgo wrappers, this function returns g->m->curg.stack.hi. +// Must obey the gcc calling convention. +TEXT _cgo_topofstack(SB),NOSPLIT,$8 + // R11 and g register are clobbered by load_g. They are + // callee-save in the gcc calling convention, so save them here. + MOVW R11, saveR11-4(SP) + MOVW g, saveG-8(SP) + + BL runtime·load_g(SB) + MOVW g_m(g), R0 + MOVW m_curg(R0), R0 + MOVW (g_stack+stack_hi)(R0), R0 + + MOVW saveG-8(SP), g + MOVW saveR11-4(SP), R11 + RET + +// The top-most function running on a goroutine +// returns to goexit+PCQuantum. 
+TEXT runtime·goexit(SB),NOSPLIT,$-4-0 + MOVW R0, R0 // NOP + BL runtime·goexit1(SB) // does not return diff --git a/src/runtime/atomic.go b/src/runtime/atomic.go new file mode 100644 index 000000000..7e9d9b3aa --- /dev/null +++ b/src/runtime/atomic.go @@ -0,0 +1,51 @@ +// Copyright 2014 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// +build !arm + +package runtime + +import "unsafe" + +//go:noescape +func xadd(ptr *uint32, delta int32) uint32 + +//go:noescape +func xadd64(ptr *uint64, delta int64) uint64 + +//go:noescape +func xchg(ptr *uint32, new uint32) uint32 + +//go:noescape +func xchg64(ptr *uint64, new uint64) uint64 + +//go:noescape +func xchgp(ptr unsafe.Pointer, new unsafe.Pointer) unsafe.Pointer + +//go:noescape +func xchguintptr(ptr *uintptr, new uintptr) uintptr + +//go:noescape +func atomicload(ptr *uint32) uint32 + +//go:noescape +func atomicload64(ptr *uint64) uint64 + +//go:noescape +func atomicloadp(ptr unsafe.Pointer) unsafe.Pointer + +//go:noescape +func atomicor8(ptr *uint8, val uint8) + +//go:noescape +func cas64(ptr *uint64, old, new uint64) bool + +//go:noescape +func atomicstore(ptr *uint32, val uint32) + +//go:noescape +func atomicstore64(ptr *uint64, val uint64) + +//go:noescape +func atomicstorep(ptr unsafe.Pointer, val unsafe.Pointer) diff --git a/src/runtime/atomic_386.c b/src/runtime/atomic_386.c new file mode 100644 index 000000000..82d36f2d9 --- /dev/null +++ b/src/runtime/atomic_386.c @@ -0,0 +1,46 @@ +// Copyright 2009 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#include "runtime.h" +#include "textflag.h" + +#pragma textflag NOSPLIT +uint32 +runtime·atomicload(uint32 volatile* addr) +{ + return *addr; +} + +#pragma textflag NOSPLIT +void* +runtime·atomicloadp(void* volatile* addr) +{ + return *addr; +} + +#pragma textflag NOSPLIT +uint64 +runtime·xadd64(uint64 volatile* addr, int64 v) +{ + uint64 old; + + do + old = *addr; + while(!runtime·cas64(addr, old, old+v)); + + return old+v; +} + +#pragma textflag NOSPLIT +uint64 +runtime·xchg64(uint64 volatile* addr, uint64 v) +{ + uint64 old; + + do + old = *addr; + while(!runtime·cas64(addr, old, v)); + + return old; +} diff --git a/src/runtime/atomic_amd64x.c b/src/runtime/atomic_amd64x.c new file mode 100644 index 000000000..7be57ac95 --- /dev/null +++ b/src/runtime/atomic_amd64x.c @@ -0,0 +1,29 @@ +// Copyright 2009 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// +build amd64 amd64p32 + +#include "runtime.h" +#include "textflag.h" + +#pragma textflag NOSPLIT +uint32 +runtime·atomicload(uint32 volatile* addr) +{ + return *addr; +} + +#pragma textflag NOSPLIT +uint64 +runtime·atomicload64(uint64 volatile* addr) +{ + return *addr; +} + +#pragma textflag NOSPLIT +void* +runtime·atomicloadp(void* volatile* addr) +{ + return *addr; +} diff --git a/src/runtime/atomic_arm.go b/src/runtime/atomic_arm.go new file mode 100644 index 000000000..b1632cdd1 --- /dev/null +++ b/src/runtime/atomic_arm.go @@ -0,0 +1,155 @@ +// Copyright 2009 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. 
+ +package runtime + +import "unsafe" + +var locktab [57]struct { + l mutex + pad [_CacheLineSize - unsafe.Sizeof(mutex{})]byte +} + +func addrLock(addr *uint64) *mutex { + return &locktab[(uintptr(unsafe.Pointer(addr))>>3)%uintptr(len(locktab))].l +} + +// Atomic add and return new value. +//go:nosplit +func xadd(val *uint32, delta int32) uint32 { + for { + oval := *val + nval := oval + uint32(delta) + if cas(val, oval, nval) { + return nval + } + } +} + +//go:nosplit +func xchg(addr *uint32, v uint32) uint32 { + for { + old := *addr + if cas(addr, old, v) { + return old + } + } +} + +//go:nosplit +func xchgp(addr *unsafe.Pointer, v unsafe.Pointer) unsafe.Pointer { + for { + old := *addr + if casp(addr, old, v) { + return old + } + } +} + +//go:nosplit +func xchguintptr(addr *uintptr, v uintptr) uintptr { + return uintptr(xchg((*uint32)(unsafe.Pointer(addr)), uint32(v))) +} + +//go:nosplit +func atomicload(addr *uint32) uint32 { + return xadd(addr, 0) +} + +//go:nosplit +func atomicloadp(addr unsafe.Pointer) unsafe.Pointer { + return unsafe.Pointer(uintptr(xadd((*uint32)(addr), 0))) +} + +//go:nosplit +func atomicstorep(addr unsafe.Pointer, v unsafe.Pointer) { + for { + old := *(*unsafe.Pointer)(addr) + if casp((*unsafe.Pointer)(addr), old, v) { + return + } + } +} + +//go:nosplit +func atomicstore(addr *uint32, v uint32) { + for { + old := *addr + if cas(addr, old, v) { + return + } + } +} + +//go:nosplit +func cas64(addr *uint64, old, new uint64) bool { + var ok bool + onM(func() { + lock(addrLock(addr)) + if *addr == old { + *addr = new + ok = true + } + unlock(addrLock(addr)) + }) + return ok +} + +//go:nosplit +func xadd64(addr *uint64, delta int64) uint64 { + var r uint64 + onM(func() { + lock(addrLock(addr)) + r = *addr + uint64(delta) + *addr = r + unlock(addrLock(addr)) + }) + return r +} + +//go:nosplit +func xchg64(addr *uint64, v uint64) uint64 { + var r uint64 + onM(func() { + lock(addrLock(addr)) + r = *addr + *addr = v + unlock(addrLock(addr)) + }) + return r +} + +//go:nosplit +func atomicload64(addr *uint64) uint64 { + var r uint64 + onM(func() { + lock(addrLock(addr)) + r = *addr + unlock(addrLock(addr)) + }) + return r +} + +//go:nosplit +func atomicstore64(addr *uint64, v uint64) { + onM(func() { + lock(addrLock(addr)) + *addr = v + unlock(addrLock(addr)) + }) +} + +//go:nosplit +func atomicor8(addr *uint8, v uint8) { + // Align down to 4 bytes and use 32-bit CAS. + uaddr := uintptr(unsafe.Pointer(addr)) + addr32 := (*uint32)(unsafe.Pointer(uaddr &^ 3)) + word := uint32(v) << ((uaddr & 3) * 8) // little endian + for { + old := *addr32 + if cas(addr32, old, old|word) { + return + } + } +} diff --git a/src/runtime/cgo/asm_386.s b/src/runtime/cgo/asm_386.s new file mode 100644 index 000000000..a895083f1 --- /dev/null +++ b/src/runtime/cgo/asm_386.s @@ -0,0 +1,31 @@ +// Copyright 2009 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#include "textflag.h" + +/* + * void crosscall2(void (*fn)(void*, int32), void*, int32) + * Save registers and call fn with two arguments. 
+ */ +TEXT crosscall2(SB),NOSPLIT,$0 + PUSHL BP + MOVL SP, BP + PUSHL BX + PUSHL SI + PUSHL DI + + SUBL $8, SP + MOVL 16(BP), AX + MOVL AX, 4(SP) + MOVL 12(BP), AX + MOVL AX, 0(SP) + MOVL 8(BP), AX + CALL AX + ADDL $8, SP + + POPL DI + POPL SI + POPL BX + POPL BP + RET diff --git a/src/runtime/cgo/asm_amd64.s b/src/runtime/cgo/asm_amd64.s new file mode 100644 index 000000000..6095bd133 --- /dev/null +++ b/src/runtime/cgo/asm_amd64.s @@ -0,0 +1,47 @@ +// Copyright 2009 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#include "textflag.h" + +/* + * void crosscall2(void (*fn)(void*, int32), void*, int32) + * Save registers and call fn with two arguments. + */ +TEXT crosscall2(SB),NOSPLIT,$0 + SUBQ $0x58, SP /* keeps stack pointer 32-byte aligned */ + MOVQ BX, 0x10(SP) + MOVQ BP, 0x18(SP) + MOVQ R12, 0x20(SP) + MOVQ R13, 0x28(SP) + MOVQ R14, 0x30(SP) + MOVQ R15, 0x38(SP) + +#ifdef GOOS_windows + // Win64 save RBX, RBP, RDI, RSI, RSP, R12, R13, R14, and R15 + MOVQ DI, 0x40(SP) + MOVQ SI, 0x48(SP) + + MOVQ DX, 0(SP) /* arg */ + MOVQ R8, 8(SP) /* argsize (includes padding) */ + + CALL CX /* fn */ + + MOVQ 0x40(SP), DI + MOVQ 0x48(SP), SI +#else + MOVQ SI, 0(SP) /* arg */ + MOVQ DX, 8(SP) /* argsize (includes padding) */ + + CALL DI /* fn */ +#endif + + MOVQ 0x10(SP), BX + MOVQ 0x18(SP), BP + MOVQ 0x20(SP), R12 + MOVQ 0x28(SP), R13 + MOVQ 0x30(SP), R14 + MOVQ 0x38(SP), R15 + + ADDQ $0x58, SP + RET diff --git a/src/runtime/cgo/asm_arm.s b/src/runtime/cgo/asm_arm.s new file mode 100644 index 000000000..6e57432e3 --- /dev/null +++ b/src/runtime/cgo/asm_arm.s @@ -0,0 +1,24 @@ +// Copyright 2012 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#include "textflag.h" + +/* + * void crosscall2(void (*fn)(void*, int32), void*, int32) + * Save registers and call fn with two arguments. + */ +TEXT crosscall2(SB),NOSPLIT,$-4 + /* + * We still need to save all callee save register as before, and then + * push 2 args for fn (R1 and R2). + * Also note that at procedure entry in 5c/5g world, 4(R13) will be the + * first arg, so we must push another dummy reg (R0) for 0(R13). + * Additionally, runtime·load_g will clobber R0, so we need to save R0 + * nevertheless. + */ + MOVM.WP [R0, R1, R2, R4, R5, R6, R7, R8, R9, g, R11, R12, R14], (R13) + BL runtime·load_g(SB) + MOVW PC, R14 + MOVW 0(R13), PC + MOVM.IAW (R13), [R0, R1, R2, R4, R5, R6, R7, R8, R9, g, R11, R12, PC] diff --git a/src/runtime/cgo/asm_nacl_amd64p32.s b/src/runtime/cgo/asm_nacl_amd64p32.s new file mode 100644 index 000000000..eb92014ed --- /dev/null +++ b/src/runtime/cgo/asm_nacl_amd64p32.s @@ -0,0 +1,13 @@ +// Copyright 2013 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#include "textflag.h" + +/* + * void crosscall2(void (*fn)(void*, int32), void*, int32) + * Save registers and call fn with two arguments. + */ +TEXT crosscall2(SB),NOSPLIT,$0 + INT $3 + RET diff --git a/src/runtime/cgo/callbacks.c b/src/runtime/cgo/callbacks.c new file mode 100644 index 000000000..282beeea8 --- /dev/null +++ b/src/runtime/cgo/callbacks.c @@ -0,0 +1,83 @@ +// Copyright 2011 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. 
+ +#include "../runtime.h" +#include "../cgocall.h" +#include "textflag.h" + +// These utility functions are available to be called from code +// compiled with gcc via crosscall2. + +// The declaration of crosscall2 is: +// void crosscall2(void (*fn)(void *, int), void *, int); +// +// We need to export the symbol crosscall2 in order to support +// callbacks from shared libraries. This applies regardless of +// linking mode. +#pragma cgo_export_static crosscall2 +#pragma cgo_export_dynamic crosscall2 + +// Allocate memory. This allocates the requested number of bytes in +// memory controlled by the Go runtime. The allocated memory will be +// zeroed. You are responsible for ensuring that the Go garbage +// collector can see a pointer to the allocated memory for as long as +// it is valid, e.g., by storing a pointer in a local variable in your +// C function, or in memory allocated by the Go runtime. If the only +// pointers are in a C global variable or in memory allocated via +// malloc, then the Go garbage collector may collect the memory. + +// Call like this in code compiled with gcc: +// struct { size_t len; void *ret; } a; +// a.len = /* number of bytes to allocate */; +// crosscall2(_cgo_allocate, &a, sizeof a); +// /* Here a.ret is a pointer to the allocated memory. */ + +void runtime·_cgo_allocate_internal(void); + +#pragma cgo_export_static _cgo_allocate +#pragma cgo_export_dynamic _cgo_allocate +#pragma textflag NOSPLIT +void +_cgo_allocate(void *a, int32 n) +{ + runtime·cgocallback((void(*)(void))runtime·_cgo_allocate_internal, a, n); +} + +// Panic. The argument is converted into a Go string. + +// Call like this in code compiled with gcc: +// struct { const char *p; } a; +// a.p = /* string to pass to panic */; +// crosscall2(_cgo_panic, &a, sizeof a); +// /* The function call will not return. */ + +void runtime·_cgo_panic_internal(void); + +#pragma cgo_export_static _cgo_panic +#pragma cgo_export_dynamic _cgo_panic +#pragma textflag NOSPLIT +void +_cgo_panic(void *a, int32 n) +{ + runtime·cgocallback((void(*)(void))runtime·_cgo_panic_internal, a, n); +} + +#pragma cgo_import_static x_cgo_init +extern void x_cgo_init(G*); +void (*_cgo_init)(G*) = x_cgo_init; + +#pragma cgo_import_static x_cgo_malloc +extern void x_cgo_malloc(void*); +void (*_cgo_malloc)(void*) = x_cgo_malloc; + +#pragma cgo_import_static x_cgo_free +extern void x_cgo_free(void*); +void (*_cgo_free)(void*) = x_cgo_free; + +#pragma cgo_import_static x_cgo_thread_start +extern void x_cgo_thread_start(void*); +void (*_cgo_thread_start)(void*) = x_cgo_thread_start; + +#pragma cgo_export_static _cgo_topofstack +#pragma cgo_export_dynamic _cgo_topofstack diff --git a/src/runtime/cgo/cgo.go b/src/runtime/cgo/cgo.go new file mode 100644 index 000000000..8528692f7 --- /dev/null +++ b/src/runtime/cgo/cgo.go @@ -0,0 +1,26 @@ +// Copyright 2010 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +/* +Package cgo contains runtime support for code generated +by the cgo tool. See the documentation for the cgo command +for details on using cgo. 
+*/ +package cgo + +/* + +#cgo darwin LDFLAGS: -lpthread +#cgo dragonfly LDFLAGS: -lpthread +#cgo freebsd LDFLAGS: -lpthread +#cgo android LDFLAGS: -llog +#cgo !android,linux LDFLAGS: -lpthread +#cgo netbsd LDFLAGS: -lpthread +#cgo openbsd LDFLAGS: -lpthread +#cgo windows LDFLAGS: -lm -mthreads + +#cgo CFLAGS: -Wall -Werror + +*/ +import "C" diff --git a/src/runtime/cgo/dragonfly.c b/src/runtime/cgo/dragonfly.c new file mode 100644 index 000000000..c233c8ba9 --- /dev/null +++ b/src/runtime/cgo/dragonfly.c @@ -0,0 +1,19 @@ +// Copyright 2010 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// +build dragonfly + +#include "textflag.h" + +// Supply environ and __progname, because we don't +// link against the standard DragonFly crt0.o and the +// libc dynamic library needs them. + +#pragma dataflag NOPTR +char *environ[1]; +#pragma dataflag NOPTR +char *__progname; + +#pragma dynexport environ environ +#pragma dynexport __progname __progname diff --git a/src/runtime/cgo/freebsd.c b/src/runtime/cgo/freebsd.c new file mode 100644 index 000000000..4876b2abe --- /dev/null +++ b/src/runtime/cgo/freebsd.c @@ -0,0 +1,19 @@ +// Copyright 2010 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// +build freebsd + +#include "textflag.h" + +// Supply environ and __progname, because we don't +// link against the standard FreeBSD crt0.o and the +// libc dynamic library needs them. + +#pragma dataflag NOPTR +char *environ[1]; +#pragma dataflag NOPTR +char *__progname; + +#pragma dynexport environ environ +#pragma dynexport __progname __progname diff --git a/src/runtime/cgo/gcc_386.S b/src/runtime/cgo/gcc_386.S new file mode 100644 index 000000000..bf4142793 --- /dev/null +++ b/src/runtime/cgo/gcc_386.S @@ -0,0 +1,45 @@ +// Copyright 2009 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +/* + * Apple still insists on underscore prefixes for C function names. + */ +#if defined(__APPLE__) || defined(_WIN32) +#define EXT(s) _##s +#else +#define EXT(s) s +#endif + +/* + * void crosscall_386(void (*fn)(void)) + * + * Calling into the 8c tool chain, where all registers are caller save. + * Called from standard x86 ABI, where %ebp, %ebx, %esi, + * and %edi are callee-save, so they must be saved explicitly. + */ +.globl EXT(crosscall_386) +EXT(crosscall_386): + pushl %ebp + movl %esp, %ebp + pushl %ebx + pushl %esi + pushl %edi + + movl 8(%ebp), %eax /* fn */ + call *%eax + + popl %edi + popl %esi + popl %ebx + popl %ebp + ret + +.globl EXT(__stack_chk_fail_local) +EXT(__stack_chk_fail_local): +1: + jmp 1b + +#ifdef __ELF__ +.section .note.GNU-stack,"",@progbits +#endif diff --git a/src/runtime/cgo/gcc_amd64.S b/src/runtime/cgo/gcc_amd64.S new file mode 100644 index 000000000..32d0200cf --- /dev/null +++ b/src/runtime/cgo/gcc_amd64.S @@ -0,0 +1,48 @@ +// Copyright 2009 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +/* + * Apple still insists on underscore prefixes for C function names. + */ +#if defined(__APPLE__) +#define EXT(s) _##s +#else +#define EXT(s) s +#endif + +/* + * void crosscall_amd64(void (*fn)(void)) + * + * Calling into the 6c tool chain, where all registers are caller save. 
+ * Called from standard x86-64 ABI, where %rbx, %rbp, %r12-%r15 + * are callee-save so they must be saved explicitly. + * The standard x86-64 ABI passes the three arguments m, g, fn + * in %rdi, %rsi, %rdx. + */ +.globl EXT(crosscall_amd64) +EXT(crosscall_amd64): + pushq %rbx + pushq %rbp + pushq %r12 + pushq %r13 + pushq %r14 + pushq %r15 + +#if defined(_WIN64) + call *%rcx /* fn */ +#else + call *%rdi /* fn */ +#endif + + popq %r15 + popq %r14 + popq %r13 + popq %r12 + popq %rbp + popq %rbx + ret + +#ifdef __ELF__ +.section .note.GNU-stack,"",@progbits +#endif diff --git a/src/runtime/cgo/gcc_android.c b/src/runtime/cgo/gcc_android.c new file mode 100644 index 000000000..be2772568 --- /dev/null +++ b/src/runtime/cgo/gcc_android.c @@ -0,0 +1,31 @@ +// Copyright 2014 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#include <stdarg.h> +#include <android/log.h> +#include "libcgo.h" + +void +fatalf(const char* format, ...) +{ + va_list ap; + + // Write to both stderr and logcat. + // + // When running from an .apk, /dev/stderr and /dev/stdout + // redirect to /dev/null. And when running a test binary + // via adb shell, it's easy to miss logcat. + + fprintf(stderr, "runtime/cgo: "); + va_start(ap, format); + vfprintf(stderr, format, ap); + va_end(ap); + fprintf(stderr, "\n"); + + va_start(ap, format); + __android_log_vprint(ANDROID_LOG_FATAL, "runtime/cgo", format, ap); + va_end(ap); + + abort(); +} diff --git a/src/runtime/cgo/gcc_android_arm.c b/src/runtime/cgo/gcc_android_arm.c new file mode 100644 index 000000000..07f7e72e3 --- /dev/null +++ b/src/runtime/cgo/gcc_android_arm.c @@ -0,0 +1,43 @@ +// Copyright 2014 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#include <pthread.h> +#include <signal.h> +#include <stdio.h> +#include <sys/limits.h> +#include "libcgo.h" + +#define magic1 (0x23581321U) + +// PTHREAD_KEYS_MAX has been added to sys/limits.h at head in bionic: +// https://android.googlesource.com/platform/bionic/+/master/libc/include/sys/limits.h +// TODO(crawshaw): remove this definition when a new NDK is released. +#define PTHREAD_KEYS_MAX 128 + +// inittls allocates a thread-local storage slot for g. +// +// It finds the first available slot using pthread_key_create and uses +// it as the offset value for runtime.tlsg. +static void +inittls(void **tlsg, void **tlsbase) +{ + pthread_key_t k; + int i, err; + + err = pthread_key_create(&k, nil); + if(err != 0) { + fatalf("pthread_key_create failed: %d", err); + } + pthread_setspecific(k, (void*)magic1); + for (i=0; i<PTHREAD_KEYS_MAX; i++) { + if (*(tlsbase+i) == (void*)magic1) { + *tlsg = (void*)(i*sizeof(void *)); + pthread_setspecific(k, 0); + return; + } + } + fatalf("could not find pthread key"); +} + +void (*x_cgo_inittls)(void **tlsg, void **tlsbase) = inittls; diff --git a/src/runtime/cgo/gcc_arm.S b/src/runtime/cgo/gcc_arm.S new file mode 100644 index 000000000..d5833bfad --- /dev/null +++ b/src/runtime/cgo/gcc_arm.S @@ -0,0 +1,42 @@ +// Copyright 2012 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +/* + * Apple still insists on underscore prefixes for C function names. 
+ */ +#if defined(__APPLE__) +#define EXT(s) _##s +#else +#define EXT(s) s +#endif + +/* + * void crosscall_arm1(void (*fn)(void), void (*setg_gcc)(void *g), void *g) + * + * Calling into the 5c tool chain, where all registers are caller save. + * Called from standard ARM EABI, where r4-r11 are callee-save, so they + * must be saved explicitly. + */ +.globl EXT(crosscall_arm1) +EXT(crosscall_arm1): + push {r4, r5, r6, r7, r8, r9, r10, r11, ip, lr} + mov r4, r0 + mov r5, r1 + mov r0, r2 + + // Because the assembler might target an earlier revision of the ISA + // by default, we encode BLX as a .word. + .word 0xe12fff35 // blx r5 // setg(g) + .word 0xe12fff34 // blx r4 // fn() + + pop {r4, r5, r6, r7, r8, r9, r10, r11, ip, pc} + +.globl EXT(__stack_chk_fail_local) +EXT(__stack_chk_fail_local): +1: + b 1b + +#ifdef __ELF__ +.section .note.GNU-stack,"",%progbits +#endif diff --git a/src/runtime/cgo/gcc_darwin_386.c b/src/runtime/cgo/gcc_darwin_386.c new file mode 100644 index 000000000..6668ba4a2 --- /dev/null +++ b/src/runtime/cgo/gcc_darwin_386.c @@ -0,0 +1,148 @@ +// Copyright 2009 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#include <string.h> /* for strerror */ +#include <pthread.h> +#include <signal.h> +#include "libcgo.h" + +static void* threadentry(void*); +static pthread_key_t k1; + +#define magic1 (0x23581321U) + +static void +inittls(void) +{ + uint32 x; + pthread_key_t tofree[128], k; + int i, ntofree; + + /* + * Allocate thread-local storage slot for g. + * The key numbers start at 0x100, and we expect to be + * one of the early calls to pthread_key_create, so we + * should be able to get a pretty low number. + * + * In Darwin/386 pthreads, %gs points at the thread + * structure, and each key is an index into the thread-local + * storage array that begins at offset 0x48 within in that structure. + * It may happen that we are not quite the first function to try + * to allocate thread-local storage keys, so instead of depending + * on getting 0x100, we try for 0x108, allocating keys until + * we get the one we want and then freeing the ones we didn't want. + * + * Thus the final offset to use in %gs references is + * 0x48+4*0x108 = 0x468. + * + * The linker and runtime hard-code this constant offset + * from %gs where we expect to find g. + * Known to ../../../liblink/sym.c:/468 + * and to ../sys_darwin_386.s:/468 + * + * This is truly disgusting and a bit fragile, but taking care + * of it here protects the rest of the system from damage. + * The alternative would be to use a global variable that + * held the offset and refer to that variable each time we + * need a %gs variable (g). That approach would + * require an extra instruction and memory reference in + * every stack growth prolog and would also require + * rewriting the code that 8c generates for extern registers. + * + * Things get more disgusting on OS X 10.7 Lion. + * The 0x48 base mentioned above is the offset of the tsd + * array within the per-thread structure on Leopard and Snow Leopard. + * On Lion, the base moved a little, so while the math above + * still applies, the base is different. Thus, we cannot + * look for specific key values if we want to build binaries + * that run on both systems. Instead, forget about the + * specific key values and just allocate and initialize per-thread + * storage until we find a key that writes to the memory location + * we want. Then keep that key. 
+ */ + ntofree = 0; + for(;;) { + if(pthread_key_create(&k, nil) < 0) { + fprintf(stderr, "runtime/cgo: pthread_key_create failed\n"); + abort(); + } + pthread_setspecific(k, (void*)magic1); + asm volatile("movl %%gs:0x468, %0" : "=r"(x)); + pthread_setspecific(k, 0); + if(x == magic1) { + k1 = k; + break; + } + if(ntofree >= nelem(tofree)) { + fprintf(stderr, "runtime/cgo: could not obtain pthread_keys\n"); + fprintf(stderr, "\ttried"); + for(i=0; i<ntofree; i++) + fprintf(stderr, " %#x", (unsigned)tofree[i]); + fprintf(stderr, "\n"); + abort(); + } + tofree[ntofree++] = k; + } + + /* + * We got the key we wanted. Free the others. + */ + for(i=0; i<ntofree; i++) + pthread_key_delete(tofree[i]); +} + +void +x_cgo_init(G *g) +{ + pthread_attr_t attr; + size_t size; + + pthread_attr_init(&attr); + pthread_attr_getstacksize(&attr, &size); + g->stacklo = (uintptr)&attr - size + 4096; + pthread_attr_destroy(&attr); + + inittls(); +} + + +void +_cgo_sys_thread_start(ThreadStart *ts) +{ + pthread_attr_t attr; + sigset_t ign, oset; + pthread_t p; + size_t size; + int err; + + sigfillset(&ign); + pthread_sigmask(SIG_SETMASK, &ign, &oset); + + pthread_attr_init(&attr); + pthread_attr_getstacksize(&attr, &size); + // Leave stacklo=0 and set stackhi=size; mstack will do the rest. + ts->g->stackhi = size; + err = pthread_create(&p, &attr, threadentry, ts); + + pthread_sigmask(SIG_SETMASK, &oset, nil); + + if (err != 0) { + fprintf(stderr, "runtime/cgo: pthread_create failed: %s\n", strerror(err)); + abort(); + } +} + +static void* +threadentry(void *v) +{ + ThreadStart ts; + + ts = *(ThreadStart*)v; + free(v); + + pthread_setspecific(k1, (void*)ts.g); + + crosscall_386(ts.fn); + return nil; +} diff --git a/src/runtime/cgo/gcc_darwin_amd64.c b/src/runtime/cgo/gcc_darwin_amd64.c new file mode 100644 index 000000000..dc679acab --- /dev/null +++ b/src/runtime/cgo/gcc_darwin_amd64.c @@ -0,0 +1,119 @@ +// Copyright 2009 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#include <string.h> /* for strerror */ +#include <pthread.h> +#include <signal.h> +#include "libcgo.h" + +static void* threadentry(void*); +static pthread_key_t k1; + +#define magic1 (0x23581321345589ULL) + +static void +inittls(void) +{ + uint64 x; + pthread_key_t tofree[128], k; + int i, ntofree; + + /* + * Same logic, code as darwin_386.c:/inittls, except that words + * are 8 bytes long now, and the thread-local storage starts + * at 0x60 on Leopard / Snow Leopard. So the offset is + * 0x60+8*0x108 = 0x8a0. + * + * The linker and runtime hard-code this constant offset + * from %gs where we expect to find g. + * Known to ../../../liblink/sym.c:/8a0 + * and to ../sys_darwin_amd64.s:/8a0 + * + * As disgusting as on the 386; same justification. + */ + ntofree = 0; + for(;;) { + if(pthread_key_create(&k, nil) < 0) { + fprintf(stderr, "runtime/cgo: pthread_key_create failed\n"); + abort(); + } + pthread_setspecific(k, (void*)magic1); + asm volatile("movq %%gs:0x8a0, %0" : "=r"(x)); + pthread_setspecific(k, 0); + if(x == magic1) { + k1 = k; + break; + } + if(ntofree >= nelem(tofree)) { + fprintf(stderr, "runtime/cgo: could not obtain pthread_keys\n"); + fprintf(stderr, "\ttried"); + for(i=0; i<ntofree; i++) + fprintf(stderr, " %#x", (unsigned)tofree[i]); + fprintf(stderr, "\n"); + abort(); + } + tofree[ntofree++] = k; + } + + /* + * We got the key we wanted. Free the others. 
+ */ + for(i=0; i<ntofree; i++) + pthread_key_delete(tofree[i]); +} + +void +x_cgo_init(G *g) +{ + pthread_attr_t attr; + size_t size; + + pthread_attr_init(&attr); + pthread_attr_getstacksize(&attr, &size); + g->stacklo = (uintptr)&attr - size + 4096; + pthread_attr_destroy(&attr); + + inittls(); +} + + +void +_cgo_sys_thread_start(ThreadStart *ts) +{ + pthread_attr_t attr; + sigset_t ign, oset; + pthread_t p; + size_t size; + int err; + + sigfillset(&ign); + pthread_sigmask(SIG_SETMASK, &ign, &oset); + + pthread_attr_init(&attr); + pthread_attr_getstacksize(&attr, &size); + // Leave stacklo=0 and set stackhi=size; mstack will do the rest. + ts->g->stackhi = size; + err = pthread_create(&p, &attr, threadentry, ts); + + pthread_sigmask(SIG_SETMASK, &oset, nil); + + if (err != 0) { + fprintf(stderr, "runtime/cgo: pthread_create failed: %s\n", strerror(err)); + abort(); + } +} + +static void* +threadentry(void *v) +{ + ThreadStart ts; + + ts = *(ThreadStart*)v; + free(v); + + pthread_setspecific(k1, (void*)ts.g); + + crosscall_amd64(ts.fn); + return nil; +} diff --git a/src/runtime/cgo/gcc_dragonfly_386.c b/src/runtime/cgo/gcc_dragonfly_386.c new file mode 100644 index 000000000..074418f77 --- /dev/null +++ b/src/runtime/cgo/gcc_dragonfly_386.c @@ -0,0 +1,70 @@ +// Copyright 2009 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#include <sys/types.h> +#include <sys/signalvar.h> +#include <pthread.h> +#include <signal.h> +#include <string.h> +#include "libcgo.h" + +static void* threadentry(void*); +static void (*setg_gcc)(void*); + +void +x_cgo_init(G *g, void (*setg)(void*)) +{ + pthread_attr_t attr; + size_t size; + + setg_gcc = setg; + pthread_attr_init(&attr); + pthread_attr_getstacksize(&attr, &size); + g->stacklo = (uintptr)&attr - size + 4096; + pthread_attr_destroy(&attr); +} + + +void +_cgo_sys_thread_start(ThreadStart *ts) +{ + pthread_attr_t attr; + sigset_t ign, oset; + pthread_t p; + size_t size; + int err; + + SIGFILLSET(ign); + pthread_sigmask(SIG_SETMASK, &ign, &oset); + + pthread_attr_init(&attr); + pthread_attr_getstacksize(&attr, &size); + // Leave stacklo=0 and set stackhi=size; mstack will do the rest. + ts->g->stackhi = size; + err = pthread_create(&p, &attr, threadentry, ts); + + pthread_sigmask(SIG_SETMASK, &oset, nil); + + if (err != 0) { + fprintf(stderr, "runtime/cgo: pthread_create failed: %s\n", strerror(err)); + abort(); + } +} + +static void* +threadentry(void *v) +{ + ThreadStart ts; + + ts = *(ThreadStart*)v; + free(v); + + /* + * Set specific keys. + */ + setg_gcc((void*)ts.g); + + crosscall_386(ts.fn); + return nil; +} diff --git a/src/runtime/cgo/gcc_dragonfly_amd64.c b/src/runtime/cgo/gcc_dragonfly_amd64.c new file mode 100644 index 000000000..f79f652e4 --- /dev/null +++ b/src/runtime/cgo/gcc_dragonfly_amd64.c @@ -0,0 +1,70 @@ +// Copyright 2009 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. 
+ +#include <sys/types.h> +#include <sys/signalvar.h> +#include <pthread.h> +#include <signal.h> +#include <string.h> +#include "libcgo.h" + +static void* threadentry(void*); +static void (*setg_gcc)(void*); + +void +x_cgo_init(G *g, void (*setg)(void*)) +{ + pthread_attr_t attr; + size_t size; + + setg_gcc = setg; + pthread_attr_init(&attr); + pthread_attr_getstacksize(&attr, &size); + g->stacklo = (uintptr)&attr - size + 4096; + pthread_attr_destroy(&attr); +} + +void +_cgo_sys_thread_start(ThreadStart *ts) +{ + pthread_attr_t attr; + sigset_t ign, oset; + pthread_t p; + size_t size; + int err; + + SIGFILLSET(ign); + pthread_sigmask(SIG_SETMASK, &ign, &oset); + + pthread_attr_init(&attr); + pthread_attr_getstacksize(&attr, &size); + + // Leave stacklo=0 and set stackhi=size; mstack will do the rest. + ts->g->stackhi = size; + err = pthread_create(&p, &attr, threadentry, ts); + + pthread_sigmask(SIG_SETMASK, &oset, nil); + + if (err != 0) { + fprintf(stderr, "runtime/cgo: pthread_create failed: %s\n", strerror(err)); + abort(); + } +} + +static void* +threadentry(void *v) +{ + ThreadStart ts; + + ts = *(ThreadStart*)v; + free(v); + + /* + * Set specific keys. + */ + setg_gcc((void*)ts.g); + + crosscall_amd64(ts.fn); + return nil; +} diff --git a/src/runtime/cgo/gcc_fatalf.c b/src/runtime/cgo/gcc_fatalf.c new file mode 100644 index 000000000..21c1acfaa --- /dev/null +++ b/src/runtime/cgo/gcc_fatalf.c @@ -0,0 +1,23 @@ +// Copyright 2014 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// +build !android,linux + +#include <stdarg.h> +#include <stdio.h> +#include <stdlib.h> +#include "libcgo.h" + +void +fatalf(const char* format, ...) +{ + va_list ap; + + fprintf(stderr, "runtime/cgo: "); + va_start(ap, format); + vfprintf(stderr, format, ap); + va_end(ap); + fprintf(stderr, "\n"); + abort(); +} diff --git a/src/runtime/cgo/gcc_freebsd_386.c b/src/runtime/cgo/gcc_freebsd_386.c new file mode 100644 index 000000000..074418f77 --- /dev/null +++ b/src/runtime/cgo/gcc_freebsd_386.c @@ -0,0 +1,70 @@ +// Copyright 2009 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#include <sys/types.h> +#include <sys/signalvar.h> +#include <pthread.h> +#include <signal.h> +#include <string.h> +#include "libcgo.h" + +static void* threadentry(void*); +static void (*setg_gcc)(void*); + +void +x_cgo_init(G *g, void (*setg)(void*)) +{ + pthread_attr_t attr; + size_t size; + + setg_gcc = setg; + pthread_attr_init(&attr); + pthread_attr_getstacksize(&attr, &size); + g->stacklo = (uintptr)&attr - size + 4096; + pthread_attr_destroy(&attr); +} + + +void +_cgo_sys_thread_start(ThreadStart *ts) +{ + pthread_attr_t attr; + sigset_t ign, oset; + pthread_t p; + size_t size; + int err; + + SIGFILLSET(ign); + pthread_sigmask(SIG_SETMASK, &ign, &oset); + + pthread_attr_init(&attr); + pthread_attr_getstacksize(&attr, &size); + // Leave stacklo=0 and set stackhi=size; mstack will do the rest. + ts->g->stackhi = size; + err = pthread_create(&p, &attr, threadentry, ts); + + pthread_sigmask(SIG_SETMASK, &oset, nil); + + if (err != 0) { + fprintf(stderr, "runtime/cgo: pthread_create failed: %s\n", strerror(err)); + abort(); + } +} + +static void* +threadentry(void *v) +{ + ThreadStart ts; + + ts = *(ThreadStart*)v; + free(v); + + /* + * Set specific keys. 
+ */ + setg_gcc((void*)ts.g); + + crosscall_386(ts.fn); + return nil; +} diff --git a/src/runtime/cgo/gcc_freebsd_amd64.c b/src/runtime/cgo/gcc_freebsd_amd64.c new file mode 100644 index 000000000..f79f652e4 --- /dev/null +++ b/src/runtime/cgo/gcc_freebsd_amd64.c @@ -0,0 +1,70 @@ +// Copyright 2009 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#include <sys/types.h> +#include <sys/signalvar.h> +#include <pthread.h> +#include <signal.h> +#include <string.h> +#include "libcgo.h" + +static void* threadentry(void*); +static void (*setg_gcc)(void*); + +void +x_cgo_init(G *g, void (*setg)(void*)) +{ + pthread_attr_t attr; + size_t size; + + setg_gcc = setg; + pthread_attr_init(&attr); + pthread_attr_getstacksize(&attr, &size); + g->stacklo = (uintptr)&attr - size + 4096; + pthread_attr_destroy(&attr); +} + +void +_cgo_sys_thread_start(ThreadStart *ts) +{ + pthread_attr_t attr; + sigset_t ign, oset; + pthread_t p; + size_t size; + int err; + + SIGFILLSET(ign); + pthread_sigmask(SIG_SETMASK, &ign, &oset); + + pthread_attr_init(&attr); + pthread_attr_getstacksize(&attr, &size); + + // Leave stacklo=0 and set stackhi=size; mstack will do the rest. + ts->g->stackhi = size; + err = pthread_create(&p, &attr, threadentry, ts); + + pthread_sigmask(SIG_SETMASK, &oset, nil); + + if (err != 0) { + fprintf(stderr, "runtime/cgo: pthread_create failed: %s\n", strerror(err)); + abort(); + } +} + +static void* +threadentry(void *v) +{ + ThreadStart ts; + + ts = *(ThreadStart*)v; + free(v); + + /* + * Set specific keys. + */ + setg_gcc((void*)ts.g); + + crosscall_amd64(ts.fn); + return nil; +} diff --git a/src/runtime/cgo/gcc_freebsd_arm.c b/src/runtime/cgo/gcc_freebsd_arm.c new file mode 100644 index 000000000..2a86a9117 --- /dev/null +++ b/src/runtime/cgo/gcc_freebsd_arm.c @@ -0,0 +1,82 @@ +// Copyright 2012 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#include <sys/types.h> +#include <machine/sysarch.h> +#include <sys/signalvar.h> +#include <pthread.h> +#include <signal.h> +#include <string.h> +#include "libcgo.h" + +#ifdef ARM_TP_ADDRESS +// ARM_TP_ADDRESS is (ARM_VECTORS_HIGH + 0x1000) or 0xffff1000 +// and is known to runtime.read_tls_fallback. Verify it with +// cpp. +#if ARM_TP_ADDRESS != 0xffff1000 +#error Wrong ARM_TP_ADDRESS! +#endif +#endif + +static void *threadentry(void*); + +static void (*setg_gcc)(void*); + +void +x_cgo_init(G *g, void (*setg)(void*)) +{ + pthread_attr_t attr; + size_t size; + + setg_gcc = setg; + pthread_attr_init(&attr); + pthread_attr_getstacksize(&attr, &size); + g->stacklo = (uintptr)&attr - size + 4096; + pthread_attr_destroy(&attr); +} + + +void +_cgo_sys_thread_start(ThreadStart *ts) +{ + pthread_attr_t attr; + sigset_t ign, oset; + pthread_t p; + size_t size; + int err; + + SIGFILLSET(ign); + pthread_sigmask(SIG_SETMASK, &ign, &oset); + + // Not sure why the memset is necessary here, + // but without it, we get a bogus stack size + // out of pthread_attr_getstacksize. C'est la Linux. + memset(&attr, 0, sizeof attr); + pthread_attr_init(&attr); + size = 0; + pthread_attr_getstacksize(&attr, &size); + // Leave stacklo=0 and set stackhi=size; mstack will do the rest. 
+ ts->g->stackhi = size; + err = pthread_create(&p, &attr, threadentry, ts); + + pthread_sigmask(SIG_SETMASK, &oset, nil); + + if (err != 0) { + fprintf(stderr, "runtime/cgo: pthread_create failed: %s\n", strerror(err)); + abort(); + } +} + +extern void crosscall_arm1(void (*fn)(void), void (*setg_gcc)(void*), void *g); +static void* +threadentry(void *v) +{ + ThreadStart ts; + + ts = *(ThreadStart*)v; + free(v); + + crosscall_arm1(ts.fn, setg_gcc, (void*)ts.g); + return nil; +} diff --git a/src/runtime/cgo/gcc_linux_386.c b/src/runtime/cgo/gcc_linux_386.c new file mode 100644 index 000000000..9801c87bd --- /dev/null +++ b/src/runtime/cgo/gcc_linux_386.c @@ -0,0 +1,72 @@ +// Copyright 2009 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#include <pthread.h> +#include <string.h> +#include <signal.h> +#include "libcgo.h" + +static void *threadentry(void*); +static void (*setg_gcc)(void*); + +void +x_cgo_init(G *g, void (*setg)(void*)) +{ + pthread_attr_t attr; + size_t size; + + setg_gcc = setg; + pthread_attr_init(&attr); + pthread_attr_getstacksize(&attr, &size); + g->stacklo = (uintptr)&attr - size + 4096; + pthread_attr_destroy(&attr); +} + + +void +_cgo_sys_thread_start(ThreadStart *ts) +{ + pthread_attr_t attr; + sigset_t ign, oset; + pthread_t p; + size_t size; + int err; + + sigfillset(&ign); + pthread_sigmask(SIG_SETMASK, &ign, &oset); + + // Not sure why the memset is necessary here, + // but without it, we get a bogus stack size + // out of pthread_attr_getstacksize. C'est la Linux. + memset(&attr, 0, sizeof attr); + pthread_attr_init(&attr); + size = 0; + pthread_attr_getstacksize(&attr, &size); + // Leave stacklo=0 and set stackhi=size; mstack will do the rest. + ts->g->stackhi = size; + err = pthread_create(&p, &attr, threadentry, ts); + + pthread_sigmask(SIG_SETMASK, &oset, nil); + + if (err != 0) { + fatalf("pthread_create failed: %s", strerror(err)); + } +} + +static void* +threadentry(void *v) +{ + ThreadStart ts; + + ts = *(ThreadStart*)v; + free(v); + + /* + * Set specific keys. + */ + setg_gcc((void*)ts.g); + + crosscall_386(ts.fn); + return nil; +} diff --git a/src/runtime/cgo/gcc_linux_amd64.c b/src/runtime/cgo/gcc_linux_amd64.c new file mode 100644 index 000000000..275d5ddac --- /dev/null +++ b/src/runtime/cgo/gcc_linux_amd64.c @@ -0,0 +1,67 @@ +// Copyright 2009 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#include <pthread.h> +#include <string.h> // strerror +#include <signal.h> +#include "libcgo.h" + +static void* threadentry(void*); +static void (*setg_gcc)(void*); + +void +x_cgo_init(G* g, void (*setg)(void*)) +{ + pthread_attr_t attr; + size_t size; + + setg_gcc = setg; + pthread_attr_init(&attr); + pthread_attr_getstacksize(&attr, &size); + g->stacklo = (uintptr)&attr - size + 4096; + pthread_attr_destroy(&attr); +} + + +void +_cgo_sys_thread_start(ThreadStart *ts) +{ + pthread_attr_t attr; + sigset_t ign, oset; + pthread_t p; + size_t size; + int err; + + sigfillset(&ign); + pthread_sigmask(SIG_SETMASK, &ign, &oset); + + pthread_attr_init(&attr); + pthread_attr_getstacksize(&attr, &size); + // Leave stacklo=0 and set stackhi=size; mstack will do the rest. 
+ ts->g->stackhi = size; + err = pthread_create(&p, &attr, threadentry, ts); + + pthread_sigmask(SIG_SETMASK, &oset, nil); + + if (err != 0) { + fatalf("pthread_create failed: %s", strerror(err)); + } +} + +static void* +threadentry(void *v) +{ + ThreadStart ts; + + ts = *(ThreadStart*)v; + free(v); + + /* + * Set specific keys. + */ + setg_gcc((void*)ts.g); + + crosscall_amd64(ts.fn); + return nil; +} diff --git a/src/runtime/cgo/gcc_linux_arm.c b/src/runtime/cgo/gcc_linux_arm.c new file mode 100644 index 000000000..7d4b4d6d4 --- /dev/null +++ b/src/runtime/cgo/gcc_linux_arm.c @@ -0,0 +1,73 @@ +// Copyright 2010 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#include <pthread.h> +#include <string.h> +#include <signal.h> +#include "libcgo.h" + +static void *threadentry(void*); + +void (*x_cgo_inittls)(void **tlsg, void **tlsbase); +void (*setg_gcc)(void*); + +void +_cgo_sys_thread_start(ThreadStart *ts) +{ + pthread_attr_t attr; + sigset_t ign, oset; + pthread_t p; + size_t size; + int err; + + sigfillset(&ign); + pthread_sigmask(SIG_SETMASK, &ign, &oset); + + // Not sure why the memset is necessary here, + // but without it, we get a bogus stack size + // out of pthread_attr_getstacksize. C'est la Linux. + memset(&attr, 0, sizeof attr); + pthread_attr_init(&attr); + size = 0; + pthread_attr_getstacksize(&attr, &size); + // Leave stacklo=0 and set stackhi=size; mstack will do the rest. + ts->g->stackhi = size; + err = pthread_create(&p, &attr, threadentry, ts); + + pthread_sigmask(SIG_SETMASK, &oset, nil); + + if (err != 0) { + fatalf("pthread_create failed: %s", strerror(err)); + } +} + +extern void crosscall_arm1(void (*fn)(void), void (*setg_gcc)(void*), void *g); +static void* +threadentry(void *v) +{ + ThreadStart ts; + + ts = *(ThreadStart*)v; + free(v); + + crosscall_arm1(ts.fn, setg_gcc, (void*)ts.g); + return nil; +} + +void +x_cgo_init(G *g, void (*setg)(void*), void **tlsg, void **tlsbase) +{ + pthread_attr_t attr; + size_t size; + + setg_gcc = setg; + pthread_attr_init(&attr); + pthread_attr_getstacksize(&attr, &size); + g->stacklo = (uintptr)&attr - size + 4096; + pthread_attr_destroy(&attr); + + if (x_cgo_inittls) { + x_cgo_inittls(tlsg, tlsbase); + } +} diff --git a/src/runtime/cgo/gcc_netbsd_386.c b/src/runtime/cgo/gcc_netbsd_386.c new file mode 100644 index 000000000..2505e6dc7 --- /dev/null +++ b/src/runtime/cgo/gcc_netbsd_386.c @@ -0,0 +1,69 @@ +// Copyright 2009 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#include <sys/types.h> +#include <pthread.h> +#include <signal.h> +#include <string.h> +#include "libcgo.h" + +static void* threadentry(void*); +static void (*setg_gcc)(void*); + +void +x_cgo_init(G *g, void (*setg)(void*)) +{ + pthread_attr_t attr; + size_t size; + + setg_gcc = setg; + pthread_attr_init(&attr); + pthread_attr_getstacksize(&attr, &size); + g->stacklo = (uintptr)&attr - size + 4096; + pthread_attr_destroy(&attr); +} + + +void +_cgo_sys_thread_start(ThreadStart *ts) +{ + pthread_attr_t attr; + sigset_t ign, oset; + pthread_t p; + size_t size; + int err; + + sigfillset(&ign); + pthread_sigmask(SIG_SETMASK, &ign, &oset); + + pthread_attr_init(&attr); + pthread_attr_getstacksize(&attr, &size); + // Leave stacklo=0 and set stackhi=size; mstack will do the rest. 
+ ts->g->stackhi = size; + err = pthread_create(&p, &attr, threadentry, ts); + + pthread_sigmask(SIG_SETMASK, &oset, nil); + + if (err != 0) { + fprintf(stderr, "runtime/cgo: pthread_create failed: %s\n", strerror(err)); + abort(); + } +} + +static void* +threadentry(void *v) +{ + ThreadStart ts; + + ts = *(ThreadStart*)v; + free(v); + + /* + * Set specific keys. + */ + setg_gcc((void*)ts.g); + + crosscall_386(ts.fn); + return nil; +} diff --git a/src/runtime/cgo/gcc_netbsd_amd64.c b/src/runtime/cgo/gcc_netbsd_amd64.c new file mode 100644 index 000000000..8f646502d --- /dev/null +++ b/src/runtime/cgo/gcc_netbsd_amd64.c @@ -0,0 +1,70 @@ +// Copyright 2009 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#include <sys/types.h> +#include <pthread.h> +#include <signal.h> +#include <string.h> +#include "libcgo.h" + +static void* threadentry(void*); +static void (*setg_gcc)(void*); + +void +x_cgo_init(G *g, void (*setg)(void*)) +{ + pthread_attr_t attr; + size_t size; + + setg_gcc = setg; + pthread_attr_init(&attr); + pthread_attr_getstacksize(&attr, &size); + g->stacklo = (uintptr)&attr - size + 4096; + pthread_attr_destroy(&attr); +} + + +void +_cgo_sys_thread_start(ThreadStart *ts) +{ + pthread_attr_t attr; + sigset_t ign, oset; + pthread_t p; + size_t size; + int err; + + sigfillset(&ign); + pthread_sigmask(SIG_SETMASK, &ign, &oset); + + pthread_attr_init(&attr); + pthread_attr_getstacksize(&attr, &size); + + // Leave stacklo=0 and set stackhi=size; mstack will do the rest. + ts->g->stackhi = size; + err = pthread_create(&p, &attr, threadentry, ts); + + pthread_sigmask(SIG_SETMASK, &oset, nil); + + if (err != 0) { + fprintf(stderr, "runtime/cgo: pthread_create failed: %s\n", strerror(err)); + abort(); + } +} + +static void* +threadentry(void *v) +{ + ThreadStart ts; + + ts = *(ThreadStart*)v; + free(v); + + /* + * Set specific keys. + */ + setg_gcc((void*)ts.g); + + crosscall_amd64(ts.fn); + return nil; +} diff --git a/src/runtime/cgo/gcc_netbsd_arm.c b/src/runtime/cgo/gcc_netbsd_arm.c new file mode 100644 index 000000000..7a98c0de2 --- /dev/null +++ b/src/runtime/cgo/gcc_netbsd_arm.c @@ -0,0 +1,66 @@ +// Copyright 2013 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#include <sys/types.h> +#include <pthread.h> +#include <signal.h> +#include <string.h> +#include "libcgo.h" + +static void *threadentry(void*); + +static void (*setg_gcc)(void*); + +void +x_cgo_init(G *g, void (*setg)(void*)) +{ + pthread_attr_t attr; + size_t size; + + setg_gcc = setg; + pthread_attr_init(&attr); + pthread_attr_getstacksize(&attr, &size); + g->stacklo = (uintptr)&attr - size + 4096; + pthread_attr_destroy(&attr); +} + + +void +_cgo_sys_thread_start(ThreadStart *ts) +{ + pthread_attr_t attr; + sigset_t ign, oset; + pthread_t p; + size_t size; + int err; + + sigfillset(&ign); + pthread_sigmask(SIG_SETMASK, &ign, &oset); + + pthread_attr_init(&attr); + pthread_attr_getstacksize(&attr, &size); + // Leave stacklo=0 and set stackhi=size; mstack will do the rest. 
+ ts->g->stackhi = size; + err = pthread_create(&p, &attr, threadentry, ts); + + pthread_sigmask(SIG_SETMASK, &oset, nil); + + if (err != 0) { + fprintf(stderr, "runtime/cgo: pthread_create failed: %s\n", strerror(err)); + abort(); + } +} + +extern void crosscall_arm1(void (*fn)(void), void (*setg_gcc)(void*), void *g); +static void* +threadentry(void *v) +{ + ThreadStart ts; + + ts = *(ThreadStart*)v; + free(v); + + crosscall_arm1(ts.fn, setg_gcc, (void*)ts.g); + return nil; +} diff --git a/src/runtime/cgo/gcc_openbsd_386.c b/src/runtime/cgo/gcc_openbsd_386.c new file mode 100644 index 000000000..582e943f3 --- /dev/null +++ b/src/runtime/cgo/gcc_openbsd_386.c @@ -0,0 +1,158 @@ +// Copyright 2009 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#include <sys/types.h> +#include <dlfcn.h> +#include <errno.h> +#include <pthread.h> +#include <signal.h> +#include <string.h> +#include "libcgo.h" + +static void* threadentry(void*); +static void (*setg_gcc)(void*); + +// TCB_SIZE is sizeof(struct thread_control_block), +// as defined in /usr/src/lib/librthread/tcb.h +#define TCB_SIZE (4 * sizeof(void *)) +#define TLS_SIZE (2 * sizeof(void *)) + +void *__get_tcb(void); +void __set_tcb(void *); + +static int (*sys_pthread_create)(pthread_t *thread, const pthread_attr_t *attr, + void *(*start_routine)(void *), void *arg); + +struct thread_args { + void *(*func)(void *); + void *arg; +}; + +static void +tcb_fixup(int mainthread) +{ + void *newtcb, *oldtcb; + + // The OpenBSD ld.so(1) does not currently support PT_TLS. As a result, + // we need to allocate our own TLS space while preserving the existing + // TCB that has been setup via librthread. + + newtcb = malloc(TCB_SIZE + TLS_SIZE); + if(newtcb == NULL) + abort(); + + // The signal trampoline expects the TLS slots to be zeroed. + bzero(newtcb, TLS_SIZE); + + oldtcb = __get_tcb(); + bcopy(oldtcb, newtcb + TLS_SIZE, TCB_SIZE); + __set_tcb(newtcb + TLS_SIZE); + + // NOTE(jsing, minux): we can't free oldtcb without causing double-free + // problem. so newtcb will be memory leaks. Get rid of this when OpenBSD + // has proper support for PT_TLS. +} + +static void * +thread_start_wrapper(void *arg) +{ + struct thread_args args = *(struct thread_args *)arg; + + free(arg); + tcb_fixup(0); + + return args.func(args.arg); +} + +int +pthread_create(pthread_t *thread, const pthread_attr_t *attr, + void *(*start_routine)(void *), void *arg) +{ + struct thread_args *p; + + p = malloc(sizeof(*p)); + if(p == NULL) { + errno = ENOMEM; + return -1; + } + p->func = start_routine; + p->arg = arg; + + return sys_pthread_create(thread, attr, thread_start_wrapper, p); +} + +void +x_cgo_init(G *g, void (*setg)(void*)) +{ + pthread_attr_t attr; + size_t size; + void *handle; + + setg_gcc = setg; + pthread_attr_init(&attr); + pthread_attr_getstacksize(&attr, &size); + g->stacklo = (uintptr)&attr - size + 4096; + pthread_attr_destroy(&attr); + + // Locate symbol for the system pthread_create function. 
+ handle = dlopen("libpthread.so", RTLD_LAZY); + if(handle == NULL) { + fprintf(stderr, "dlopen: failed to load libpthread: %s\n", dlerror()); + abort(); + } + sys_pthread_create = dlsym(handle, "pthread_create"); + if(sys_pthread_create == NULL) { + fprintf(stderr, "dlsym: failed to find pthread_create: %s\n", dlerror()); + abort(); + } + dlclose(handle); + + tcb_fixup(1); +} + + +void +_cgo_sys_thread_start(ThreadStart *ts) +{ + pthread_attr_t attr; + sigset_t ign, oset; + pthread_t p; + size_t size; + int err; + + sigfillset(&ign); + pthread_sigmask(SIG_SETMASK, &ign, &oset); + + pthread_attr_init(&attr); + pthread_attr_getstacksize(&attr, &size); + // Leave stacklo=0 and set stackhi=size; mstack will do the rest. + ts->g->stackhi = size; + err = sys_pthread_create(&p, &attr, threadentry, ts); + + pthread_sigmask(SIG_SETMASK, &oset, nil); + + if (err != 0) { + fprintf(stderr, "runtime/cgo: pthread_create failed: %s\n", strerror(err)); + abort(); + } +} + +static void* +threadentry(void *v) +{ + ThreadStart ts; + + tcb_fixup(0); + + ts = *(ThreadStart*)v; + free(v); + + /* + * Set specific keys. + */ + setg_gcc((void*)ts.g); + + crosscall_386(ts.fn); + return nil; +} diff --git a/src/runtime/cgo/gcc_openbsd_amd64.c b/src/runtime/cgo/gcc_openbsd_amd64.c new file mode 100644 index 000000000..35b359bba --- /dev/null +++ b/src/runtime/cgo/gcc_openbsd_amd64.c @@ -0,0 +1,159 @@ +// Copyright 2009 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#include <sys/types.h> +#include <dlfcn.h> +#include <errno.h> +#include <pthread.h> +#include <signal.h> +#include <string.h> +#include "libcgo.h" + +static void* threadentry(void*); +static void (*setg_gcc)(void*); + +// TCB_SIZE is sizeof(struct thread_control_block), +// as defined in /usr/src/lib/librthread/tcb.h +#define TCB_SIZE (4 * sizeof(void *)) +#define TLS_SIZE (2 * sizeof(void *)) + +void *__get_tcb(void); +void __set_tcb(void *); + +static int (*sys_pthread_create)(pthread_t *thread, const pthread_attr_t *attr, + void *(*start_routine)(void *), void *arg); + +struct thread_args { + void *(*func)(void *); + void *arg; +}; + +static void +tcb_fixup(int mainthread) +{ + void *newtcb, *oldtcb; + + // The OpenBSD ld.so(1) does not currently support PT_TLS. As a result, + // we need to allocate our own TLS space while preserving the existing + // TCB that has been setup via librthread. + + newtcb = malloc(TCB_SIZE + TLS_SIZE); + if(newtcb == NULL) + abort(); + + // The signal trampoline expects the TLS slots to be zeroed. + bzero(newtcb, TLS_SIZE); + + oldtcb = __get_tcb(); + bcopy(oldtcb, newtcb + TLS_SIZE, TCB_SIZE); + __set_tcb(newtcb + TLS_SIZE); + + // NOTE(jsing, minux): we can't free oldtcb without causing double-free + // problem. so newtcb will be memory leaks. Get rid of this when OpenBSD + // has proper support for PT_TLS. 
+} + +static void * +thread_start_wrapper(void *arg) +{ + struct thread_args args = *(struct thread_args *)arg; + + free(arg); + tcb_fixup(0); + + return args.func(args.arg); +} + +int +pthread_create(pthread_t *thread, const pthread_attr_t *attr, + void *(*start_routine)(void *), void *arg) +{ + struct thread_args *p; + + p = malloc(sizeof(*p)); + if(p == NULL) { + errno = ENOMEM; + return -1; + } + p->func = start_routine; + p->arg = arg; + + return sys_pthread_create(thread, attr, thread_start_wrapper, p); +} + +void +x_cgo_init(G *g, void (*setg)(void*)) +{ + pthread_attr_t attr; + size_t size; + void *handle; + + setg_gcc = setg; + pthread_attr_init(&attr); + pthread_attr_getstacksize(&attr, &size); + g->stacklo = (uintptr)&attr - size + 4096; + pthread_attr_destroy(&attr); + + // Locate symbol for the system pthread_create function. + handle = dlopen("libpthread.so", RTLD_LAZY); + if(handle == NULL) { + fprintf(stderr, "dlopen: failed to load libpthread: %s\n", dlerror()); + abort(); + } + sys_pthread_create = dlsym(handle, "pthread_create"); + if(sys_pthread_create == NULL) { + fprintf(stderr, "dlsym: failed to find pthread_create: %s\n", dlerror()); + abort(); + } + dlclose(handle); + + tcb_fixup(1); +} + + +void +_cgo_sys_thread_start(ThreadStart *ts) +{ + pthread_attr_t attr; + sigset_t ign, oset; + pthread_t p; + size_t size; + int err; + + sigfillset(&ign); + pthread_sigmask(SIG_SETMASK, &ign, &oset); + + pthread_attr_init(&attr); + pthread_attr_getstacksize(&attr, &size); + + // Leave stacklo=0 and set stackhi=size; mstack will do the rest. + ts->g->stackhi = size; + err = sys_pthread_create(&p, &attr, threadentry, ts); + + pthread_sigmask(SIG_SETMASK, &oset, nil); + + if (err != 0) { + fprintf(stderr, "runtime/cgo: pthread_create failed: %s\n", strerror(err)); + abort(); + } +} + +static void* +threadentry(void *v) +{ + ThreadStart ts; + + tcb_fixup(0); + + ts = *(ThreadStart*)v; + free(v); + + /* + * Set specific keys. + */ + setg_gcc((void*)ts.g); + + crosscall_amd64(ts.fn); + return nil; +} diff --git a/src/runtime/cgo/gcc_setenv.c b/src/runtime/cgo/gcc_setenv.c new file mode 100644 index 000000000..af0fc5d8d --- /dev/null +++ b/src/runtime/cgo/gcc_setenv.c @@ -0,0 +1,23 @@ +// Copyright 2011 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// +build darwin dragonfly freebsd linux netbsd openbsd + +#include "libcgo.h" + +#include <stdlib.h> + +/* Stub for calling setenv */ +void +x_cgo_setenv(char **arg) +{ + setenv(arg[0], arg[1], 1); +} + +/* Stub for calling unsetenv */ +void +x_cgo_unsetenv(char *arg) +{ + unsetenv(arg); +} diff --git a/src/runtime/cgo/gcc_util.c b/src/runtime/cgo/gcc_util.c new file mode 100644 index 000000000..143734e94 --- /dev/null +++ b/src/runtime/cgo/gcc_util.c @@ -0,0 +1,47 @@ +// Copyright 2009 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. 
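(Editor's aside, not part of the imported sources.) gcc_setenv.c just above exists so that environment changes made from Go are mirrored into the C library's environment when cgo is linked in: the runtime calls setenv(3) and unsetenv(3) through the x_cgo_setenv and x_cgo_unsetenv hooks that setenv.c registers further down in this import. A small user-level illustration of the effect, assuming an ordinary cgo-enabled build; the variable name is made up for the example.

package main

/*
#include <stdlib.h>
*/
import "C"

import (
	"fmt"
	"os"
	"unsafe"
)

func main() {
	// With cgo in the binary, this also reaches the C library's setenv(3)
	// via the x_cgo_setenv hook, so C code sees the same environment.
	os.Setenv("CGO_DEMO_VAR", "hello")

	k := C.CString("CGO_DEMO_VAR")
	defer C.free(unsafe.Pointer(k))
	fmt.Println(C.GoString(C.getenv(k))) // prints: hello
}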
+ +#include "libcgo.h" + +/* Stub for calling malloc from Go */ +void +x_cgo_malloc(void *p) +{ + struct a { + long long n; + void *ret; + } *a = p; + + a->ret = malloc(a->n); + if(a->ret == NULL && a->n == 0) + a->ret = malloc(1); +} + +/* Stub for calling free from Go */ +void +x_cgo_free(void *p) +{ + struct a { + void *arg; + } *a = p; + + free(a->arg); +} + +/* Stub for creating a new thread */ +void +x_cgo_thread_start(ThreadStart *arg) +{ + ThreadStart *ts; + + /* Make our own copy that can persist after we return. */ + ts = malloc(sizeof *ts); + if(ts == nil) { + fprintf(stderr, "runtime/cgo: out of memory in thread_start\n"); + abort(); + } + *ts = *arg; + + _cgo_sys_thread_start(ts); /* OS-dependent half */ +} diff --git a/src/runtime/cgo/gcc_windows_386.c b/src/runtime/cgo/gcc_windows_386.c new file mode 100644 index 000000000..acd038ccd --- /dev/null +++ b/src/runtime/cgo/gcc_windows_386.c @@ -0,0 +1,61 @@ +// Copyright 2009 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#define WIN32_LEAN_AND_MEAN +#include <windows.h> +#include <process.h> +#include <stdlib.h> +#include <stdio.h> +#include "libcgo.h" + +static void threadentry(void*); + +/* 1MB is default stack size for 32-bit Windows. + Allocation granularity on Windows is typically 64 KB. + The constant is also hardcoded in cmd/ld/pe.c (keep synchronized). */ +#define STACKSIZE (1*1024*1024) + +void +x_cgo_init(G *g) +{ + int tmp; + g->stacklo = (uintptr)&tmp - STACKSIZE + 8*1024; +} + + +void +_cgo_sys_thread_start(ThreadStart *ts) +{ + uintptr_t thandle; + + thandle = _beginthread(threadentry, 0, ts); + if(thandle == -1) { + fprintf(stderr, "runtime: failed to create new OS thread (%d)\n", errno); + abort(); + } +} + +static void +threadentry(void *v) +{ + ThreadStart ts; + + ts = *(ThreadStart*)v; + free(v); + + ts.g->stackhi = (uintptr)&ts; + ts.g->stacklo = (uintptr)&ts - STACKSIZE + 8*1024; + + /* + * Set specific keys in thread local storage. + */ + asm volatile ( + "movl %0, %%fs:0x14\n" // MOVL tls0, 0x14(FS) + "movl %%fs:0x14, %%eax\n" // MOVL 0x14(FS), tmp + "movl %1, 0(%%eax)\n" // MOVL g, 0(FS) + :: "r"(ts.tls), "r"(ts.g) : "%eax" + ); + + crosscall_386(ts.fn); +} diff --git a/src/runtime/cgo/gcc_windows_amd64.c b/src/runtime/cgo/gcc_windows_amd64.c new file mode 100644 index 000000000..ce7e06b3d --- /dev/null +++ b/src/runtime/cgo/gcc_windows_amd64.c @@ -0,0 +1,61 @@ +// Copyright 2009 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#define WIN64_LEAN_AND_MEAN +#include <windows.h> +#include <process.h> +#include <stdlib.h> +#include <stdio.h> +#include "libcgo.h" + +static void threadentry(void*); + +/* 2MB is default stack size for 64-bit Windows. + Allocation granularity on Windows is typically 64 KB. + The constant is also hardcoded in cmd/ld/pe.c (keep synchronized). 
*/ +#define STACKSIZE (2*1024*1024) + +void +x_cgo_init(G *g) +{ + int tmp; + g->stacklo = (uintptr)&tmp - STACKSIZE + 8*1024; +} + + +void +_cgo_sys_thread_start(ThreadStart *ts) +{ + uintptr_t thandle; + + thandle = _beginthread(threadentry, 0, ts); + if(thandle == -1) { + fprintf(stderr, "runtime: failed to create new OS thread (%d)\n", errno); + abort(); + } +} + +static void +threadentry(void *v) +{ + ThreadStart ts; + + ts = *(ThreadStart*)v; + free(v); + + ts.g->stackhi = (uintptr)&ts; + ts.g->stacklo = (uintptr)&ts - STACKSIZE + 8*1024; + + /* + * Set specific keys in thread local storage. + */ + asm volatile ( + "movq %0, %%gs:0x28\n" // MOVL tls0, 0x28(GS) + "movq %%gs:0x28, %%rax\n" // MOVQ 0x28(GS), tmp + "movq %1, 0(%%rax)\n" // MOVQ g, 0(GS) + :: "r"(ts.tls), "r"(ts.g) : "%rax" + ); + + crosscall_amd64(ts.fn); +} diff --git a/src/runtime/cgo/iscgo.c b/src/runtime/cgo/iscgo.c new file mode 100644 index 000000000..0907a1958 --- /dev/null +++ b/src/runtime/cgo/iscgo.c @@ -0,0 +1,15 @@ +// Copyright 2010 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// The runtime package contains an uninitialized definition +// for runtime·iscgo. Override it to tell the runtime we're here. +// There are various function pointers that should be set too, +// but those depend on dynamic linker magic to get initialized +// correctly, and sometimes they break. This variable is a +// backup: it depends only on old C style static linking rules. + +#include "../runtime.h" + +bool runtime·iscgo = 1; +uint32 runtime·needextram = 1; // create an extra M on first cgo call diff --git a/src/runtime/cgo/libcgo.h b/src/runtime/cgo/libcgo.h new file mode 100644 index 000000000..9d918fd7a --- /dev/null +++ b/src/runtime/cgo/libcgo.h @@ -0,0 +1,65 @@ +// Copyright 2009 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#include <stdint.h> +#include <stdlib.h> +#include <stdio.h> + +#define nil ((void*)0) +#define nelem(x) (sizeof(x)/sizeof((x)[0])) + +typedef uint32_t uint32; +typedef uint64_t uint64; +typedef uintptr_t uintptr; + +/* + * The beginning of the per-goroutine structure, + * as defined in ../pkg/runtime/runtime.h. + * Just enough to edit these two fields. + */ +typedef struct G G; +struct G +{ + uintptr stacklo; + uintptr stackhi; +}; + +/* + * Arguments to the _cgo_thread_start call. + * Also known to ../pkg/runtime/runtime.h. + */ +typedef struct ThreadStart ThreadStart; +struct ThreadStart +{ + G *g; + uintptr *tls; + void (*fn)(void); +}; + +/* + * Called by 5c/6c/8c world. + * Makes a local copy of the ThreadStart and + * calls _cgo_sys_thread_start(ts). + */ +extern void (*_cgo_thread_start)(ThreadStart *ts); + +/* + * Creates the new operating system thread (OS, arch dependent). + */ +void _cgo_sys_thread_start(ThreadStart *ts); + +/* + * Call fn in the 6c world. + */ +void crosscall_amd64(void (*fn)(void)); + +/* + * Call fn in the 8c world. + */ +void crosscall_386(void (*fn)(void)); + +/* + * Prints error then calls abort. For linux and android. + */ +void fatalf(const char* format, ...); diff --git a/src/runtime/cgo/netbsd.c b/src/runtime/cgo/netbsd.c new file mode 100644 index 000000000..076cc87f1 --- /dev/null +++ b/src/runtime/cgo/netbsd.c @@ -0,0 +1,19 @@ +// Copyright 2010 The Go Authors. All rights reserved. 
+// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// +build netbsd + +#include "textflag.h" + +// Supply environ and __progname, because we don't +// link against the standard NetBSD crt0.o and the +// libc dynamic library needs them. + +#pragma dataflag NOPTR +char *environ[1]; +#pragma dataflag NOPTR +char *__progname; + +#pragma dynexport environ environ +#pragma dynexport __progname __progname diff --git a/src/runtime/cgo/openbsd.c b/src/runtime/cgo/openbsd.c new file mode 100644 index 000000000..476649544 --- /dev/null +++ b/src/runtime/cgo/openbsd.c @@ -0,0 +1,27 @@ +// Copyright 2010 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// +build openbsd + +#include "textflag.h" + +// Supply environ, __progname and __guard_local, because +// we don't link against the standard OpenBSD crt0.o and +// the libc dynamic library needs them. + +#pragma dataflag NOPTR +char *environ[1]; +#pragma dataflag NOPTR +char *__progname; +long __guard_local; + +#pragma dynexport environ environ +#pragma dynexport __progname __progname + +// This is normally marked as hidden and placed in the +// .openbsd.randomdata section. +#pragma dynexport __guard_local __guard_local + +// We override pthread_create to support PT_TLS. +#pragma dynexport pthread_create pthread_create diff --git a/src/runtime/cgo/setenv.c b/src/runtime/cgo/setenv.c new file mode 100644 index 000000000..76d88cbf1 --- /dev/null +++ b/src/runtime/cgo/setenv.c @@ -0,0 +1,13 @@ +// Copyright 2011 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// +build darwin dragonfly freebsd linux netbsd openbsd + +#pragma cgo_import_static x_cgo_setenv +#pragma cgo_import_static x_cgo_unsetenv + +void x_cgo_setenv(char**); +void (*runtime·_cgo_setenv)(char**) = x_cgo_setenv; +void x_cgo_unsetenv(char**); +void (*runtime·_cgo_unsetenv)(char**) = x_cgo_unsetenv; diff --git a/src/runtime/cgocall.go b/src/runtime/cgocall.go new file mode 100644 index 000000000..7fd91469e --- /dev/null +++ b/src/runtime/cgocall.go @@ -0,0 +1,279 @@ +// Copyright 2009 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// Cgo call and callback support. +// +// To call into the C function f from Go, the cgo-generated code calls +// runtime.cgocall(_cgo_Cfunc_f, frame), where _cgo_Cfunc_f is a +// gcc-compiled function written by cgo. +// +// runtime.cgocall (below) locks g to m, calls entersyscall +// so as not to block other goroutines or the garbage collector, +// and then calls runtime.asmcgocall(_cgo_Cfunc_f, frame). +// +// runtime.asmcgocall (in asm_$GOARCH.s) switches to the m->g0 stack +// (assumed to be an operating system-allocated stack, so safe to run +// gcc-compiled code on) and calls _cgo_Cfunc_f(frame). +// +// _cgo_Cfunc_f invokes the actual C function f with arguments +// taken from the frame structure, records the results in the frame, +// and returns to runtime.asmcgocall. +// +// After it regains control, runtime.asmcgocall switches back to the +// original g (m->curg)'s stack and returns to runtime.cgocall. +// +// After it regains control, runtime.cgocall calls exitsyscall, which blocks +// until this m can run Go code without violating the $GOMAXPROCS limit, +// and then unlocks g from m. 
+// +// The above description skipped over the possibility of the gcc-compiled +// function f calling back into Go. If that happens, we continue down +// the rabbit hole during the execution of f. +// +// To make it possible for gcc-compiled C code to call a Go function p.GoF, +// cgo writes a gcc-compiled function named GoF (not p.GoF, since gcc doesn't +// know about packages). The gcc-compiled C function f calls GoF. +// +// GoF calls crosscall2(_cgoexp_GoF, frame, framesize). Crosscall2 +// (in cgo/gcc_$GOARCH.S, a gcc-compiled assembly file) is a two-argument +// adapter from the gcc function call ABI to the 6c function call ABI. +// It is called from gcc to call 6c functions. In this case it calls +// _cgoexp_GoF(frame, framesize), still running on m->g0's stack +// and outside the $GOMAXPROCS limit. Thus, this code cannot yet +// call arbitrary Go code directly and must be careful not to allocate +// memory or use up m->g0's stack. +// +// _cgoexp_GoF calls runtime.cgocallback(p.GoF, frame, framesize). +// (The reason for having _cgoexp_GoF instead of writing a crosscall3 +// to make this call directly is that _cgoexp_GoF, because it is compiled +// with 6c instead of gcc, can refer to dotted names like +// runtime.cgocallback and p.GoF.) +// +// runtime.cgocallback (in asm_$GOARCH.s) switches from m->g0's +// stack to the original g (m->curg)'s stack, on which it calls +// runtime.cgocallbackg(p.GoF, frame, framesize). +// As part of the stack switch, runtime.cgocallback saves the current +// SP as m->g0->sched.sp, so that any use of m->g0's stack during the +// execution of the callback will be done below the existing stack frames. +// Before overwriting m->g0->sched.sp, it pushes the old value on the +// m->g0 stack, so that it can be restored later. +// +// runtime.cgocallbackg (below) is now running on a real goroutine +// stack (not an m->g0 stack). First it calls runtime.exitsyscall, which will +// block until the $GOMAXPROCS limit allows running this goroutine. +// Once exitsyscall has returned, it is safe to do things like call the memory +// allocator or invoke the Go callback function p.GoF. runtime.cgocallbackg +// first defers a function to unwind m->g0.sched.sp, so that if p.GoF +// panics, m->g0.sched.sp will be restored to its old value: the m->g0 stack +// and the m->curg stack will be unwound in lock step. +// Then it calls p.GoF. Finally it pops but does not execute the deferred +// function, calls runtime.entersyscall, and returns to runtime.cgocallback. +// +// After it regains control, runtime.cgocallback switches back to +// m->g0's stack (the pointer is still in m->g0.sched.sp), restores the old +// m->g0.sched.sp value from the stack, and returns to _cgoexp_GoF. +// +// _cgoexp_GoF immediately returns to crosscall2, which restores the +// callee-save registers for gcc and returns to GoF, which returns to f. + +package runtime + +import "unsafe" + +// Call from Go to C. +//go:nosplit +func cgocall(fn, arg unsafe.Pointer) { + cgocall_errno(fn, arg) +} + +//go:nosplit +func cgocall_errno(fn, arg unsafe.Pointer) int32 { + if !iscgo && GOOS != "solaris" && GOOS != "windows" { + gothrow("cgocall unavailable") + } + + if fn == nil { + gothrow("cgocall nil") + } + + if raceenabled { + racereleasemerge(unsafe.Pointer(&racecgosync)) + } + + // Create an extra M for callbacks on threads not created by Go on first cgo call. 
+ if needextram == 1 && cas(&needextram, 1, 0) { + onM(newextram) + } + + /* + * Lock g to m to ensure we stay on the same stack if we do a + * cgo callback. Add entry to defer stack in case of panic. + */ + lockOSThread() + mp := getg().m + mp.ncgocall++ + mp.ncgo++ + defer endcgo(mp) + + /* + * Announce we are entering a system call + * so that the scheduler knows to create another + * M to run goroutines while we are in the + * foreign code. + * + * The call to asmcgocall is guaranteed not to + * split the stack and does not allocate memory, + * so it is safe to call while "in a system call", outside + * the $GOMAXPROCS accounting. + */ + entersyscall() + errno := asmcgocall_errno(fn, arg) + exitsyscall() + + return errno +} + +//go:nosplit +func endcgo(mp *m) { + mp.ncgo-- + if mp.ncgo == 0 { + // We are going back to Go and are not in a recursive + // call. Let the GC collect any memory allocated via + // _cgo_allocate that is no longer referenced. + mp.cgomal = nil + } + + if raceenabled { + raceacquire(unsafe.Pointer(&racecgosync)) + } + + unlockOSThread() // invalidates mp +} + +// Helper functions for cgo code. + +// Filled by schedinit from corresponding C variables, +// which are in turn filled in by dynamic linker when Cgo is available. +var cgoMalloc, cgoFree unsafe.Pointer + +func cmalloc(n uintptr) unsafe.Pointer { + var args struct { + n uint64 + ret unsafe.Pointer + } + args.n = uint64(n) + cgocall(cgoMalloc, unsafe.Pointer(&args)) + if args.ret == nil { + gothrow("C malloc failed") + } + return args.ret +} + +func cfree(p unsafe.Pointer) { + cgocall(cgoFree, p) +} + +// Call from C back to Go. +//go:nosplit +func cgocallbackg() { + gp := getg() + if gp != gp.m.curg { + println("runtime: bad g in cgocallback") + exit(2) + } + + // entersyscall saves the caller's SP to allow the GC to trace the Go + // stack. However, since we're returning to an earlier stack frame and + // need to pair with the entersyscall() call made by cgocall, we must + // save syscall* and let reentersyscall restore them. + savedsp := unsafe.Pointer(gp.syscallsp) + savedpc := gp.syscallpc + exitsyscall() // coming out of cgo call + cgocallbackg1() + // going back to cgo call + reentersyscall(savedpc, savedsp) +} + +func cgocallbackg1() { + gp := getg() + if gp.m.needextram { + gp.m.needextram = false + onM(newextram) + } + + // Add entry to defer stack in case of panic. + restore := true + defer unwindm(&restore) + + if raceenabled { + raceacquire(unsafe.Pointer(&racecgosync)) + } + + type args struct { + fn *funcval + arg unsafe.Pointer + argsize uintptr + } + var cb *args + + // Location of callback arguments depends on stack frame layout + // and size of stack frame of cgocallback_gofunc. + sp := gp.m.g0.sched.sp + switch GOARCH { + default: + gothrow("cgocallbackg is unimplemented on arch") + case "arm": + // On arm, stack frame is two words and there's a saved LR between + // SP and the stack frame and between the stack frame and the arguments. + cb = (*args)(unsafe.Pointer(sp + 4*ptrSize)) + case "amd64": + // On amd64, stack frame is one word, plus caller PC. + cb = (*args)(unsafe.Pointer(sp + 2*ptrSize)) + case "386": + // On 386, stack frame is three words, plus caller PC. + cb = (*args)(unsafe.Pointer(sp + 4*ptrSize)) + } + + // Invoke callback. + reflectcall(unsafe.Pointer(cb.fn), unsafe.Pointer(cb.arg), uint32(cb.argsize), 0) + + if raceenabled { + racereleasemerge(unsafe.Pointer(&racecgosync)) + } + + // Do not unwind m->g0->sched.sp. + // Our caller, cgocallback, will do that. 
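+	// Clearing restore turns the deferred unwindm call into a no-op.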
+ restore = false +} + +func unwindm(restore *bool) { + if !*restore { + return + } + // Restore sp saved by cgocallback during + // unwind of g's stack (see comment at top of file). + mp := acquirem() + sched := &mp.g0.sched + switch GOARCH { + default: + gothrow("unwindm not implemented") + case "386", "amd64": + sched.sp = *(*uintptr)(unsafe.Pointer(sched.sp)) + case "arm": + sched.sp = *(*uintptr)(unsafe.Pointer(sched.sp + 4)) + } + releasem(mp) +} + +// called from assembly +func badcgocallback() { + gothrow("misaligned stack in cgocallback") +} + +// called from (incomplete) assembly +func cgounimpl() { + gothrow("cgo not implemented") +} + +var racecgosync uint64 // represents possible synchronization in C code diff --git a/src/runtime/cgocall.h b/src/runtime/cgocall.h new file mode 100644 index 000000000..c87a9cdc5 --- /dev/null +++ b/src/runtime/cgocall.h @@ -0,0 +1,13 @@ +// Copyright 2009 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +/* + * Cgo interface. + */ + +void runtime·cgocall(void (*fn)(void*), void*); +int32 runtime·cgocall_errno(void (*fn)(void*), void*); +void runtime·cgocallback(void (*fn)(void), void*, uintptr); +void *runtime·cmalloc(uintptr); +void runtime·cfree(void*); diff --git a/src/runtime/cgocallback.go b/src/runtime/cgocallback.go new file mode 100644 index 000000000..2c8914320 --- /dev/null +++ b/src/runtime/cgocallback.go @@ -0,0 +1,40 @@ +// Copyright 2011 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package runtime + +import "unsafe" + +// These functions are called from C code via cgo/callbacks.c. + +// Allocate memory. This allocates the requested number of bytes in +// memory controlled by the Go runtime. The allocated memory will be +// zeroed. You are responsible for ensuring that the Go garbage +// collector can see a pointer to the allocated memory for as long as +// it is valid, e.g., by storing a pointer in a local variable in your +// C function, or in memory allocated by the Go runtime. If the only +// pointers are in a C global variable or in memory allocated via +// malloc, then the Go garbage collector may collect the memory. +// +// TODO(rsc,iant): This memory is untyped. +// Either we need to add types or we need to stop using it. + +func _cgo_allocate_internal(len uintptr) unsafe.Pointer { + if len == 0 { + len = 1 + } + ret := unsafe.Pointer(&make([]unsafe.Pointer, (len+ptrSize-1)/ptrSize)[0]) + c := new(cgomal) + c.alloc = ret + gp := getg() + c.next = gp.m.cgomal + gp.m.cgomal = c + return ret +} + +// Panic. + +func _cgo_panic_internal(p *byte) { + panic(gostringnocopy(p)) +} diff --git a/src/runtime/chan.go b/src/runtime/chan.go new file mode 100644 index 000000000..0eb87df74 --- /dev/null +++ b/src/runtime/chan.go @@ -0,0 +1,655 @@ +// Copyright 2014 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package runtime + +// This file contains the implementation of Go channels. + +import "unsafe" + +const ( + maxAlign = 8 + hchanSize = unsafe.Sizeof(hchan{}) + uintptr(-int(unsafe.Sizeof(hchan{}))&(maxAlign-1)) + debugChan = false +) + +// TODO(khr): make hchan.buf an unsafe.Pointer, not a *uint8 + +func makechan(t *chantype, size int64) *hchan { + elem := t.elem + + // compiler checks this but be safe. 
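+	// (The element size is stored in hchan's uint16 elemsize field,
+	// hence the 1<<16 limit checked below.)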
+ if elem.size >= 1<<16 { + gothrow("makechan: invalid channel element type") + } + if hchanSize%maxAlign != 0 || elem.align > maxAlign { + gothrow("makechan: bad alignment") + } + if size < 0 || int64(uintptr(size)) != size || (elem.size > 0 && uintptr(size) > (maxmem-hchanSize)/uintptr(elem.size)) { + panic("makechan: size out of range") + } + + var c *hchan + if elem.kind&kindNoPointers != 0 || size == 0 { + // Allocate memory in one call. + // Hchan does not contain pointers interesting for GC in this case: + // buf points into the same allocation, elemtype is persistent. + // SudoG's are referenced from their owning thread so they can't be collected. + // TODO(dvyukov,rlh): Rethink when collector can move allocated objects. + c = (*hchan)(mallocgc(hchanSize+uintptr(size)*uintptr(elem.size), nil, flagNoScan)) + if size > 0 && elem.size != 0 { + c.buf = (*uint8)(add(unsafe.Pointer(c), hchanSize)) + } else { + c.buf = (*uint8)(unsafe.Pointer(c)) // race detector uses this location for synchronization + } + } else { + c = new(hchan) + c.buf = (*uint8)(newarray(elem, uintptr(size))) + } + c.elemsize = uint16(elem.size) + c.elemtype = elem + c.dataqsiz = uint(size) + + if debugChan { + print("makechan: chan=", c, "; elemsize=", elem.size, "; elemalg=", elem.alg, "; dataqsiz=", size, "\n") + } + return c +} + +// chanbuf(c, i) is pointer to the i'th slot in the buffer. +func chanbuf(c *hchan, i uint) unsafe.Pointer { + return add(unsafe.Pointer(c.buf), uintptr(i)*uintptr(c.elemsize)) +} + +// entry point for c <- x from compiled code +//go:nosplit +func chansend1(t *chantype, c *hchan, elem unsafe.Pointer) { + chansend(t, c, elem, true, getcallerpc(unsafe.Pointer(&t))) +} + +/* + * generic single channel send/recv + * If block is not nil, + * then the protocol will not + * sleep but return if it could + * not complete. + * + * sleep can wake up with g.param == nil + * when a channel involved in the sleep has + * been closed. it is easiest to loop and re-run + * the operation; we'll see that it's now closed. + */ +func chansend(t *chantype, c *hchan, ep unsafe.Pointer, block bool, callerpc uintptr) bool { + if raceenabled { + raceReadObjectPC(t.elem, ep, callerpc, funcPC(chansend)) + } + + if c == nil { + if !block { + return false + } + gopark(nil, nil, "chan send (nil chan)") + gothrow("unreachable") + } + + if debugChan { + print("chansend: chan=", c, "\n") + } + + if raceenabled { + racereadpc(unsafe.Pointer(c), callerpc, funcPC(chansend)) + } + + // Fast path: check for failed non-blocking operation without acquiring the lock. + // + // After observing that the channel is not closed, we observe that the channel is + // not ready for sending. Each of these observations is a single word-sized read + // (first c.closed and second c.recvq.first or c.qcount depending on kind of channel). + // Because a closed channel cannot transition from 'ready for sending' to + // 'not ready for sending', even if the channel is closed between the two observations, + // they imply a moment between the two when the channel was both not yet closed + // and not ready for sending. We behave as if we observed the channel at that moment, + // and report that the send cannot proceed. + // + // It is okay if the reads are reordered here: if we observe that the channel is not + // ready for sending and then observe that it is not closed, that implies that the + // channel wasn't closed during the first observation. 
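+	//
+	// For example, a compiler-lowered
+	//
+	//	select {
+	//	case c <- v:
+	//	default:
+	//	}
+	//
+	// reaches this point through selectnbsend with block == false and takes
+	// the early return below when the buffer is full, or when the channel is
+	// unbuffered and no receiver is waiting.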
+ if !block && c.closed == 0 && ((c.dataqsiz == 0 && c.recvq.first == nil) || + (c.dataqsiz > 0 && c.qcount == c.dataqsiz)) { + return false + } + + var t0 int64 + if blockprofilerate > 0 { + t0 = cputicks() + } + + lock(&c.lock) + if c.closed != 0 { + unlock(&c.lock) + panic("send on closed channel") + } + + if c.dataqsiz == 0 { // synchronous channel + sg := c.recvq.dequeue() + if sg != nil { // found a waiting receiver + if raceenabled { + racesync(c, sg) + } + unlock(&c.lock) + + recvg := sg.g + if sg.elem != nil { + memmove(unsafe.Pointer(sg.elem), ep, uintptr(c.elemsize)) + sg.elem = nil + } + recvg.param = unsafe.Pointer(sg) + if sg.releasetime != 0 { + sg.releasetime = cputicks() + } + goready(recvg) + return true + } + + if !block { + unlock(&c.lock) + return false + } + + // no receiver available: block on this channel. + gp := getg() + mysg := acquireSudog() + mysg.releasetime = 0 + if t0 != 0 { + mysg.releasetime = -1 + } + mysg.elem = ep + mysg.waitlink = nil + gp.waiting = mysg + mysg.g = gp + mysg.selectdone = nil + gp.param = nil + c.sendq.enqueue(mysg) + goparkunlock(&c.lock, "chan send") + + // someone woke us up. + if mysg != gp.waiting { + gothrow("G waiting list is corrupted!") + } + gp.waiting = nil + if gp.param == nil { + if c.closed == 0 { + gothrow("chansend: spurious wakeup") + } + panic("send on closed channel") + } + gp.param = nil + if mysg.releasetime > 0 { + blockevent(int64(mysg.releasetime)-t0, 2) + } + releaseSudog(mysg) + return true + } + + // asynchronous channel + // wait for some space to write our data + var t1 int64 + for c.qcount >= c.dataqsiz { + if !block { + unlock(&c.lock) + return false + } + gp := getg() + mysg := acquireSudog() + mysg.releasetime = 0 + if t0 != 0 { + mysg.releasetime = -1 + } + mysg.g = gp + mysg.elem = nil + mysg.selectdone = nil + c.sendq.enqueue(mysg) + goparkunlock(&c.lock, "chan send") + + // someone woke us up - try again + if mysg.releasetime > 0 { + t1 = mysg.releasetime + } + releaseSudog(mysg) + lock(&c.lock) + if c.closed != 0 { + unlock(&c.lock) + panic("send on closed channel") + } + } + + // write our data into the channel buffer + if raceenabled { + raceacquire(chanbuf(c, c.sendx)) + racerelease(chanbuf(c, c.sendx)) + } + memmove(chanbuf(c, c.sendx), ep, uintptr(c.elemsize)) + c.sendx++ + if c.sendx == c.dataqsiz { + c.sendx = 0 + } + c.qcount++ + + // wake up a waiting receiver + sg := c.recvq.dequeue() + if sg != nil { + recvg := sg.g + unlock(&c.lock) + if sg.releasetime != 0 { + sg.releasetime = cputicks() + } + goready(recvg) + } else { + unlock(&c.lock) + } + if t1 > 0 { + blockevent(t1-t0, 2) + } + return true +} + +func closechan(c *hchan) { + if c == nil { + panic("close of nil channel") + } + + lock(&c.lock) + if c.closed != 0 { + unlock(&c.lock) + panic("close of closed channel") + } + + if raceenabled { + callerpc := getcallerpc(unsafe.Pointer(&c)) + racewritepc(unsafe.Pointer(c), callerpc, funcPC(closechan)) + racerelease(unsafe.Pointer(c)) + } + + c.closed = 1 + + // release all readers + for { + sg := c.recvq.dequeue() + if sg == nil { + break + } + gp := sg.g + sg.elem = nil + gp.param = nil + if sg.releasetime != 0 { + sg.releasetime = cputicks() + } + goready(gp) + } + + // release all writers + for { + sg := c.sendq.dequeue() + if sg == nil { + break + } + gp := sg.g + sg.elem = nil + gp.param = nil + if sg.releasetime != 0 { + sg.releasetime = cputicks() + } + goready(gp) + } + unlock(&c.lock) +} + +// entry points for <- c from compiled code +//go:nosplit +func chanrecv1(t *chantype, c 
*hchan, elem unsafe.Pointer) { + chanrecv(t, c, elem, true) +} + +//go:nosplit +func chanrecv2(t *chantype, c *hchan, elem unsafe.Pointer) (received bool) { + _, received = chanrecv(t, c, elem, true) + return +} + +// chanrecv receives on channel c and writes the received data to ep. +// ep may be nil, in which case received data is ignored. +// If block == false and no elements are available, returns (false, false). +// Otherwise, if c is closed, zeros *ep and returns (true, false). +// Otherwise, fills in *ep with an element and returns (true, true). +func chanrecv(t *chantype, c *hchan, ep unsafe.Pointer, block bool) (selected, received bool) { + // raceenabled: don't need to check ep, as it is always on the stack. + + if debugChan { + print("chanrecv: chan=", c, "\n") + } + + if c == nil { + if !block { + return + } + gopark(nil, nil, "chan receive (nil chan)") + gothrow("unreachable") + } + + // Fast path: check for failed non-blocking operation without acquiring the lock. + // + // After observing that the channel is not ready for receiving, we observe that the + // channel is not closed. Each of these observations is a single word-sized read + // (first c.sendq.first or c.qcount, and second c.closed). + // Because a channel cannot be reopened, the later observation of the channel + // being not closed implies that it was also not closed at the moment of the + // first observation. We behave as if we observed the channel at that moment + // and report that the receive cannot proceed. + // + // The order of operations is important here: reversing the operations can lead to + // incorrect behavior when racing with a close. + if !block && (c.dataqsiz == 0 && c.sendq.first == nil || + c.dataqsiz > 0 && atomicloaduint(&c.qcount) == 0) && + atomicload(&c.closed) == 0 { + return + } + + var t0 int64 + if blockprofilerate > 0 { + t0 = cputicks() + } + + lock(&c.lock) + if c.dataqsiz == 0 { // synchronous channel + if c.closed != 0 { + return recvclosed(c, ep) + } + + sg := c.sendq.dequeue() + if sg != nil { + if raceenabled { + racesync(c, sg) + } + unlock(&c.lock) + + if ep != nil { + memmove(ep, sg.elem, uintptr(c.elemsize)) + } + sg.elem = nil + gp := sg.g + gp.param = unsafe.Pointer(sg) + if sg.releasetime != 0 { + sg.releasetime = cputicks() + } + goready(gp) + selected = true + received = true + return + } + + if !block { + unlock(&c.lock) + return + } + + // no sender available: block on this channel. + gp := getg() + mysg := acquireSudog() + mysg.releasetime = 0 + if t0 != 0 { + mysg.releasetime = -1 + } + mysg.elem = ep + mysg.waitlink = nil + gp.waiting = mysg + mysg.g = gp + mysg.selectdone = nil + gp.param = nil + c.recvq.enqueue(mysg) + goparkunlock(&c.lock, "chan receive") + + // someone woke us up + if mysg != gp.waiting { + gothrow("G waiting list is corrupted!") + } + gp.waiting = nil + if mysg.releasetime > 0 { + blockevent(mysg.releasetime-t0, 2) + } + haveData := gp.param != nil + gp.param = nil + releaseSudog(mysg) + + if haveData { + // a sender sent us some data. It already wrote to ep. 
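+		// (The copy already happened in chansend, which memmoves the value
+		// directly into the sudog's elem pointer, i.e. into ep, before waking us.)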
+ selected = true + received = true + return + } + + lock(&c.lock) + if c.closed == 0 { + gothrow("chanrecv: spurious wakeup") + } + return recvclosed(c, ep) + } + + // asynchronous channel + // wait for some data to appear + var t1 int64 + for c.qcount <= 0 { + if c.closed != 0 { + selected, received = recvclosed(c, ep) + if t1 > 0 { + blockevent(t1-t0, 2) + } + return + } + + if !block { + unlock(&c.lock) + return + } + + // wait for someone to send an element + gp := getg() + mysg := acquireSudog() + mysg.releasetime = 0 + if t0 != 0 { + mysg.releasetime = -1 + } + mysg.elem = nil + mysg.g = gp + mysg.selectdone = nil + + c.recvq.enqueue(mysg) + goparkunlock(&c.lock, "chan receive") + + // someone woke us up - try again + if mysg.releasetime > 0 { + t1 = mysg.releasetime + } + releaseSudog(mysg) + lock(&c.lock) + } + + if raceenabled { + raceacquire(chanbuf(c, c.recvx)) + racerelease(chanbuf(c, c.recvx)) + } + if ep != nil { + memmove(ep, chanbuf(c, c.recvx), uintptr(c.elemsize)) + } + memclr(chanbuf(c, c.recvx), uintptr(c.elemsize)) + + c.recvx++ + if c.recvx == c.dataqsiz { + c.recvx = 0 + } + c.qcount-- + + // ping a sender now that there is space + sg := c.sendq.dequeue() + if sg != nil { + gp := sg.g + unlock(&c.lock) + if sg.releasetime != 0 { + sg.releasetime = cputicks() + } + goready(gp) + } else { + unlock(&c.lock) + } + + if t1 > 0 { + blockevent(t1-t0, 2) + } + selected = true + received = true + return +} + +// recvclosed is a helper function for chanrecv. Handles cleanup +// when the receiver encounters a closed channel. +// Caller must hold c.lock, recvclosed will release the lock. +func recvclosed(c *hchan, ep unsafe.Pointer) (selected, recevied bool) { + if raceenabled { + raceacquire(unsafe.Pointer(c)) + } + unlock(&c.lock) + if ep != nil { + memclr(ep, uintptr(c.elemsize)) + } + return true, false +} + +// compiler implements +// +// select { +// case c <- v: +// ... foo +// default: +// ... bar +// } +// +// as +// +// if selectnbsend(c, v) { +// ... foo +// } else { +// ... bar +// } +// +func selectnbsend(t *chantype, c *hchan, elem unsafe.Pointer) (selected bool) { + return chansend(t, c, elem, false, getcallerpc(unsafe.Pointer(&t))) +} + +// compiler implements +// +// select { +// case v = <-c: +// ... foo +// default: +// ... bar +// } +// +// as +// +// if selectnbrecv(&v, c) { +// ... foo +// } else { +// ... bar +// } +// +func selectnbrecv(t *chantype, elem unsafe.Pointer, c *hchan) (selected bool) { + selected, _ = chanrecv(t, c, elem, false) + return +} + +// compiler implements +// +// select { +// case v, ok = <-c: +// ... foo +// default: +// ... bar +// } +// +// as +// +// if c != nil && selectnbrecv2(&v, &ok, c) { +// ... foo +// } else { +// ... bar +// } +// +func selectnbrecv2(t *chantype, elem unsafe.Pointer, received *bool, c *hchan) (selected bool) { + // TODO(khr): just return 2 values from this function, now that it is in Go. 
+ selected, *received = chanrecv(t, c, elem, false) + return +} + +func reflect_chansend(t *chantype, c *hchan, elem unsafe.Pointer, nb bool) (selected bool) { + return chansend(t, c, elem, !nb, getcallerpc(unsafe.Pointer(&t))) +} + +func reflect_chanrecv(t *chantype, c *hchan, nb bool, elem unsafe.Pointer) (selected bool, received bool) { + return chanrecv(t, c, elem, !nb) +} + +func reflect_chanlen(c *hchan) int { + if c == nil { + return 0 + } + return int(c.qcount) +} + +func reflect_chancap(c *hchan) int { + if c == nil { + return 0 + } + return int(c.dataqsiz) +} + +func (q *waitq) enqueue(sgp *sudog) { + sgp.next = nil + if q.first == nil { + q.first = sgp + q.last = sgp + return + } + q.last.next = sgp + q.last = sgp +} + +func (q *waitq) dequeue() *sudog { + for { + sgp := q.first + if sgp == nil { + return nil + } + q.first = sgp.next + sgp.next = nil + if q.last == sgp { + q.last = nil + } + + // if sgp participates in a select and is already signaled, ignore it + if sgp.selectdone != nil { + // claim the right to signal + if *sgp.selectdone != 0 || !cas(sgp.selectdone, 0, 1) { + continue + } + } + + return sgp + } +} + +func racesync(c *hchan, sg *sudog) { + racerelease(chanbuf(c, 0)) + raceacquireg(sg.g, chanbuf(c, 0)) + racereleaseg(sg.g, chanbuf(c, 0)) + raceacquire(chanbuf(c, 0)) +} diff --git a/src/runtime/chan.h b/src/runtime/chan.h new file mode 100644 index 000000000..c34ff1533 --- /dev/null +++ b/src/runtime/chan.h @@ -0,0 +1,68 @@ +// Copyright 2009 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#define MAXALIGN 8 + +typedef struct WaitQ WaitQ; +typedef struct Select Select; +typedef struct Scase Scase; + +struct WaitQ +{ + SudoG* first; + SudoG* last; +}; + +struct Hchan +{ + uintgo qcount; // total data in the q + uintgo dataqsiz; // size of the circular q + byte* buf; + uint16 elemsize; + uint32 closed; + Type* elemtype; // element type + uintgo sendx; // send index + uintgo recvx; // receive index + WaitQ recvq; // list of recv waiters + WaitQ sendq; // list of send waiters + Mutex lock; +}; + +// Buffer follows Hchan immediately in memory. +// chanbuf(c, i) is pointer to the i'th slot in the buffer. +#define chanbuf(c, i) ((byte*)((c)->buf)+(uintptr)(c)->elemsize*(i)) + +enum +{ + debug = 0, + + // Scase.kind + CaseRecv, + CaseSend, + CaseDefault, +}; + +// Known to compiler. +// Changes here must also be made in src/cmd/gc/select.c's selecttype. +struct Scase +{ + void* elem; // data element + Hchan* chan; // chan + uintptr pc; // return pc + uint16 kind; + uint16 so; // vararg of selected bool + bool* receivedp; // pointer to received bool (recv2) + int64 releasetime; +}; + +// Known to compiler. +// Changes here must also be made in src/cmd/gc/select.c's selecttype. +struct Select +{ + uint16 tcase; // total count of scase[] + uint16 ncase; // currently filled scase[] + uint16* pollorder; // case poll order + Hchan** lockorder; // channel lock order + Scase scase[1]; // one per case (in order of appearance) +}; diff --git a/src/runtime/chan_test.go b/src/runtime/chan_test.go new file mode 100644 index 000000000..e689ceaed --- /dev/null +++ b/src/runtime/chan_test.go @@ -0,0 +1,820 @@ +// Copyright 2009 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. 
+ +package runtime_test + +import ( + "runtime" + "sync" + "sync/atomic" + "testing" + "time" +) + +func TestChan(t *testing.T) { + defer runtime.GOMAXPROCS(runtime.GOMAXPROCS(4)) + N := 200 + if testing.Short() { + N = 20 + } + for chanCap := 0; chanCap < N; chanCap++ { + { + // Ensure that receive from empty chan blocks. + c := make(chan int, chanCap) + recv1 := false + go func() { + _ = <-c + recv1 = true + }() + recv2 := false + go func() { + _, _ = <-c + recv2 = true + }() + time.Sleep(time.Millisecond) + if recv1 || recv2 { + t.Fatalf("chan[%d]: receive from empty chan", chanCap) + } + // Ensure that non-blocking receive does not block. + select { + case _ = <-c: + t.Fatalf("chan[%d]: receive from empty chan", chanCap) + default: + } + select { + case _, _ = <-c: + t.Fatalf("chan[%d]: receive from empty chan", chanCap) + default: + } + c <- 0 + c <- 0 + } + + { + // Ensure that send to full chan blocks. + c := make(chan int, chanCap) + for i := 0; i < chanCap; i++ { + c <- i + } + sent := uint32(0) + go func() { + c <- 0 + atomic.StoreUint32(&sent, 1) + }() + time.Sleep(time.Millisecond) + if atomic.LoadUint32(&sent) != 0 { + t.Fatalf("chan[%d]: send to full chan", chanCap) + } + // Ensure that non-blocking send does not block. + select { + case c <- 0: + t.Fatalf("chan[%d]: send to full chan", chanCap) + default: + } + <-c + } + + { + // Ensure that we receive 0 from closed chan. + c := make(chan int, chanCap) + for i := 0; i < chanCap; i++ { + c <- i + } + close(c) + for i := 0; i < chanCap; i++ { + v := <-c + if v != i { + t.Fatalf("chan[%d]: received %v, expected %v", chanCap, v, i) + } + } + if v := <-c; v != 0 { + t.Fatalf("chan[%d]: received %v, expected %v", chanCap, v, 0) + } + if v, ok := <-c; v != 0 || ok { + t.Fatalf("chan[%d]: received %v/%v, expected %v/%v", chanCap, v, ok, 0, false) + } + } + + { + // Ensure that close unblocks receive. + c := make(chan int, chanCap) + done := make(chan bool) + go func() { + v, ok := <-c + done <- v == 0 && ok == false + }() + time.Sleep(time.Millisecond) + close(c) + if !<-done { + t.Fatalf("chan[%d]: received non zero from closed chan", chanCap) + } + } + + { + // Send 100 integers, + // ensure that we receive them non-corrupted in FIFO order. + c := make(chan int, chanCap) + go func() { + for i := 0; i < 100; i++ { + c <- i + } + }() + for i := 0; i < 100; i++ { + v := <-c + if v != i { + t.Fatalf("chan[%d]: received %v, expected %v", chanCap, v, i) + } + } + + // Same, but using recv2. + go func() { + for i := 0; i < 100; i++ { + c <- i + } + }() + for i := 0; i < 100; i++ { + v, ok := <-c + if !ok { + t.Fatalf("chan[%d]: receive failed, expected %v", chanCap, i) + } + if v != i { + t.Fatalf("chan[%d]: received %v, expected %v", chanCap, v, i) + } + } + + // Send 1000 integers in 4 goroutines, + // ensure that we receive what we send. + const P = 4 + const L = 1000 + for p := 0; p < P; p++ { + go func() { + for i := 0; i < L; i++ { + c <- i + } + }() + } + done := make(chan map[int]int) + for p := 0; p < P; p++ { + go func() { + recv := make(map[int]int) + for i := 0; i < L; i++ { + v := <-c + recv[v] = recv[v] + 1 + } + done <- recv + }() + } + recv := make(map[int]int) + for p := 0; p < P; p++ { + for k, v := range <-done { + recv[k] = recv[k] + v + } + } + if len(recv) != L { + t.Fatalf("chan[%d]: received %v values, expected %v", chanCap, len(recv), L) + } + for _, v := range recv { + if v != P { + t.Fatalf("chan[%d]: received %v values, expected %v", chanCap, v, P) + } + } + } + + { + // Test len/cap. 
+ c := make(chan int, chanCap) + if len(c) != 0 || cap(c) != chanCap { + t.Fatalf("chan[%d]: bad len/cap, expect %v/%v, got %v/%v", chanCap, 0, chanCap, len(c), cap(c)) + } + for i := 0; i < chanCap; i++ { + c <- i + } + if len(c) != chanCap || cap(c) != chanCap { + t.Fatalf("chan[%d]: bad len/cap, expect %v/%v, got %v/%v", chanCap, chanCap, chanCap, len(c), cap(c)) + } + } + + } +} + +func TestNonblockRecvRace(t *testing.T) { + n := 10000 + if testing.Short() { + n = 100 + } + for i := 0; i < n; i++ { + c := make(chan int, 1) + c <- 1 + go func() { + select { + case <-c: + default: + t.Fatal("chan is not ready") + } + }() + close(c) + <-c + } +} + +func TestSelfSelect(t *testing.T) { + // Ensure that send/recv on the same chan in select + // does not crash nor deadlock. + defer runtime.GOMAXPROCS(runtime.GOMAXPROCS(2)) + for _, chanCap := range []int{0, 10} { + var wg sync.WaitGroup + wg.Add(2) + c := make(chan int, chanCap) + for p := 0; p < 2; p++ { + p := p + go func() { + defer wg.Done() + for i := 0; i < 1000; i++ { + if p == 0 || i%2 == 0 { + select { + case c <- p: + case v := <-c: + if chanCap == 0 && v == p { + t.Fatalf("self receive") + } + } + } else { + select { + case v := <-c: + if chanCap == 0 && v == p { + t.Fatalf("self receive") + } + case c <- p: + } + } + } + }() + } + wg.Wait() + } +} + +func TestSelectStress(t *testing.T) { + defer runtime.GOMAXPROCS(runtime.GOMAXPROCS(10)) + var c [4]chan int + c[0] = make(chan int) + c[1] = make(chan int) + c[2] = make(chan int, 2) + c[3] = make(chan int, 3) + N := int(1e5) + if testing.Short() { + N /= 10 + } + // There are 4 goroutines that send N values on each of the chans, + // + 4 goroutines that receive N values on each of the chans, + // + 1 goroutine that sends N values on each of the chans in a single select, + // + 1 goroutine that receives N values on each of the chans in a single select. + // All these sends, receives and selects interact chaotically at runtime, + // but we are careful that this whole construct does not deadlock. 
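+	// That is 4 + 4 + 1 + 1 = 10 goroutines, matching the wg.Add(10) below.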
+ var wg sync.WaitGroup + wg.Add(10) + for k := 0; k < 4; k++ { + k := k + go func() { + for i := 0; i < N; i++ { + c[k] <- 0 + } + wg.Done() + }() + go func() { + for i := 0; i < N; i++ { + <-c[k] + } + wg.Done() + }() + } + go func() { + var n [4]int + c1 := c + for i := 0; i < 4*N; i++ { + select { + case c1[3] <- 0: + n[3]++ + if n[3] == N { + c1[3] = nil + } + case c1[2] <- 0: + n[2]++ + if n[2] == N { + c1[2] = nil + } + case c1[0] <- 0: + n[0]++ + if n[0] == N { + c1[0] = nil + } + case c1[1] <- 0: + n[1]++ + if n[1] == N { + c1[1] = nil + } + } + } + wg.Done() + }() + go func() { + var n [4]int + c1 := c + for i := 0; i < 4*N; i++ { + select { + case <-c1[0]: + n[0]++ + if n[0] == N { + c1[0] = nil + } + case <-c1[1]: + n[1]++ + if n[1] == N { + c1[1] = nil + } + case <-c1[2]: + n[2]++ + if n[2] == N { + c1[2] = nil + } + case <-c1[3]: + n[3]++ + if n[3] == N { + c1[3] = nil + } + } + } + wg.Done() + }() + wg.Wait() +} + +func TestChanSendInterface(t *testing.T) { + type mt struct{} + m := &mt{} + c := make(chan interface{}, 1) + c <- m + select { + case c <- m: + default: + } + select { + case c <- m: + case c <- &mt{}: + default: + } +} + +func TestPseudoRandomSend(t *testing.T) { + n := 100 + for _, chanCap := range []int{0, n} { + c := make(chan int, chanCap) + l := make([]int, n) + var m sync.Mutex + m.Lock() + go func() { + for i := 0; i < n; i++ { + runtime.Gosched() + l[i] = <-c + } + m.Unlock() + }() + for i := 0; i < n; i++ { + select { + case c <- 1: + case c <- 0: + } + } + m.Lock() // wait + n0 := 0 + n1 := 0 + for _, i := range l { + n0 += (i + 1) % 2 + n1 += i + } + if n0 <= n/10 || n1 <= n/10 { + t.Errorf("Want pseudorandom, got %d zeros and %d ones (chan cap %d)", n0, n1, chanCap) + } + } +} + +func TestMultiConsumer(t *testing.T) { + const nwork = 23 + const niter = 271828 + + pn := []int{2, 3, 7, 11, 13, 17, 19, 23, 27, 31} + + q := make(chan int, nwork*3) + r := make(chan int, nwork*3) + + // workers + var wg sync.WaitGroup + for i := 0; i < nwork; i++ { + wg.Add(1) + go func(w int) { + for v := range q { + // mess with the fifo-ish nature of range + if pn[w%len(pn)] == v { + runtime.Gosched() + } + r <- v + } + wg.Done() + }(i) + } + + // feeder & closer + expect := 0 + go func() { + for i := 0; i < niter; i++ { + v := pn[i%len(pn)] + expect += v + q <- v + } + close(q) // no more work + wg.Wait() // workers done + close(r) // ... so there can be no more results + }() + + // consume & check + n := 0 + s := 0 + for v := range r { + n++ + s += v + } + if n != niter || s != expect { + t.Errorf("Expected sum %d (got %d) from %d iter (saw %d)", + expect, s, niter, n) + } +} + +func TestShrinkStackDuringBlockedSend(t *testing.T) { + // make sure that channel operations still work when we are + // blocked on a channel send and we shrink the stack. + // NOTE: this test probably won't fail unless stack.c:StackDebug + // is set to >= 1. + const n = 10 + c := make(chan int) + done := make(chan struct{}) + + go func() { + for i := 0; i < n; i++ { + c <- i + // use lots of stack, briefly. + stackGrowthRecursive(20) + } + done <- struct{}{} + }() + + for i := 0; i < n; i++ { + x := <-c + if x != i { + t.Errorf("bad channel read: want %d, got %d", i, x) + } + // Waste some time so sender can finish using lots of stack + // and block in channel send. + time.Sleep(1 * time.Millisecond) + // trigger GC which will shrink the stack of the sender. 
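+		// (Stack shrinking happens as part of GC, after stackGrowthRecursive
+		// has returned and the sender's stack usage has dropped again.)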
+ runtime.GC() + } + <-done +} + +func TestSelectDuplicateChannel(t *testing.T) { + // This test makes sure we can queue a G on + // the same channel multiple times. + c := make(chan int) + d := make(chan int) + e := make(chan int) + + // goroutine A + go func() { + select { + case <-c: + case <-c: + case <-d: + } + e <- 9 + }() + time.Sleep(time.Millisecond) // make sure goroutine A gets qeueued first on c + + // goroutine B + go func() { + <-c + }() + time.Sleep(time.Millisecond) // make sure goroutine B gets queued on c before continuing + + d <- 7 // wake up A, it dequeues itself from c. This operation used to corrupt c.recvq. + <-e // A tells us it's done + c <- 8 // wake up B. This operation used to fail because c.recvq was corrupted (it tries to wake up an already running G instead of B) +} + +func BenchmarkChanNonblocking(b *testing.B) { + myc := make(chan int) + b.RunParallel(func(pb *testing.PB) { + for pb.Next() { + select { + case <-myc: + default: + } + } + }) +} + +func BenchmarkSelectUncontended(b *testing.B) { + b.RunParallel(func(pb *testing.PB) { + myc1 := make(chan int, 1) + myc2 := make(chan int, 1) + myc1 <- 0 + for pb.Next() { + select { + case <-myc1: + myc2 <- 0 + case <-myc2: + myc1 <- 0 + } + } + }) +} + +func BenchmarkSelectSyncContended(b *testing.B) { + myc1 := make(chan int) + myc2 := make(chan int) + myc3 := make(chan int) + done := make(chan int) + b.RunParallel(func(pb *testing.PB) { + go func() { + for { + select { + case myc1 <- 0: + case myc2 <- 0: + case myc3 <- 0: + case <-done: + return + } + } + }() + for pb.Next() { + select { + case <-myc1: + case <-myc2: + case <-myc3: + } + } + }) + close(done) +} + +func BenchmarkSelectAsyncContended(b *testing.B) { + procs := runtime.GOMAXPROCS(0) + myc1 := make(chan int, procs) + myc2 := make(chan int, procs) + b.RunParallel(func(pb *testing.PB) { + myc1 <- 0 + for pb.Next() { + select { + case <-myc1: + myc2 <- 0 + case <-myc2: + myc1 <- 0 + } + } + }) +} + +func BenchmarkSelectNonblock(b *testing.B) { + myc1 := make(chan int) + myc2 := make(chan int) + myc3 := make(chan int, 1) + myc4 := make(chan int, 1) + b.RunParallel(func(pb *testing.PB) { + for pb.Next() { + select { + case <-myc1: + default: + } + select { + case myc2 <- 0: + default: + } + select { + case <-myc3: + default: + } + select { + case myc4 <- 0: + default: + } + } + }) +} + +func BenchmarkChanUncontended(b *testing.B) { + const C = 100 + b.RunParallel(func(pb *testing.PB) { + myc := make(chan int, C) + for pb.Next() { + for i := 0; i < C; i++ { + myc <- 0 + } + for i := 0; i < C; i++ { + <-myc + } + } + }) +} + +func BenchmarkChanContended(b *testing.B) { + const C = 100 + myc := make(chan int, C*runtime.GOMAXPROCS(0)) + b.RunParallel(func(pb *testing.PB) { + for pb.Next() { + for i := 0; i < C; i++ { + myc <- 0 + } + for i := 0; i < C; i++ { + <-myc + } + } + }) +} + +func BenchmarkChanSync(b *testing.B) { + const CallsPerSched = 1000 + procs := 2 + N := int32(b.N / CallsPerSched / procs * procs) + c := make(chan bool, procs) + myc := make(chan int) + for p := 0; p < procs; p++ { + go func() { + for { + i := atomic.AddInt32(&N, -1) + if i < 0 { + break + } + for g := 0; g < CallsPerSched; g++ { + if i%2 == 0 { + <-myc + myc <- 0 + } else { + myc <- 0 + <-myc + } + } + } + c <- true + }() + } + for p := 0; p < procs; p++ { + <-c + } +} + +func benchmarkChanProdCons(b *testing.B, chanSize, localWork int) { + const CallsPerSched = 1000 + procs := runtime.GOMAXPROCS(-1) + N := int32(b.N / CallsPerSched) + c := make(chan bool, 2*procs) + myc := 
make(chan int, chanSize) + for p := 0; p < procs; p++ { + go func() { + foo := 0 + for atomic.AddInt32(&N, -1) >= 0 { + for g := 0; g < CallsPerSched; g++ { + for i := 0; i < localWork; i++ { + foo *= 2 + foo /= 2 + } + myc <- 1 + } + } + myc <- 0 + c <- foo == 42 + }() + go func() { + foo := 0 + for { + v := <-myc + if v == 0 { + break + } + for i := 0; i < localWork; i++ { + foo *= 2 + foo /= 2 + } + } + c <- foo == 42 + }() + } + for p := 0; p < procs; p++ { + <-c + <-c + } +} + +func BenchmarkChanProdCons0(b *testing.B) { + benchmarkChanProdCons(b, 0, 0) +} + +func BenchmarkChanProdCons10(b *testing.B) { + benchmarkChanProdCons(b, 10, 0) +} + +func BenchmarkChanProdCons100(b *testing.B) { + benchmarkChanProdCons(b, 100, 0) +} + +func BenchmarkChanProdConsWork0(b *testing.B) { + benchmarkChanProdCons(b, 0, 100) +} + +func BenchmarkChanProdConsWork10(b *testing.B) { + benchmarkChanProdCons(b, 10, 100) +} + +func BenchmarkChanProdConsWork100(b *testing.B) { + benchmarkChanProdCons(b, 100, 100) +} + +func BenchmarkSelectProdCons(b *testing.B) { + const CallsPerSched = 1000 + procs := runtime.GOMAXPROCS(-1) + N := int32(b.N / CallsPerSched) + c := make(chan bool, 2*procs) + myc := make(chan int, 128) + myclose := make(chan bool) + for p := 0; p < procs; p++ { + go func() { + // Producer: sends to myc. + foo := 0 + // Intended to not fire during benchmarking. + mytimer := time.After(time.Hour) + for atomic.AddInt32(&N, -1) >= 0 { + for g := 0; g < CallsPerSched; g++ { + // Model some local work. + for i := 0; i < 100; i++ { + foo *= 2 + foo /= 2 + } + select { + case myc <- 1: + case <-mytimer: + case <-myclose: + } + } + } + myc <- 0 + c <- foo == 42 + }() + go func() { + // Consumer: receives from myc. + foo := 0 + // Intended to not fire during benchmarking. + mytimer := time.After(time.Hour) + loop: + for { + select { + case v := <-myc: + if v == 0 { + break loop + } + case <-mytimer: + case <-myclose: + } + // Model some local work. + for i := 0; i < 100; i++ { + foo *= 2 + foo /= 2 + } + } + c <- foo == 42 + }() + } + for p := 0; p < procs; p++ { + <-c + <-c + } +} + +func BenchmarkChanCreation(b *testing.B) { + b.RunParallel(func(pb *testing.PB) { + for pb.Next() { + myc := make(chan int, 1) + myc <- 0 + <-myc + } + }) +} + +func BenchmarkChanSem(b *testing.B) { + type Empty struct{} + myc := make(chan Empty, runtime.GOMAXPROCS(0)) + b.RunParallel(func(pb *testing.PB) { + for pb.Next() { + myc <- Empty{} + <-myc + } + }) +} diff --git a/src/runtime/closure_test.go b/src/runtime/closure_test.go new file mode 100644 index 000000000..ea65fbd5f --- /dev/null +++ b/src/runtime/closure_test.go @@ -0,0 +1,53 @@ +// Copyright 2011 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. 
+package runtime_test + +import "testing" + +var s int + +func BenchmarkCallClosure(b *testing.B) { + for i := 0; i < b.N; i++ { + s += func(ii int) int { return 2 * ii }(i) + } +} + +func BenchmarkCallClosure1(b *testing.B) { + for i := 0; i < b.N; i++ { + j := i + s += func(ii int) int { return 2*ii + j }(i) + } +} + +var ss *int + +func BenchmarkCallClosure2(b *testing.B) { + for i := 0; i < b.N; i++ { + j := i + s += func() int { + ss = &j + return 2 + }() + } +} + +func addr1(x int) *int { + return func() *int { return &x }() +} + +func BenchmarkCallClosure3(b *testing.B) { + for i := 0; i < b.N; i++ { + ss = addr1(i) + } +} + +func addr2() (x int, p *int) { + return 0, func() *int { return &x }() +} + +func BenchmarkCallClosure4(b *testing.B) { + for i := 0; i < b.N; i++ { + _, ss = addr2() + } +} diff --git a/src/runtime/compiler.go b/src/runtime/compiler.go new file mode 100644 index 000000000..562a46022 --- /dev/null +++ b/src/runtime/compiler.go @@ -0,0 +1,13 @@ +// Copyright 2012 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package runtime + +// Compiler is the name of the compiler toolchain that built the +// running binary. Known toolchains are: +// +// gc The 5g/6g/8g compiler suite at code.google.com/p/go. +// gccgo The gccgo front end, part of the GCC compiler suite. +// +const Compiler = "gc" diff --git a/src/runtime/complex.go b/src/runtime/complex.go new file mode 100644 index 000000000..ec50f8947 --- /dev/null +++ b/src/runtime/complex.go @@ -0,0 +1,52 @@ +// Copyright 2010 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package runtime + +func complex128div(n complex128, d complex128) complex128 { + // Special cases as in C99. + ninf := real(n) == posinf || real(n) == neginf || + imag(n) == posinf || imag(n) == neginf + dinf := real(d) == posinf || real(d) == neginf || + imag(d) == posinf || imag(d) == neginf + + nnan := !ninf && (real(n) != real(n) || imag(n) != imag(n)) + dnan := !dinf && (real(d) != real(d) || imag(d) != imag(d)) + + switch { + case nnan || dnan: + return complex(nan, nan) + case ninf && !dinf: + return complex(posinf, posinf) + case !ninf && dinf: + return complex(0, 0) + case real(d) == 0 && imag(d) == 0: + if real(n) == 0 && imag(n) == 0 { + return complex(nan, nan) + } else { + return complex(posinf, posinf) + } + default: + // Standard complex arithmetic, factored to avoid unnecessary overflow. + a := real(d) + if a < 0 { + a = -a + } + b := imag(d) + if b < 0 { + b = -b + } + if a <= b { + ratio := real(d) / imag(d) + denom := real(d)*ratio + imag(d) + return complex((real(n)*ratio+imag(n))/denom, + (imag(n)*ratio-real(n))/denom) + } else { + ratio := imag(d) / real(d) + denom := imag(d)*ratio + real(d) + return complex((imag(n)*ratio+real(n))/denom, + (imag(n)-real(n)*ratio)/denom) + } + } +} diff --git a/src/runtime/complex_test.go b/src/runtime/complex_test.go new file mode 100644 index 000000000..f41e6a357 --- /dev/null +++ b/src/runtime/complex_test.go @@ -0,0 +1,67 @@ +// Copyright 2012 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. 
+ +package runtime_test + +import ( + "math/cmplx" + "testing" +) + +var result complex128 + +func BenchmarkComplex128DivNormal(b *testing.B) { + d := 15 + 2i + n := 32 + 3i + res := 0i + for i := 0; i < b.N; i++ { + n += 0.1i + res += n / d + } + result = res +} + +func BenchmarkComplex128DivNisNaN(b *testing.B) { + d := cmplx.NaN() + n := 32 + 3i + res := 0i + for i := 0; i < b.N; i++ { + n += 0.1i + res += n / d + } + result = res +} + +func BenchmarkComplex128DivDisNaN(b *testing.B) { + d := 15 + 2i + n := cmplx.NaN() + res := 0i + for i := 0; i < b.N; i++ { + d += 0.1i + res += n / d + } + result = res +} + +func BenchmarkComplex128DivNisInf(b *testing.B) { + d := 15 + 2i + n := cmplx.Inf() + res := 0i + for i := 0; i < b.N; i++ { + d += 0.1i + res += n / d + } + result = res +} + +func BenchmarkComplex128DivDisInf(b *testing.B) { + d := cmplx.Inf() + n := 32 + 3i + res := 0i + for i := 0; i < b.N; i++ { + n += 0.1i + res += n / d + } + result = res +} diff --git a/src/runtime/cpuprof.go b/src/runtime/cpuprof.go new file mode 100644 index 000000000..8b1c1c632 --- /dev/null +++ b/src/runtime/cpuprof.go @@ -0,0 +1,425 @@ +// Copyright 2011 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// CPU profiling. +// Based on algorithms and data structures used in +// http://code.google.com/p/google-perftools/. +// +// The main difference between this code and the google-perftools +// code is that this code is written to allow copying the profile data +// to an arbitrary io.Writer, while the google-perftools code always +// writes to an operating system file. +// +// The signal handler for the profiling clock tick adds a new stack trace +// to a hash table tracking counts for recent traces. Most clock ticks +// hit in the cache. In the event of a cache miss, an entry must be +// evicted from the hash table, copied to a log that will eventually be +// written as profile data. The google-perftools code flushed the +// log itself during the signal handler. This code cannot do that, because +// the io.Writer might block or need system calls or locks that are not +// safe to use from within the signal handler. Instead, we split the log +// into two halves and let the signal handler fill one half while a goroutine +// is writing out the other half. When the signal handler fills its half, it +// offers to swap with the goroutine. If the writer is not done with its half, +// we lose the stack trace for this clock tick (and record that loss). +// The goroutine interacts with the signal handler by calling getprofile() to +// get the next log piece to write, implicitly handing back the last log +// piece it obtained. +// +// The state of this dance between the signal handler and the goroutine +// is encoded in the Profile.handoff field. If handoff == 0, then the goroutine +// is not using either log half and is waiting (or will soon be waiting) for +// a new piece by calling notesleep(&p->wait). If the signal handler +// changes handoff from 0 to non-zero, it must call notewakeup(&p->wait) +// to wake the goroutine. The value indicates the number of entries in the +// log half being handed off. The goroutine leaves the non-zero value in +// place until it has finished processing the log half and then flips the number +// back to zero. 
Setting the high bit in handoff means that the profiling is over, +// and the goroutine is now in charge of flushing the data left in the hash table +// to the log and returning that data. +// +// The handoff field is manipulated using atomic operations. +// For the most part, the manipulation of handoff is orderly: if handoff == 0 +// then the signal handler owns it and can change it to non-zero. +// If handoff != 0 then the goroutine owns it and can change it to zero. +// If that were the end of the story then we would not need to manipulate +// handoff using atomic operations. The operations are needed, however, +// in order to let the log closer set the high bit to indicate "EOF" safely +// in the situation when normally the goroutine "owns" handoff. + +package runtime + +import "unsafe" + +const ( + numBuckets = 1 << 10 + logSize = 1 << 17 + assoc = 4 + maxCPUProfStack = 64 +) + +type cpuprofEntry struct { + count uintptr + depth uintptr + stack [maxCPUProfStack]uintptr +} + +type cpuProfile struct { + on bool // profiling is on + wait note // goroutine waits here + count uintptr // tick count + evicts uintptr // eviction count + lost uintptr // lost ticks that need to be logged + + // Active recent stack traces. + hash [numBuckets]struct { + entry [assoc]cpuprofEntry + } + + // Log of traces evicted from hash. + // Signal handler has filled log[toggle][:nlog]. + // Goroutine is writing log[1-toggle][:handoff]. + log [2][logSize / 2]uintptr + nlog uintptr + toggle int32 + handoff uint32 + + // Writer state. + // Writer maintains its own toggle to avoid races + // looking at signal handler's toggle. + wtoggle uint32 + wholding bool // holding & need to release a log half + flushing bool // flushing hash table - profile is over + eodSent bool // special end-of-data record sent; => flushing +} + +var ( + cpuprofLock mutex + cpuprof *cpuProfile + + eod = [3]uintptr{0, 1, 0} +) + +func setcpuprofilerate_m() // proc.c + +func setcpuprofilerate(hz int32) { + g := getg() + g.m.scalararg[0] = uintptr(hz) + onM(setcpuprofilerate_m) +} + +// lostProfileData is a no-op function used in profiles +// to mark the number of profiling stack traces that were +// discarded due to slow data writers. +func lostProfileData() {} + +// SetCPUProfileRate sets the CPU profiling rate to hz samples per second. +// If hz <= 0, SetCPUProfileRate turns off profiling. +// If the profiler is on, the rate cannot be changed without first turning it off. +// +// Most clients should use the runtime/pprof package or +// the testing package's -test.cpuprofile flag instead of calling +// SetCPUProfileRate directly. +func SetCPUProfileRate(hz int) { + // Clamp hz to something reasonable. + if hz < 0 { + hz = 0 + } + if hz > 1000000 { + hz = 1000000 + } + + lock(&cpuprofLock) + if hz > 0 { + if cpuprof == nil { + cpuprof = (*cpuProfile)(sysAlloc(unsafe.Sizeof(cpuProfile{}), &memstats.other_sys)) + if cpuprof == nil { + print("runtime: cpu profiling cannot allocate memory\n") + unlock(&cpuprofLock) + return + } + } + if cpuprof.on || cpuprof.handoff != 0 { + print("runtime: cannot set cpu profile rate until previous profile has finished.\n") + unlock(&cpuprofLock) + return + } + + cpuprof.on = true + // pprof binary header format. 
+ // http://code.google.com/p/google-perftools/source/browse/trunk/src/profiledata.cc#117 + p := &cpuprof.log[0] + p[0] = 0 // count for header + p[1] = 3 // depth for header + p[2] = 0 // version number + p[3] = uintptr(1e6 / hz) // period (microseconds) + p[4] = 0 + cpuprof.nlog = 5 + cpuprof.toggle = 0 + cpuprof.wholding = false + cpuprof.wtoggle = 0 + cpuprof.flushing = false + cpuprof.eodSent = false + noteclear(&cpuprof.wait) + + setcpuprofilerate(int32(hz)) + } else if cpuprof != nil && cpuprof.on { + setcpuprofilerate(0) + cpuprof.on = false + + // Now add is not running anymore, and getprofile owns the entire log. + // Set the high bit in prof->handoff to tell getprofile. + for { + n := cpuprof.handoff + if n&0x80000000 != 0 { + print("runtime: setcpuprofile(off) twice\n") + } + if cas(&cpuprof.handoff, n, n|0x80000000) { + if n == 0 { + // we did the transition from 0 -> nonzero so we wake getprofile + notewakeup(&cpuprof.wait) + } + break + } + } + } + unlock(&cpuprofLock) +} + +func cpuproftick(pc *uintptr, n int32) { + if n > maxCPUProfStack { + n = maxCPUProfStack + } + s := (*[maxCPUProfStack]uintptr)(unsafe.Pointer(pc))[:n] + cpuprof.add(s) +} + +// add adds the stack trace to the profile. +// It is called from signal handlers and other limited environments +// and cannot allocate memory or acquire locks that might be +// held at the time of the signal, nor can it use substantial amounts +// of stack. It is allowed to call evict. +func (p *cpuProfile) add(pc []uintptr) { + // Compute hash. + h := uintptr(0) + for _, x := range pc { + h = h<<8 | (h >> (8 * (unsafe.Sizeof(h) - 1))) + h += x*31 + x*7 + x*3 + } + p.count++ + + // Add to entry count if already present in table. + b := &p.hash[h%numBuckets] +Assoc: + for i := range b.entry { + e := &b.entry[i] + if e.depth != uintptr(len(pc)) { + continue + } + for j := range pc { + if e.stack[j] != pc[j] { + continue Assoc + } + } + e.count++ + return + } + + // Evict entry with smallest count. + var e *cpuprofEntry + for i := range b.entry { + if e == nil || b.entry[i].count < e.count { + e = &b.entry[i] + } + } + if e.count > 0 { + if !p.evict(e) { + // Could not evict entry. Record lost stack. + p.lost++ + return + } + p.evicts++ + } + + // Reuse the newly evicted entry. + e.depth = uintptr(len(pc)) + e.count = 1 + copy(e.stack[:], pc) +} + +// evict copies the given entry's data into the log, so that +// the entry can be reused. evict is called from add, which +// is called from the profiling signal handler, so it must not +// allocate memory or block. It is safe to call flushlog. +// evict returns true if the entry was copied to the log, +// false if there was no room available. +func (p *cpuProfile) evict(e *cpuprofEntry) bool { + d := e.depth + nslot := d + 2 + log := &p.log[p.toggle] + if p.nlog+nslot > uintptr(len(p.log[0])) { + if !p.flushlog() { + return false + } + log = &p.log[p.toggle] + } + + q := p.nlog + log[q] = e.count + q++ + log[q] = d + q++ + copy(log[q:], e.stack[:d]) + q += d + p.nlog = q + e.count = 0 + return true +} + +// flushlog tries to flush the current log and switch to the other one. +// flushlog is called from evict, called from add, called from the signal handler, +// so it cannot allocate memory or block. It can try to swap logs with +// the writing goroutine, as explained in the comment at the top of this file. 
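+// A successful handoff publishes the entry count in p.handoff and wakes the
+// writing goroutine; if the writer still owns the other half (handoff != 0),
+// flushlog reports failure and the caller records a lost sample instead.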
+func (p *cpuProfile) flushlog() bool { + if !cas(&p.handoff, 0, uint32(p.nlog)) { + return false + } + notewakeup(&p.wait) + + p.toggle = 1 - p.toggle + log := &p.log[p.toggle] + q := uintptr(0) + if p.lost > 0 { + lostPC := funcPC(lostProfileData) + log[0] = p.lost + log[1] = 1 + log[2] = lostPC + q = 3 + p.lost = 0 + } + p.nlog = q + return true +} + +// getprofile blocks until the next block of profiling data is available +// and returns it as a []byte. It is called from the writing goroutine. +func (p *cpuProfile) getprofile() []byte { + if p == nil { + return nil + } + + if p.wholding { + // Release previous log to signal handling side. + // Loop because we are racing against SetCPUProfileRate(0). + for { + n := p.handoff + if n == 0 { + print("runtime: phase error during cpu profile handoff\n") + return nil + } + if n&0x80000000 != 0 { + p.wtoggle = 1 - p.wtoggle + p.wholding = false + p.flushing = true + goto Flush + } + if cas(&p.handoff, n, 0) { + break + } + } + p.wtoggle = 1 - p.wtoggle + p.wholding = false + } + + if p.flushing { + goto Flush + } + + if !p.on && p.handoff == 0 { + return nil + } + + // Wait for new log. + notetsleepg(&p.wait, -1) + noteclear(&p.wait) + + switch n := p.handoff; { + case n == 0: + print("runtime: phase error during cpu profile wait\n") + return nil + case n == 0x80000000: + p.flushing = true + goto Flush + default: + n &^= 0x80000000 + + // Return new log to caller. + p.wholding = true + + return uintptrBytes(p.log[p.wtoggle][:n]) + } + + // In flush mode. + // Add is no longer being called. We own the log. + // Also, p->handoff is non-zero, so flushlog will return false. + // Evict the hash table into the log and return it. +Flush: + for i := range p.hash { + b := &p.hash[i] + for j := range b.entry { + e := &b.entry[j] + if e.count > 0 && !p.evict(e) { + // Filled the log. Stop the loop and return what we've got. + break Flush + } + } + } + + // Return pending log data. + if p.nlog > 0 { + // Note that we're using toggle now, not wtoggle, + // because we're working on the log directly. + n := p.nlog + p.nlog = 0 + return uintptrBytes(p.log[p.toggle][:n]) + } + + // Made it through the table without finding anything to log. + if !p.eodSent { + // We may not have space to append this to the partial log buf, + // so we always return a new slice for the end-of-data marker. + p.eodSent = true + return uintptrBytes(eod[:]) + } + + // Finally done. Clean up and return nil. + p.flushing = false + if !cas(&p.handoff, p.handoff, 0) { + print("runtime: profile flush racing with something\n") + } + return nil +} + +func uintptrBytes(p []uintptr) (ret []byte) { + pp := (*sliceStruct)(unsafe.Pointer(&p)) + rp := (*sliceStruct)(unsafe.Pointer(&ret)) + + rp.array = pp.array + rp.len = pp.len * int(unsafe.Sizeof(p[0])) + rp.cap = rp.len + + return +} + +// CPUProfile returns the next chunk of binary CPU profiling stack trace data, +// blocking until data is available. If profiling is turned off and all the profile +// data accumulated while it was on has been returned, CPUProfile returns nil. +// The caller must save the returned data before calling CPUProfile again. +// +// Most clients should use the runtime/pprof package or +// the testing package's -test.cpuprofile flag instead of calling +// CPUProfile directly. 
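+//
+// A typical driver, roughly what runtime/pprof's profile writer does
+// (w here stands for whatever io.Writer receives the profile), loops
+// until CPUProfile returns nil:
+//
+//	for {
+//		data := runtime.CPUProfile()
+//		if data == nil {
+//			break
+//		}
+//		w.Write(data)
+//	}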
+func CPUProfile() []byte { + return cpuprof.getprofile() +} diff --git a/src/runtime/crash_cgo_test.go b/src/runtime/crash_cgo_test.go new file mode 100644 index 000000000..972eedc62 --- /dev/null +++ b/src/runtime/crash_cgo_test.go @@ -0,0 +1,196 @@ +// Copyright 2012 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// +build cgo + +package runtime_test + +import ( + "runtime" + "strings" + "testing" +) + +func TestCgoCrashHandler(t *testing.T) { + testCrashHandler(t, true) +} + +func TestCgoSignalDeadlock(t *testing.T) { + if testing.Short() && runtime.GOOS == "windows" { + t.Skip("Skipping in short mode") // takes up to 64 seconds + } + got := executeTest(t, cgoSignalDeadlockSource, nil) + want := "OK\n" + if got != want { + t.Fatalf("expected %q, but got %q", want, got) + } +} + +func TestCgoTraceback(t *testing.T) { + got := executeTest(t, cgoTracebackSource, nil) + want := "OK\n" + if got != want { + t.Fatalf("expected %q, but got %q", want, got) + } +} + +func TestCgoExternalThreadPanic(t *testing.T) { + if runtime.GOOS == "plan9" { + t.Skipf("no pthreads on %s", runtime.GOOS) + } + csrc := cgoExternalThreadPanicC + if runtime.GOOS == "windows" { + csrc = cgoExternalThreadPanicC_windows + } + got := executeTest(t, cgoExternalThreadPanicSource, nil, "main.c", csrc) + want := "panic: BOOM" + if !strings.Contains(got, want) { + t.Fatalf("want failure containing %q. output:\n%s\n", want, got) + } +} + +const cgoSignalDeadlockSource = ` +package main + +import "C" + +import ( + "fmt" + "runtime" + "time" +) + +func main() { + runtime.GOMAXPROCS(100) + ping := make(chan bool) + go func() { + for i := 0; ; i++ { + runtime.Gosched() + select { + case done := <-ping: + if done { + ping <- true + return + } + ping <- true + default: + } + func() { + defer func() { + recover() + }() + var s *string + *s = "" + }() + } + }() + time.Sleep(time.Millisecond) + for i := 0; i < 64; i++ { + go func() { + runtime.LockOSThread() + select {} + }() + go func() { + runtime.LockOSThread() + select {} + }() + time.Sleep(time.Millisecond) + ping <- false + select { + case <-ping: + case <-time.After(time.Second): + fmt.Printf("HANG\n") + return + } + } + ping <- true + select { + case <-ping: + case <-time.After(time.Second): + fmt.Printf("HANG\n") + return + } + fmt.Printf("OK\n") +} +` + +const cgoTracebackSource = ` +package main + +/* void foo(void) {} */ +import "C" + +import ( + "fmt" + "runtime" +) + +func main() { + C.foo() + buf := make([]byte, 1) + runtime.Stack(buf, true) + fmt.Printf("OK\n") +} +` + +const cgoExternalThreadPanicSource = ` +package main + +// void start(void); +import "C" + +func main() { + C.start() + select {} +} + +//export gopanic +func gopanic() { + panic("BOOM") +} +` + +const cgoExternalThreadPanicC = ` +#include <stdlib.h> +#include <stdio.h> +#include <pthread.h> + +void gopanic(void); + +static void* +die(void* x) +{ + gopanic(); + return 0; +} + +void +start(void) +{ + pthread_t t; + if(pthread_create(&t, 0, die, 0) != 0) + printf("pthread_create failed\n"); +} +` + +const cgoExternalThreadPanicC_windows = ` +#include <stdlib.h> +#include <stdio.h> + +void gopanic(void); + +static void* +die(void* x) +{ + gopanic(); + return 0; +} + +void +start(void) +{ + if(_beginthreadex(0, 0, die, 0, 0, 0) != 0) + printf("_beginthreadex failed\n"); +} +` diff --git a/src/runtime/crash_test.go b/src/runtime/crash_test.go new file mode 100644 index 000000000..211a0476f --- /dev/null +++ 
b/src/runtime/crash_test.go @@ -0,0 +1,515 @@ +// Copyright 2012 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package runtime_test + +import ( + "io/ioutil" + "os" + "os/exec" + "path/filepath" + "runtime" + "strings" + "testing" + "text/template" +) + +// testEnv excludes GODEBUG from the environment +// to prevent its output from breaking tests that +// are trying to parse other command output. +func testEnv(cmd *exec.Cmd) *exec.Cmd { + if cmd.Env != nil { + panic("environment already set") + } + for _, env := range os.Environ() { + if strings.HasPrefix(env, "GODEBUG=") { + continue + } + cmd.Env = append(cmd.Env, env) + } + return cmd +} + +func executeTest(t *testing.T, templ string, data interface{}, extra ...string) string { + switch runtime.GOOS { + case "android", "nacl": + t.Skipf("skipping on %s", runtime.GOOS) + } + + checkStaleRuntime(t) + + st := template.Must(template.New("crashSource").Parse(templ)) + + dir, err := ioutil.TempDir("", "go-build") + if err != nil { + t.Fatalf("failed to create temp directory: %v", err) + } + defer os.RemoveAll(dir) + + src := filepath.Join(dir, "main.go") + f, err := os.Create(src) + if err != nil { + t.Fatalf("failed to create file: %v", err) + } + err = st.Execute(f, data) + if err != nil { + f.Close() + t.Fatalf("failed to execute template: %v", err) + } + if err := f.Close(); err != nil { + t.Fatalf("failed to close file: %v", err) + } + + for i := 0; i < len(extra); i += 2 { + if err := ioutil.WriteFile(filepath.Join(dir, extra[i]), []byte(extra[i+1]), 0666); err != nil { + t.Fatal(err) + } + } + + cmd := exec.Command("go", "build", "-o", "a.exe") + cmd.Dir = dir + out, err := testEnv(cmd).CombinedOutput() + if err != nil { + t.Fatalf("building source: %v\n%s", err, out) + } + + got, _ := testEnv(exec.Command(filepath.Join(dir, "a.exe"))).CombinedOutput() + return string(got) +} + +func checkStaleRuntime(t *testing.T) { + // 'go run' uses the installed copy of runtime.a, which may be out of date. + out, err := testEnv(exec.Command("go", "list", "-f", "{{.Stale}}", "runtime")).CombinedOutput() + if err != nil { + t.Fatalf("failed to execute 'go list': %v\n%v", err, string(out)) + } + if string(out) != "false\n" { + t.Fatalf("Stale runtime.a. 
Run 'go install runtime'.") + } +} + +func testCrashHandler(t *testing.T, cgo bool) { + type crashTest struct { + Cgo bool + } + output := executeTest(t, crashSource, &crashTest{Cgo: cgo}) + want := "main: recovered done\nnew-thread: recovered done\nsecond-new-thread: recovered done\nmain-again: recovered done\n" + if output != want { + t.Fatalf("output:\n%s\n\nwanted:\n%s", output, want) + } +} + +func TestCrashHandler(t *testing.T) { + testCrashHandler(t, false) +} + +func testDeadlock(t *testing.T, source string) { + output := executeTest(t, source, nil) + want := "fatal error: all goroutines are asleep - deadlock!\n" + if !strings.HasPrefix(output, want) { + t.Fatalf("output does not start with %q:\n%s", want, output) + } +} + +func TestSimpleDeadlock(t *testing.T) { + testDeadlock(t, simpleDeadlockSource) +} + +func TestInitDeadlock(t *testing.T) { + testDeadlock(t, initDeadlockSource) +} + +func TestLockedDeadlock(t *testing.T) { + testDeadlock(t, lockedDeadlockSource) +} + +func TestLockedDeadlock2(t *testing.T) { + testDeadlock(t, lockedDeadlockSource2) +} + +func TestGoexitDeadlock(t *testing.T) { + output := executeTest(t, goexitDeadlockSource, nil) + want := "no goroutines (main called runtime.Goexit) - deadlock!" + if !strings.Contains(output, want) { + t.Fatalf("output:\n%s\n\nwant output containing: %s", output, want) + } +} + +func TestStackOverflow(t *testing.T) { + output := executeTest(t, stackOverflowSource, nil) + want := "runtime: goroutine stack exceeds 4194304-byte limit\nfatal error: stack overflow" + if !strings.HasPrefix(output, want) { + t.Fatalf("output does not start with %q:\n%s", want, output) + } +} + +func TestThreadExhaustion(t *testing.T) { + output := executeTest(t, threadExhaustionSource, nil) + want := "runtime: program exceeds 10-thread limit\nfatal error: thread exhaustion" + if !strings.HasPrefix(output, want) { + t.Fatalf("output does not start with %q:\n%s", want, output) + } +} + +func TestRecursivePanic(t *testing.T) { + output := executeTest(t, recursivePanicSource, nil) + want := `wrap: bad +panic: again + +` + if !strings.HasPrefix(output, want) { + t.Fatalf("output does not start with %q:\n%s", want, output) + } + +} + +func TestGoexitCrash(t *testing.T) { + output := executeTest(t, goexitExitSource, nil) + want := "no goroutines (main called runtime.Goexit) - deadlock!" 
+ if !strings.Contains(output, want) { + t.Fatalf("output:\n%s\n\nwant output containing: %s", output, want) + } +} + +func TestGoexitDefer(t *testing.T) { + c := make(chan struct{}) + go func() { + defer func() { + r := recover() + if r != nil { + t.Errorf("non-nil recover during Goexit") + } + c <- struct{}{} + }() + runtime.Goexit() + }() + // Note: if the defer fails to run, we will get a deadlock here + <-c +} + +func TestGoNil(t *testing.T) { + output := executeTest(t, goNilSource, nil) + want := "go of nil func value" + if !strings.Contains(output, want) { + t.Fatalf("output:\n%s\n\nwant output containing: %s", output, want) + } +} + +func TestMainGoroutineId(t *testing.T) { + output := executeTest(t, mainGoroutineIdSource, nil) + want := "panic: test\n\ngoroutine 1 [running]:\n" + if !strings.HasPrefix(output, want) { + t.Fatalf("output does not start with %q:\n%s", want, output) + } +} + +func TestBreakpoint(t *testing.T) { + output := executeTest(t, breakpointSource, nil) + want := "runtime.Breakpoint()" + if !strings.Contains(output, want) { + t.Fatalf("output:\n%s\n\nwant output containing: %s", output, want) + } +} + +const crashSource = ` +package main + +import ( + "fmt" + "runtime" +) + +{{if .Cgo}} +import "C" +{{end}} + +func test(name string) { + defer func() { + if x := recover(); x != nil { + fmt.Printf(" recovered") + } + fmt.Printf(" done\n") + }() + fmt.Printf("%s:", name) + var s *string + _ = *s + fmt.Print("SHOULD NOT BE HERE") +} + +func testInNewThread(name string) { + c := make(chan bool) + go func() { + runtime.LockOSThread() + test(name) + c <- true + }() + <-c +} + +func main() { + runtime.LockOSThread() + test("main") + testInNewThread("new-thread") + testInNewThread("second-new-thread") + test("main-again") +} +` + +const simpleDeadlockSource = ` +package main +func main() { + select {} +} +` + +const initDeadlockSource = ` +package main +func init() { + select {} +} +func main() { +} +` + +const lockedDeadlockSource = ` +package main +import "runtime" +func main() { + runtime.LockOSThread() + select {} +} +` + +const lockedDeadlockSource2 = ` +package main +import ( + "runtime" + "time" +) +func main() { + go func() { + runtime.LockOSThread() + select {} + }() + time.Sleep(time.Millisecond) + select {} +} +` + +const goexitDeadlockSource = ` +package main +import ( + "runtime" +) + +func F() { + for i := 0; i < 10; i++ { + } +} + +func main() { + go F() + go F() + runtime.Goexit() +} +` + +const stackOverflowSource = ` +package main + +import "runtime/debug" + +func main() { + debug.SetMaxStack(4<<20) + f(make([]byte, 10)) +} + +func f(x []byte) byte { + var buf [64<<10]byte + return x[0] + f(buf[:]) +} +` + +const threadExhaustionSource = ` +package main + +import ( + "runtime" + "runtime/debug" +) + +func main() { + debug.SetMaxThreads(10) + c := make(chan int) + for i := 0; i < 100; i++ { + go func() { + runtime.LockOSThread() + c <- 0 + select{} + }() + <-c + } +} +` + +const recursivePanicSource = ` +package main + +import ( + "fmt" +) + +func main() { + func() { + defer func() { + fmt.Println(recover()) + }() + var x [8192]byte + func(x [8192]byte) { + defer func() { + if err := recover(); err != nil { + panic("wrap: " + err.(string)) + } + }() + panic("bad") + }(x) + }() + panic("again") +} +` + +const goexitExitSource = ` +package main + +import ( + "runtime" + "time" +) + +func main() { + go func() { + time.Sleep(time.Millisecond) + }() + i := 0 + runtime.SetFinalizer(&i, func(p *int) {}) + runtime.GC() + runtime.Goexit() +} +` + +const 
goNilSource = ` +package main + +func main() { + defer func() { + recover() + }() + var f func() + go f() + select{} +} +` + +const mainGoroutineIdSource = ` +package main +func main() { + panic("test") +} +` + +const breakpointSource = ` +package main +import "runtime" +func main() { + runtime.Breakpoint() +} +` + +func TestGoexitInPanic(t *testing.T) { + // see issue 8774: this code used to trigger an infinite recursion + output := executeTest(t, goexitInPanicSource, nil) + want := "fatal error: no goroutines (main called runtime.Goexit) - deadlock!" + if !strings.HasPrefix(output, want) { + t.Fatalf("output does not start with %q:\n%s", want, output) + } +} + +const goexitInPanicSource = ` +package main +import "runtime" +func main() { + go func() { + defer func() { + runtime.Goexit() + }() + panic("hello") + }() + runtime.Goexit() +} +` + +func TestPanicAfterGoexit(t *testing.T) { + // an uncaught panic should still work after goexit + output := executeTest(t, panicAfterGoexitSource, nil) + want := "panic: hello" + if !strings.HasPrefix(output, want) { + t.Fatalf("output does not start with %q:\n%s", want, output) + } +} + +const panicAfterGoexitSource = ` +package main +import "runtime" +func main() { + defer func() { + panic("hello") + }() + runtime.Goexit() +} +` + +func TestRecoveredPanicAfterGoexit(t *testing.T) { + output := executeTest(t, recoveredPanicAfterGoexitSource, nil) + want := "fatal error: no goroutines (main called runtime.Goexit) - deadlock!" + if !strings.HasPrefix(output, want) { + t.Fatalf("output does not start with %q:\n%s", want, output) + } +} + +const recoveredPanicAfterGoexitSource = ` +package main +import "runtime" +func main() { + defer func() { + defer func() { + r := recover() + if r == nil { + panic("bad recover") + } + }() + panic("hello") + }() + runtime.Goexit() +} +` + +func TestRecoverBeforePanicAfterGoexit(t *testing.T) { + // 1. defer a function that recovers + // 2. defer a function that panics + // 3. call goexit + // Goexit should run the #2 defer. Its panic + // should be caught by the #1 defer, and execution + // should resume in the caller. Like the Goexit + // never happened! + defer func() { + r := recover() + if r == nil { + panic("bad recover") + } + }() + defer func() { + panic("hello") + }() + runtime.Goexit() +} diff --git a/src/runtime/debug.go b/src/runtime/debug.go new file mode 100644 index 000000000..4414dd55d --- /dev/null +++ b/src/runtime/debug.go @@ -0,0 +1,70 @@ +// Copyright 2009 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package runtime + +import "unsafe" + +// Breakpoint executes a breakpoint trap. +func Breakpoint() + +// LockOSThread wires the calling goroutine to its current operating system thread. +// Until the calling goroutine exits or calls UnlockOSThread, it will always +// execute in that thread, and no other goroutine can. +func LockOSThread() + +// UnlockOSThread unwires the calling goroutine from its fixed operating system thread. +// If the calling goroutine has not called LockOSThread, UnlockOSThread is a no-op. +func UnlockOSThread() + +// GOMAXPROCS sets the maximum number of CPUs that can be executing +// simultaneously and returns the previous setting. If n < 1, it does not +// change the current setting. +// The number of logical CPUs on the local machine can be queried with NumCPU. +// This call will go away when the scheduler improves. 
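A small illustrative sketch (not part of this patch) of the query-then-set pattern the GOMAXPROCS doc comment above describes: passing n < 1 reads the current value without changing it, and NumCPU reports how many logical CPUs are available.

package main

import (
	"fmt"
	"runtime"
)

func main() {
	// GOMAXPROCS(0) only reports the current setting.
	prev := runtime.GOMAXPROCS(0)
	fmt.Println("GOMAXPROCS was:", prev)

	// Use every logical CPU; in Go 1.4 the default is still 1,
	// so servers commonly set this explicitly at startup.
	runtime.GOMAXPROCS(runtime.NumCPU())
}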
+func GOMAXPROCS(n int) int { + if n > _MaxGomaxprocs { + n = _MaxGomaxprocs + } + lock(&sched.lock) + ret := int(gomaxprocs) + unlock(&sched.lock) + if n <= 0 || n == ret { + return ret + } + + semacquire(&worldsema, false) + gp := getg() + gp.m.gcing = 1 + onM(stoptheworld) + + // newprocs will be processed by starttheworld + newprocs = int32(n) + + gp.m.gcing = 0 + semrelease(&worldsema) + onM(starttheworld) + return ret +} + +// NumCPU returns the number of logical CPUs on the local machine. +func NumCPU() int { + return int(ncpu) +} + +// NumCgoCall returns the number of cgo calls made by the current process. +func NumCgoCall() int64 { + var n int64 + for mp := (*m)(atomicloadp(unsafe.Pointer(&allm))); mp != nil; mp = mp.alllink { + n += int64(mp.ncgocall) + } + return n +} + +// NumGoroutine returns the number of goroutines that currently exist. +func NumGoroutine() int { + return int(gcount()) +} + +func gcount() int32 diff --git a/src/runtime/debug/debug.s b/src/runtime/debug/debug.s new file mode 100644 index 000000000..a7292c477 --- /dev/null +++ b/src/runtime/debug/debug.s @@ -0,0 +1,9 @@ +// Copyright 2013 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// Nothing to see here. +// This file exists so that the go command knows that parts of the +// package are implemented in C, so that it does not instruct the +// Go compiler to complain about extern declarations. +// The actual implementations are in package runtime. diff --git a/src/runtime/debug/garbage.go b/src/runtime/debug/garbage.go new file mode 100644 index 000000000..4a77dcfcd --- /dev/null +++ b/src/runtime/debug/garbage.go @@ -0,0 +1,159 @@ +// Copyright 2013 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package debug + +import ( + "runtime" + "sort" + "time" +) + +// GCStats collect information about recent garbage collections. +type GCStats struct { + LastGC time.Time // time of last collection + NumGC int64 // number of garbage collections + PauseTotal time.Duration // total pause for all collections + Pause []time.Duration // pause history, most recent first + PauseEnd []time.Time // pause end times history, most recent first + PauseQuantiles []time.Duration +} + +// ReadGCStats reads statistics about garbage collection into stats. +// The number of entries in the pause history is system-dependent; +// stats.Pause slice will be reused if large enough, reallocated otherwise. +// ReadGCStats may use the full capacity of the stats.Pause slice. +// If stats.PauseQuantiles is non-empty, ReadGCStats fills it with quantiles +// summarizing the distribution of pause time. For example, if +// len(stats.PauseQuantiles) is 5, it will be filled with the minimum, +// 25%, 50%, 75%, and maximum pause times. +func ReadGCStats(stats *GCStats) { + // Create a buffer with space for at least two copies of the + // pause history tracked by the runtime. One will be returned + // to the caller and the other will be used as transfer buffer + // for end times history and as a temporary buffer for + // computing quantiles. 
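As an aside on the quantile behaviour described in the ReadGCStats doc comment above (a usage sketch only, not part of this patch): a caller that wants the minimum, 25%, 50%, 75%, and maximum pause times passes a five-element PauseQuantiles slice.

package main

import (
	"fmt"
	"runtime"
	"runtime/debug"
	"time"
)

func main() {
	// Force at least one collection so there is something to report.
	runtime.GC()

	var stats debug.GCStats
	// Five quantiles: min, 25%, 50%, 75%, max pause durations.
	stats.PauseQuantiles = make([]time.Duration, 5)
	debug.ReadGCStats(&stats)

	fmt.Println("collections:", stats.NumGC)
	fmt.Println("total pause:", stats.PauseTotal)
	fmt.Println("pause quantiles:", stats.PauseQuantiles)
}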
+ const maxPause = len(((*runtime.MemStats)(nil)).PauseNs) + if cap(stats.Pause) < 2*maxPause+3 { + stats.Pause = make([]time.Duration, 2*maxPause+3) + } + + // readGCStats fills in the pause and end times histories (up to + // maxPause entries) and then three more: Unix ns time of last GC, + // number of GC, and total pause time in nanoseconds. Here we + // depend on the fact that time.Duration's native unit is + // nanoseconds, so the pauses and the total pause time do not need + // any conversion. + readGCStats(&stats.Pause) + n := len(stats.Pause) - 3 + stats.LastGC = time.Unix(0, int64(stats.Pause[n])) + stats.NumGC = int64(stats.Pause[n+1]) + stats.PauseTotal = stats.Pause[n+2] + n /= 2 // buffer holds pauses and end times + stats.Pause = stats.Pause[:n] + + if cap(stats.PauseEnd) < maxPause { + stats.PauseEnd = make([]time.Time, 0, maxPause) + } + stats.PauseEnd = stats.PauseEnd[:0] + for _, ns := range stats.Pause[n : n+n] { + stats.PauseEnd = append(stats.PauseEnd, time.Unix(0, int64(ns))) + } + + if len(stats.PauseQuantiles) > 0 { + if n == 0 { + for i := range stats.PauseQuantiles { + stats.PauseQuantiles[i] = 0 + } + } else { + // There's room for a second copy of the data in stats.Pause. + // See the allocation at the top of the function. + sorted := stats.Pause[n : n+n] + copy(sorted, stats.Pause) + sort.Sort(byDuration(sorted)) + nq := len(stats.PauseQuantiles) - 1 + for i := 0; i < nq; i++ { + stats.PauseQuantiles[i] = sorted[len(sorted)*i/nq] + } + stats.PauseQuantiles[nq] = sorted[len(sorted)-1] + } + } +} + +type byDuration []time.Duration + +func (x byDuration) Len() int { return len(x) } +func (x byDuration) Swap(i, j int) { x[i], x[j] = x[j], x[i] } +func (x byDuration) Less(i, j int) bool { return x[i] < x[j] } + +// SetGCPercent sets the garbage collection target percentage: +// a collection is triggered when the ratio of freshly allocated data +// to live data remaining after the previous collection reaches this percentage. +// SetGCPercent returns the previous setting. +// The initial setting is the value of the GOGC environment variable +// at startup, or 100 if the variable is not set. +// A negative percentage disables garbage collection. +func SetGCPercent(percent int) int { + old := setGCPercent(int32(percent)) + runtime.GC() + return int(old) +} + +// FreeOSMemory forces a garbage collection followed by an +// attempt to return as much memory to the operating system +// as possible. (Even if this is not called, the runtime gradually +// returns memory to the operating system in a background task.) +func FreeOSMemory() { + freeOSMemory() +} + +// SetMaxStack sets the maximum amount of memory that +// can be used by a single goroutine stack. +// If any goroutine exceeds this limit while growing its stack, +// the program crashes. +// SetMaxStack returns the previous setting. +// The initial setting is 1 GB on 64-bit systems, 250 MB on 32-bit systems. +// +// SetMaxStack is useful mainly for limiting the damage done by +// goroutines that enter an infinite recursion. It only limits future +// stack growth. +func SetMaxStack(bytes int) int { + return setMaxStack(bytes) +} + +// SetMaxThreads sets the maximum number of operating system +// threads that the Go program can use. If it attempts to use more than +// this many, the program crashes. +// SetMaxThreads returns the previous setting. +// The initial setting is 10,000 threads. +// +// The limit controls the number of operating system threads, not the number +// of goroutines. 
A Go program creates a new thread only when a goroutine +// is ready to run but all the existing threads are blocked in system calls, cgo calls, +// or are locked to other goroutines due to use of runtime.LockOSThread. +// +// SetMaxThreads is useful mainly for limiting the damage done by +// programs that create an unbounded number of threads. The idea is +// to take down the program before it takes down the operating system. +func SetMaxThreads(threads int) int { + return setMaxThreads(threads) +} + +// SetPanicOnFault controls the runtime's behavior when a program faults +// at an unexpected (non-nil) address. Such faults are typically caused by +// bugs such as runtime memory corruption, so the default response is to crash +// the program. Programs working with memory-mapped files or unsafe +// manipulation of memory may cause faults at non-nil addresses in less +// dramatic situations; SetPanicOnFault allows such programs to request +// that the runtime trigger only a panic, not a crash. +// SetPanicOnFault applies only to the current goroutine. +// It returns the previous setting. +func SetPanicOnFault(enabled bool) bool { + return setPanicOnFault(enabled) +} + +// WriteHeapDump writes a description of the heap and the objects in +// it to the given file descriptor. +// The heap dump format is defined at http://golang.org/s/go13heapdump. +func WriteHeapDump(fd uintptr) diff --git a/src/runtime/debug/garbage_test.go b/src/runtime/debug/garbage_test.go new file mode 100644 index 000000000..54c33bd4f --- /dev/null +++ b/src/runtime/debug/garbage_test.go @@ -0,0 +1,115 @@ +// Copyright 2013 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package debug + +import ( + "runtime" + "testing" + "time" +) + +func TestReadGCStats(t *testing.T) { + defer SetGCPercent(SetGCPercent(-1)) + + var stats GCStats + var mstats runtime.MemStats + var min, max time.Duration + + // First ReadGCStats will allocate, second should not, + // especially if we follow up with an explicit garbage collection. + stats.PauseQuantiles = make([]time.Duration, 10) + ReadGCStats(&stats) + runtime.GC() + + // Assume these will return same data: no GC during ReadGCStats. 
+ ReadGCStats(&stats) + runtime.ReadMemStats(&mstats) + + if stats.NumGC != int64(mstats.NumGC) { + t.Errorf("stats.NumGC = %d, but mstats.NumGC = %d", stats.NumGC, mstats.NumGC) + } + if stats.PauseTotal != time.Duration(mstats.PauseTotalNs) { + t.Errorf("stats.PauseTotal = %d, but mstats.PauseTotalNs = %d", stats.PauseTotal, mstats.PauseTotalNs) + } + if stats.LastGC.UnixNano() != int64(mstats.LastGC) { + t.Errorf("stats.LastGC.UnixNano = %d, but mstats.LastGC = %d", stats.LastGC.UnixNano(), mstats.LastGC) + } + n := int(mstats.NumGC) + if n > len(mstats.PauseNs) { + n = len(mstats.PauseNs) + } + if len(stats.Pause) != n { + t.Errorf("len(stats.Pause) = %d, want %d", len(stats.Pause), n) + } else { + off := (int(mstats.NumGC) + len(mstats.PauseNs) - 1) % len(mstats.PauseNs) + for i := 0; i < n; i++ { + dt := stats.Pause[i] + if dt != time.Duration(mstats.PauseNs[off]) { + t.Errorf("stats.Pause[%d] = %d, want %d", i, dt, mstats.PauseNs[off]) + } + if max < dt { + max = dt + } + if min > dt || i == 0 { + min = dt + } + off = (off + len(mstats.PauseNs) - 1) % len(mstats.PauseNs) + } + } + + q := stats.PauseQuantiles + nq := len(q) + if q[0] != min || q[nq-1] != max { + t.Errorf("stats.PauseQuantiles = [%d, ..., %d], want [%d, ..., %d]", q[0], q[nq-1], min, max) + } + + for i := 0; i < nq-1; i++ { + if q[i] > q[i+1] { + t.Errorf("stats.PauseQuantiles[%d]=%d > stats.PauseQuantiles[%d]=%d", i, q[i], i+1, q[i+1]) + } + } + + // compare memory stats with gc stats: + if len(stats.PauseEnd) != n { + t.Fatalf("len(stats.PauseEnd) = %d, want %d", len(stats.PauseEnd), n) + } + off := (int(mstats.NumGC) + len(mstats.PauseEnd) - 1) % len(mstats.PauseEnd) + for i := 0; i < n; i++ { + dt := stats.PauseEnd[i] + if dt.UnixNano() != int64(mstats.PauseEnd[off]) { + t.Errorf("stats.PauseEnd[%d] = %d, want %d", i, dt, mstats.PauseEnd[off]) + } + off = (off + len(mstats.PauseEnd) - 1) % len(mstats.PauseEnd) + } +} + +var big = make([]byte, 1<<20) + +func TestFreeOSMemory(t *testing.T) { + var ms1, ms2 runtime.MemStats + + if big == nil { + t.Skip("test is not reliable when run multiple times") + } + big = nil + runtime.GC() + runtime.ReadMemStats(&ms1) + FreeOSMemory() + runtime.ReadMemStats(&ms2) + if ms1.HeapReleased >= ms2.HeapReleased { + t.Errorf("released before=%d; released after=%d; did not go up", ms1.HeapReleased, ms2.HeapReleased) + } +} + +func TestSetGCPercent(t *testing.T) { + // Test that the variable is being set and returned correctly. + // Assume the percentage itself is implemented fine during GC, + // which is harder to test. + old := SetGCPercent(123) + new := SetGCPercent(old) + if new != 123 { + t.Errorf("SetGCPercent(123); SetGCPercent(x) = %d, want 123", new) + } +} diff --git a/src/runtime/debug/heapdump_test.go b/src/runtime/debug/heapdump_test.go new file mode 100644 index 000000000..920190115 --- /dev/null +++ b/src/runtime/debug/heapdump_test.go @@ -0,0 +1,33 @@ +// Copyright 2014 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. 
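The garbage_test.go tests above exercise the debug package's GC and resource knobs individually; as a hedged sketch of how a program might combine them (all values are illustrative only, not part of this patch):

package main

import "runtime/debug"

func main() {
	// Trade some throughput for a smaller heap: collect when the heap has
	// grown 50% past the live data, instead of the default 100%.
	old := debug.SetGCPercent(50)
	defer debug.SetGCPercent(old)

	// Damage-limiting caps: exceeding either crashes the program.
	debug.SetMaxStack(64 << 20) // 64 MB per goroutine stack
	debug.SetMaxThreads(1000)   // at most 1000 OS threads

	// ... application work ...

	// Ask the runtime to return as much memory to the OS as possible.
	debug.FreeOSMemory()
}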
+ +package debug + +import ( + "io/ioutil" + "os" + "runtime" + "testing" +) + +func TestWriteHeapDumpNonempty(t *testing.T) { + if runtime.GOOS == "nacl" { + t.Skip("WriteHeapDump is not available on NaCl.") + } + f, err := ioutil.TempFile("", "heapdumptest") + if err != nil { + t.Fatalf("TempFile failed: %v", err) + } + defer os.Remove(f.Name()) + defer f.Close() + WriteHeapDump(f.Fd()) + fi, err := f.Stat() + if err != nil { + t.Fatalf("Stat failed: %v", err) + } + const minSize = 1 + if size := fi.Size(); size < minSize { + t.Fatalf("Heap dump size %d bytes, expected at least %d bytes", size, minSize) + } +} diff --git a/src/runtime/debug/stack.go b/src/runtime/debug/stack.go new file mode 100644 index 000000000..c29b0a226 --- /dev/null +++ b/src/runtime/debug/stack.go @@ -0,0 +1,98 @@ +// Copyright 2011 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// Package debug contains facilities for programs to debug themselves while +// they are running. +package debug + +import ( + "bytes" + "fmt" + "io/ioutil" + "os" + "runtime" +) + +var ( + dunno = []byte("???") + centerDot = []byte("·") + dot = []byte(".") + slash = []byte("/") +) + +// PrintStack prints to standard error the stack trace returned by Stack. +func PrintStack() { + os.Stderr.Write(stack()) +} + +// Stack returns a formatted stack trace of the goroutine that calls it. +// For each routine, it includes the source line information and PC value, +// then attempts to discover, for Go functions, the calling function or +// method and the text of the line containing the invocation. +// +// This function is deprecated. Use package runtime's Stack instead. +func Stack() []byte { + return stack() +} + +// stack implements Stack, skipping 2 frames +func stack() []byte { + buf := new(bytes.Buffer) // the returned data + // As we loop, we open files and read them. These variables record the currently + // loaded file. + var lines [][]byte + var lastFile string + for i := 2; ; i++ { // Caller we care about is the user, 2 frames up + pc, file, line, ok := runtime.Caller(i) + if !ok { + break + } + // Print this much at least. If we can't find the source, it won't show. + fmt.Fprintf(buf, "%s:%d (0x%x)\n", file, line, pc) + if file != lastFile { + data, err := ioutil.ReadFile(file) + if err != nil { + continue + } + lines = bytes.Split(data, []byte{'\n'}) + lastFile = file + } + line-- // in stack trace, lines are 1-indexed but our array is 0-indexed + fmt.Fprintf(buf, "\t%s: %s\n", function(pc), source(lines, line)) + } + return buf.Bytes() +} + +// source returns a space-trimmed slice of the n'th line. +func source(lines [][]byte, n int) []byte { + if n < 0 || n >= len(lines) { + return dunno + } + return bytes.Trim(lines[n], " \t") +} + +// function returns, if possible, the name of the function containing the PC. +func function(pc uintptr) []byte { + fn := runtime.FuncForPC(pc) + if fn == nil { + return dunno + } + name := []byte(fn.Name()) + // The name includes the path name to the package, which is unnecessary + // since the file name is already included. Plus, it has center dots. + // That is, we see + // runtime/debug.*T·ptrmethod + // and want + // *T.ptrmethod + // Since the package path might contains dots (e.g. code.google.com/...), + // we first remove the path prefix if there is one. 
+ if lastslash := bytes.LastIndex(name, slash); lastslash >= 0 { + name = name[lastslash+1:] + } + if period := bytes.Index(name, dot); period >= 0 { + name = name[period+1:] + } + name = bytes.Replace(name, centerDot, dot, -1) + return name +} diff --git a/src/runtime/debug/stack_test.go b/src/runtime/debug/stack_test.go new file mode 100644 index 000000000..28691ee4d --- /dev/null +++ b/src/runtime/debug/stack_test.go @@ -0,0 +1,62 @@ +// Copyright 2011 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package debug + +import ( + "strings" + "testing" +) + +type T int + +func (t *T) ptrmethod() []byte { + return Stack() +} +func (t T) method() []byte { + return t.ptrmethod() +} + +/* + The traceback should look something like this, modulo line numbers and hex constants. + Don't worry much about the base levels, but check the ones in our own package. + + /Users/r/go/src/runtime/debug/stack_test.go:15 (0x13878) + (*T).ptrmethod: return Stack() + /Users/r/go/src/runtime/debug/stack_test.go:18 (0x138dd) + T.method: return t.ptrmethod() + /Users/r/go/src/runtime/debug/stack_test.go:23 (0x13920) + TestStack: b := T(0).method() + /Users/r/go/src/testing/testing.go:132 (0x14a7a) + tRunner: test.F(t) + /Users/r/go/src/runtime/proc.c:145 (0xc970) + ???: runtime·unlock(&runtime·sched); +*/ +func TestStack(t *testing.T) { + b := T(0).method() + lines := strings.Split(string(b), "\n") + if len(lines) < 6 { + t.Fatal("too few lines") + } + n := 0 + frame := func(line, code string) { + check(t, lines[n], line) + n++ + // The source might not be available while running the test. + if strings.HasPrefix(lines[n], "\t") { + check(t, lines[n], code) + n++ + } + } + frame("src/runtime/debug/stack_test.go", "\t(*T).ptrmethod: return Stack()") + frame("src/runtime/debug/stack_test.go", "\tT.method: return t.ptrmethod()") + frame("src/runtime/debug/stack_test.go", "\tTestStack: b := T(0).method()") + frame("src/testing/testing.go", "") +} + +func check(t *testing.T, line, has string) { + if strings.Index(line, has) < 0 { + t.Errorf("expected %q in %q", has, line) + } +} diff --git a/src/runtime/debug/stubs.go b/src/runtime/debug/stubs.go new file mode 100644 index 000000000..8fba6cf34 --- /dev/null +++ b/src/runtime/debug/stubs.go @@ -0,0 +1,20 @@ +// Copyright 2014 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package debug + +import ( + "time" +) + +// Uses assembly to call corresponding runtime-internal functions. +func setMaxStack(int) int +func setGCPercent(int32) int32 +func setPanicOnFault(bool) bool +func setMaxThreads(int) int + +// Implemented in package runtime. +func readGCStats(*[]time.Duration) +func enableGC(bool) bool +func freeOSMemory() diff --git a/src/runtime/debug/stubs.s b/src/runtime/debug/stubs.s new file mode 100644 index 000000000..d56274f2d --- /dev/null +++ b/src/runtime/debug/stubs.s @@ -0,0 +1,21 @@ +// Copyright 2014 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. 
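A minimal sketch (illustrative, not part of this patch) of the pattern that stack.go above supports: capturing the formatted trace from inside a recover handler so a failure can be logged with full context.

package main

import (
	"log"
	"runtime/debug"
)

func main() {
	defer func() {
		if r := recover(); r != nil {
			// debug.Stack returns the trace formatted by stack.go:
			// file:line, PC, and the source line when it can be read.
			log.Printf("recovered: %v\n%s", r, debug.Stack())
		}
	}()

	var p *int
	_ = *p // deliberately panic for the sketch
}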
+ +#include "textflag.h" + +#ifdef GOARCH_arm +#define JMP B +#endif + +TEXT ·setMaxStack(SB),NOSPLIT,$0-0 + JMP runtime·setMaxStack(SB) + +TEXT ·setGCPercent(SB),NOSPLIT,$0-0 + JMP runtime·setGCPercent(SB) + +TEXT ·setPanicOnFault(SB),NOSPLIT,$0-0 + JMP runtime·setPanicOnFault(SB) + +TEXT ·setMaxThreads(SB),NOSPLIT,$0-0 + JMP runtime·setMaxThreads(SB) diff --git a/src/runtime/defs.c b/src/runtime/defs.c new file mode 100644 index 000000000..b0a9b20d7 --- /dev/null +++ b/src/runtime/defs.c @@ -0,0 +1,15 @@ +// Copyright 2013 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// This file is compiled by cmd/dist to obtain debug information +// about the given header files. + +#include "runtime.h" +#include "arch_GOARCH.h" +#include "malloc.h" +#include "type.h" +#include "race.h" +#include "chan.h" +#include "defs_GOOS_GOARCH.h" +#include "os_GOOS.h" diff --git a/src/runtime/defs1_linux.go b/src/runtime/defs1_linux.go new file mode 100644 index 000000000..392cc4ab5 --- /dev/null +++ b/src/runtime/defs1_linux.go @@ -0,0 +1,37 @@ +// Copyright 2009 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// +build ignore + +/* +Input to cgo -cdefs + +GOARCH=amd64 cgo -cdefs defs.go defs1.go >amd64/defs.h +*/ + +package runtime + +/* +#include <ucontext.h> +#include <fcntl.h> +*/ +import "C" + +const ( + O_RDONLY = C.O_RDONLY + O_CLOEXEC = C.O_CLOEXEC +) + +type Usigset C.__sigset_t +type Fpxreg C.struct__libc_fpxreg +type Xmmreg C.struct__libc_xmmreg +type Fpstate C.struct__libc_fpstate +type Fpxreg1 C.struct__fpxreg +type Xmmreg1 C.struct__xmmreg +type Fpstate1 C.struct__fpstate +type Fpreg1 C.struct__fpreg +type SigaltstackT C.struct_sigaltstack +type Mcontext C.mcontext_t +type Ucontext C.ucontext_t +type Sigcontext C.struct_sigcontext diff --git a/src/runtime/defs2_linux.go b/src/runtime/defs2_linux.go new file mode 100644 index 000000000..980df9ec3 --- /dev/null +++ b/src/runtime/defs2_linux.go @@ -0,0 +1,146 @@ +// Copyright 2009 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// +build ignore + +/* + * Input to cgo -cdefs + +GOARCH=386 go tool cgo -cdefs defs2_linux.go >defs_linux_386.h + +The asm header tricks we have to use for Linux on amd64 +(see defs.c and defs1.c) don't work here, so this is yet another +file. Sigh. +*/ + +package runtime + +/* +#cgo CFLAGS: -I/tmp/linux/arch/x86/include -I/tmp/linux/include -D_LOOSE_KERNEL_NAMES -D__ARCH_SI_UID_T=__kernel_uid32_t + +#define size_t __kernel_size_t +#define pid_t int +#include <asm/signal.h> +#include <asm/mman.h> +#include <asm/sigcontext.h> +#include <asm/ucontext.h> +#include <asm/siginfo.h> +#include <asm-generic/errno.h> +#include <asm-generic/fcntl.h> +#include <asm-generic/poll.h> +#include <linux/eventpoll.h> + +// This is the sigaction structure from the Linux 2.1.68 kernel which +// is used with the rt_sigaction system call. For 386 this is not +// defined in any public header file. 
+ +struct kernel_sigaction { + __sighandler_t k_sa_handler; + unsigned long sa_flags; + void (*sa_restorer) (void); + unsigned long long sa_mask; +}; +*/ +import "C" + +const ( + EINTR = C.EINTR + EAGAIN = C.EAGAIN + ENOMEM = C.ENOMEM + + PROT_NONE = C.PROT_NONE + PROT_READ = C.PROT_READ + PROT_WRITE = C.PROT_WRITE + PROT_EXEC = C.PROT_EXEC + + MAP_ANON = C.MAP_ANONYMOUS + MAP_PRIVATE = C.MAP_PRIVATE + MAP_FIXED = C.MAP_FIXED + + MADV_DONTNEED = C.MADV_DONTNEED + + SA_RESTART = C.SA_RESTART + SA_ONSTACK = C.SA_ONSTACK + SA_RESTORER = C.SA_RESTORER + SA_SIGINFO = C.SA_SIGINFO + + SIGHUP = C.SIGHUP + SIGINT = C.SIGINT + SIGQUIT = C.SIGQUIT + SIGILL = C.SIGILL + SIGTRAP = C.SIGTRAP + SIGABRT = C.SIGABRT + SIGBUS = C.SIGBUS + SIGFPE = C.SIGFPE + SIGKILL = C.SIGKILL + SIGUSR1 = C.SIGUSR1 + SIGSEGV = C.SIGSEGV + SIGUSR2 = C.SIGUSR2 + SIGPIPE = C.SIGPIPE + SIGALRM = C.SIGALRM + SIGSTKFLT = C.SIGSTKFLT + SIGCHLD = C.SIGCHLD + SIGCONT = C.SIGCONT + SIGSTOP = C.SIGSTOP + SIGTSTP = C.SIGTSTP + SIGTTIN = C.SIGTTIN + SIGTTOU = C.SIGTTOU + SIGURG = C.SIGURG + SIGXCPU = C.SIGXCPU + SIGXFSZ = C.SIGXFSZ + SIGVTALRM = C.SIGVTALRM + SIGPROF = C.SIGPROF + SIGWINCH = C.SIGWINCH + SIGIO = C.SIGIO + SIGPWR = C.SIGPWR + SIGSYS = C.SIGSYS + + FPE_INTDIV = C.FPE_INTDIV + FPE_INTOVF = C.FPE_INTOVF + FPE_FLTDIV = C.FPE_FLTDIV + FPE_FLTOVF = C.FPE_FLTOVF + FPE_FLTUND = C.FPE_FLTUND + FPE_FLTRES = C.FPE_FLTRES + FPE_FLTINV = C.FPE_FLTINV + FPE_FLTSUB = C.FPE_FLTSUB + + BUS_ADRALN = C.BUS_ADRALN + BUS_ADRERR = C.BUS_ADRERR + BUS_OBJERR = C.BUS_OBJERR + + SEGV_MAPERR = C.SEGV_MAPERR + SEGV_ACCERR = C.SEGV_ACCERR + + ITIMER_REAL = C.ITIMER_REAL + ITIMER_VIRTUAL = C.ITIMER_VIRTUAL + ITIMER_PROF = C.ITIMER_PROF + + O_RDONLY = C.O_RDONLY + O_CLOEXEC = C.O_CLOEXEC + + EPOLLIN = C.POLLIN + EPOLLOUT = C.POLLOUT + EPOLLERR = C.POLLERR + EPOLLHUP = C.POLLHUP + EPOLLRDHUP = C.POLLRDHUP + EPOLLET = C.EPOLLET + EPOLL_CLOEXEC = C.EPOLL_CLOEXEC + EPOLL_CTL_ADD = C.EPOLL_CTL_ADD + EPOLL_CTL_DEL = C.EPOLL_CTL_DEL + EPOLL_CTL_MOD = C.EPOLL_CTL_MOD +) + +type Fpreg C.struct__fpreg +type Fpxreg C.struct__fpxreg +type Xmmreg C.struct__xmmreg +type Fpstate C.struct__fpstate +type Timespec C.struct_timespec +type Timeval C.struct_timeval +type Sigaction C.struct_kernel_sigaction +type Siginfo C.siginfo_t +type SigaltstackT C.struct_sigaltstack +type Sigcontext C.struct_sigcontext +type Ucontext C.struct_ucontext +type Itimerval C.struct_itimerval +type EpollEvent C.struct_epoll_event diff --git a/src/runtime/defs_android_arm.h b/src/runtime/defs_android_arm.h new file mode 100644 index 000000000..3611b3a10 --- /dev/null +++ b/src/runtime/defs_android_arm.h @@ -0,0 +1,3 @@ +// TODO: Generate using cgo like defs_linux_{386,amd64}.h + +#include "defs_linux_arm.h" diff --git a/src/runtime/defs_arm_linux.go b/src/runtime/defs_arm_linux.go new file mode 100644 index 000000000..afd6897e3 --- /dev/null +++ b/src/runtime/defs_arm_linux.go @@ -0,0 +1,124 @@ +// Copyright 2009 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// +build ignore + +/* +Input to cgo. 
+On a Debian Lenny arm linux distribution: + +cgo -cdefs defs_arm.c >arm/defs.h +*/ + +package runtime + +/* +#cgo CFLAGS: -I/usr/src/linux-headers-2.6.26-2-versatile/include + +#define __ARCH_SI_UID_T int +#include <asm/signal.h> +#include <asm/mman.h> +#include <asm/sigcontext.h> +#include <asm/ucontext.h> +#include <asm/siginfo.h> +#include <linux/time.h> + +struct xsiginfo { + int si_signo; + int si_errno; + int si_code; + char _sifields[4]; +}; + +#undef sa_handler +#undef sa_flags +#undef sa_restorer +#undef sa_mask + +struct xsigaction { + void (*sa_handler)(void); + unsigned long sa_flags; + void (*sa_restorer)(void); + unsigned int sa_mask; // mask last for extensibility +}; +*/ +import "C" + +const ( + PROT_NONE = C.PROT_NONE + PROT_READ = C.PROT_READ + PROT_WRITE = C.PROT_WRITE + PROT_EXEC = C.PROT_EXEC + + MAP_ANON = C.MAP_ANONYMOUS + MAP_PRIVATE = C.MAP_PRIVATE + MAP_FIXED = C.MAP_FIXED + + MADV_DONTNEED = C.MADV_DONTNEED + + SA_RESTART = C.SA_RESTART + SA_ONSTACK = C.SA_ONSTACK + SA_RESTORER = C.SA_RESTORER + SA_SIGINFO = C.SA_SIGINFO + + SIGHUP = C.SIGHUP + SIGINT = C.SIGINT + SIGQUIT = C.SIGQUIT + SIGILL = C.SIGILL + SIGTRAP = C.SIGTRAP + SIGABRT = C.SIGABRT + SIGBUS = C.SIGBUS + SIGFPE = C.SIGFPE + SIGKILL = C.SIGKILL + SIGUSR1 = C.SIGUSR1 + SIGSEGV = C.SIGSEGV + SIGUSR2 = C.SIGUSR2 + SIGPIPE = C.SIGPIPE + SIGALRM = C.SIGALRM + SIGSTKFLT = C.SIGSTKFLT + SIGCHLD = C.SIGCHLD + SIGCONT = C.SIGCONT + SIGSTOP = C.SIGSTOP + SIGTSTP = C.SIGTSTP + SIGTTIN = C.SIGTTIN + SIGTTOU = C.SIGTTOU + SIGURG = C.SIGURG + SIGXCPU = C.SIGXCPU + SIGXFSZ = C.SIGXFSZ + SIGVTALRM = C.SIGVTALRM + SIGPROF = C.SIGPROF + SIGWINCH = C.SIGWINCH + SIGIO = C.SIGIO + SIGPWR = C.SIGPWR + SIGSYS = C.SIGSYS + + FPE_INTDIV = C.FPE_INTDIV & 0xFFFF + FPE_INTOVF = C.FPE_INTOVF & 0xFFFF + FPE_FLTDIV = C.FPE_FLTDIV & 0xFFFF + FPE_FLTOVF = C.FPE_FLTOVF & 0xFFFF + FPE_FLTUND = C.FPE_FLTUND & 0xFFFF + FPE_FLTRES = C.FPE_FLTRES & 0xFFFF + FPE_FLTINV = C.FPE_FLTINV & 0xFFFF + FPE_FLTSUB = C.FPE_FLTSUB & 0xFFFF + + BUS_ADRALN = C.BUS_ADRALN & 0xFFFF + BUS_ADRERR = C.BUS_ADRERR & 0xFFFF + BUS_OBJERR = C.BUS_OBJERR & 0xFFFF + + SEGV_MAPERR = C.SEGV_MAPERR & 0xFFFF + SEGV_ACCERR = C.SEGV_ACCERR & 0xFFFF + + ITIMER_REAL = C.ITIMER_REAL + ITIMER_PROF = C.ITIMER_PROF + ITIMER_VIRTUAL = C.ITIMER_VIRTUAL +) + +type Timespec C.struct_timespec +type SigaltstackT C.struct_sigaltstack +type Sigcontext C.struct_sigcontext +type Ucontext C.struct_ucontext +type Timeval C.struct_timeval +type Itimerval C.struct_itimerval +type Siginfo C.struct_xsiginfo +type Sigaction C.struct_xsigaction diff --git a/src/runtime/defs_darwin.go b/src/runtime/defs_darwin.go new file mode 100644 index 000000000..722013ba9 --- /dev/null +++ b/src/runtime/defs_darwin.go @@ -0,0 +1,179 @@ +// Copyright 2009 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// +build ignore + +/* +Input to cgo. 
+ +GOARCH=amd64 go tool cgo -cdefs defs_darwin.go >defs_darwin_amd64.h +GOARCH=386 go tool cgo -cdefs defs_darwin.go >defs_darwin_386.h +*/ + +package runtime + +/* +#define __DARWIN_UNIX03 0 +#include <mach/mach.h> +#include <mach/message.h> +#include <sys/types.h> +#include <sys/time.h> +#include <errno.h> +#include <signal.h> +#include <sys/event.h> +#include <sys/mman.h> +*/ +import "C" + +const ( + EINTR = C.EINTR + EFAULT = C.EFAULT + + PROT_NONE = C.PROT_NONE + PROT_READ = C.PROT_READ + PROT_WRITE = C.PROT_WRITE + PROT_EXEC = C.PROT_EXEC + + MAP_ANON = C.MAP_ANON + MAP_PRIVATE = C.MAP_PRIVATE + MAP_FIXED = C.MAP_FIXED + + MADV_DONTNEED = C.MADV_DONTNEED + MADV_FREE = C.MADV_FREE + + MACH_MSG_TYPE_MOVE_RECEIVE = C.MACH_MSG_TYPE_MOVE_RECEIVE + MACH_MSG_TYPE_MOVE_SEND = C.MACH_MSG_TYPE_MOVE_SEND + MACH_MSG_TYPE_MOVE_SEND_ONCE = C.MACH_MSG_TYPE_MOVE_SEND_ONCE + MACH_MSG_TYPE_COPY_SEND = C.MACH_MSG_TYPE_COPY_SEND + MACH_MSG_TYPE_MAKE_SEND = C.MACH_MSG_TYPE_MAKE_SEND + MACH_MSG_TYPE_MAKE_SEND_ONCE = C.MACH_MSG_TYPE_MAKE_SEND_ONCE + MACH_MSG_TYPE_COPY_RECEIVE = C.MACH_MSG_TYPE_COPY_RECEIVE + + MACH_MSG_PORT_DESCRIPTOR = C.MACH_MSG_PORT_DESCRIPTOR + MACH_MSG_OOL_DESCRIPTOR = C.MACH_MSG_OOL_DESCRIPTOR + MACH_MSG_OOL_PORTS_DESCRIPTOR = C.MACH_MSG_OOL_PORTS_DESCRIPTOR + MACH_MSG_OOL_VOLATILE_DESCRIPTOR = C.MACH_MSG_OOL_VOLATILE_DESCRIPTOR + + MACH_MSGH_BITS_COMPLEX = C.MACH_MSGH_BITS_COMPLEX + + MACH_SEND_MSG = C.MACH_SEND_MSG + MACH_RCV_MSG = C.MACH_RCV_MSG + MACH_RCV_LARGE = C.MACH_RCV_LARGE + + MACH_SEND_TIMEOUT = C.MACH_SEND_TIMEOUT + MACH_SEND_INTERRUPT = C.MACH_SEND_INTERRUPT + MACH_SEND_ALWAYS = C.MACH_SEND_ALWAYS + MACH_SEND_TRAILER = C.MACH_SEND_TRAILER + MACH_RCV_TIMEOUT = C.MACH_RCV_TIMEOUT + MACH_RCV_NOTIFY = C.MACH_RCV_NOTIFY + MACH_RCV_INTERRUPT = C.MACH_RCV_INTERRUPT + MACH_RCV_OVERWRITE = C.MACH_RCV_OVERWRITE + + NDR_PROTOCOL_2_0 = C.NDR_PROTOCOL_2_0 + NDR_INT_BIG_ENDIAN = C.NDR_INT_BIG_ENDIAN + NDR_INT_LITTLE_ENDIAN = C.NDR_INT_LITTLE_ENDIAN + NDR_FLOAT_IEEE = C.NDR_FLOAT_IEEE + NDR_CHAR_ASCII = C.NDR_CHAR_ASCII + + SA_SIGINFO = C.SA_SIGINFO + SA_RESTART = C.SA_RESTART + SA_ONSTACK = C.SA_ONSTACK + SA_USERTRAMP = C.SA_USERTRAMP + SA_64REGSET = C.SA_64REGSET + + SIGHUP = C.SIGHUP + SIGINT = C.SIGINT + SIGQUIT = C.SIGQUIT + SIGILL = C.SIGILL + SIGTRAP = C.SIGTRAP + SIGABRT = C.SIGABRT + SIGEMT = C.SIGEMT + SIGFPE = C.SIGFPE + SIGKILL = C.SIGKILL + SIGBUS = C.SIGBUS + SIGSEGV = C.SIGSEGV + SIGSYS = C.SIGSYS + SIGPIPE = C.SIGPIPE + SIGALRM = C.SIGALRM + SIGTERM = C.SIGTERM + SIGURG = C.SIGURG + SIGSTOP = C.SIGSTOP + SIGTSTP = C.SIGTSTP + SIGCONT = C.SIGCONT + SIGCHLD = C.SIGCHLD + SIGTTIN = C.SIGTTIN + SIGTTOU = C.SIGTTOU + SIGIO = C.SIGIO + SIGXCPU = C.SIGXCPU + SIGXFSZ = C.SIGXFSZ + SIGVTALRM = C.SIGVTALRM + SIGPROF = C.SIGPROF + SIGWINCH = C.SIGWINCH + SIGINFO = C.SIGINFO + SIGUSR1 = C.SIGUSR1 + SIGUSR2 = C.SIGUSR2 + + FPE_INTDIV = C.FPE_INTDIV + FPE_INTOVF = C.FPE_INTOVF + FPE_FLTDIV = C.FPE_FLTDIV + FPE_FLTOVF = C.FPE_FLTOVF + FPE_FLTUND = C.FPE_FLTUND + FPE_FLTRES = C.FPE_FLTRES + FPE_FLTINV = C.FPE_FLTINV + FPE_FLTSUB = C.FPE_FLTSUB + + BUS_ADRALN = C.BUS_ADRALN + BUS_ADRERR = C.BUS_ADRERR + BUS_OBJERR = C.BUS_OBJERR + + SEGV_MAPERR = C.SEGV_MAPERR + SEGV_ACCERR = C.SEGV_ACCERR + + ITIMER_REAL = C.ITIMER_REAL + ITIMER_VIRTUAL = C.ITIMER_VIRTUAL + ITIMER_PROF = C.ITIMER_PROF + + EV_ADD = C.EV_ADD + EV_DELETE = C.EV_DELETE + EV_CLEAR = C.EV_CLEAR + EV_RECEIPT = C.EV_RECEIPT + EV_ERROR = C.EV_ERROR + EVFILT_READ = C.EVFILT_READ + EVFILT_WRITE = C.EVFILT_WRITE +) + +type 
MachBody C.mach_msg_body_t +type MachHeader C.mach_msg_header_t +type MachNDR C.NDR_record_t +type MachPort C.mach_msg_port_descriptor_t + +type StackT C.struct_sigaltstack +type Sighandler C.union___sigaction_u + +type Sigaction C.struct___sigaction // used in syscalls +// type Sigaction C.struct_sigaction // used by the C library +type Sigval C.union_sigval +type Siginfo C.siginfo_t +type Timeval C.struct_timeval +type Itimerval C.struct_itimerval +type Timespec C.struct_timespec + +type FPControl C.struct_fp_control +type FPStatus C.struct_fp_status +type RegMMST C.struct_mmst_reg +type RegXMM C.struct_xmm_reg + +type Regs64 C.struct_x86_thread_state64 +type FloatState64 C.struct_x86_float_state64 +type ExceptionState64 C.struct_x86_exception_state64 +type Mcontext64 C.struct_mcontext64 + +type Regs32 C.struct_i386_thread_state +type FloatState32 C.struct_i386_float_state +type ExceptionState32 C.struct_i386_exception_state +type Mcontext32 C.struct_mcontext32 + +type Ucontext C.struct_ucontext + +type Kevent C.struct_kevent diff --git a/src/runtime/defs_darwin_386.h b/src/runtime/defs_darwin_386.h new file mode 100644 index 000000000..0e0b4fbf7 --- /dev/null +++ b/src/runtime/defs_darwin_386.h @@ -0,0 +1,392 @@ +// Created by cgo -cdefs - DO NOT EDIT +// cgo -cdefs defs_darwin.go + + +enum { + EINTR = 0x4, + EFAULT = 0xe, + + PROT_NONE = 0x0, + PROT_READ = 0x1, + PROT_WRITE = 0x2, + PROT_EXEC = 0x4, + + MAP_ANON = 0x1000, + MAP_PRIVATE = 0x2, + MAP_FIXED = 0x10, + + MADV_DONTNEED = 0x4, + MADV_FREE = 0x5, + + MACH_MSG_TYPE_MOVE_RECEIVE = 0x10, + MACH_MSG_TYPE_MOVE_SEND = 0x11, + MACH_MSG_TYPE_MOVE_SEND_ONCE = 0x12, + MACH_MSG_TYPE_COPY_SEND = 0x13, + MACH_MSG_TYPE_MAKE_SEND = 0x14, + MACH_MSG_TYPE_MAKE_SEND_ONCE = 0x15, + MACH_MSG_TYPE_COPY_RECEIVE = 0x16, + + MACH_MSG_PORT_DESCRIPTOR = 0x0, + MACH_MSG_OOL_DESCRIPTOR = 0x1, + MACH_MSG_OOL_PORTS_DESCRIPTOR = 0x2, + MACH_MSG_OOL_VOLATILE_DESCRIPTOR = 0x3, + + MACH_MSGH_BITS_COMPLEX = 0x80000000, + + MACH_SEND_MSG = 0x1, + MACH_RCV_MSG = 0x2, + MACH_RCV_LARGE = 0x4, + + MACH_SEND_TIMEOUT = 0x10, + MACH_SEND_INTERRUPT = 0x40, + MACH_SEND_ALWAYS = 0x10000, + MACH_SEND_TRAILER = 0x20000, + MACH_RCV_TIMEOUT = 0x100, + MACH_RCV_NOTIFY = 0x200, + MACH_RCV_INTERRUPT = 0x400, + MACH_RCV_OVERWRITE = 0x1000, + + NDR_PROTOCOL_2_0 = 0x0, + NDR_INT_BIG_ENDIAN = 0x0, + NDR_INT_LITTLE_ENDIAN = 0x1, + NDR_FLOAT_IEEE = 0x0, + NDR_CHAR_ASCII = 0x0, + + SA_SIGINFO = 0x40, + SA_RESTART = 0x2, + SA_ONSTACK = 0x1, + SA_USERTRAMP = 0x100, + SA_64REGSET = 0x200, + + SIGHUP = 0x1, + SIGINT = 0x2, + SIGQUIT = 0x3, + SIGILL = 0x4, + SIGTRAP = 0x5, + SIGABRT = 0x6, + SIGEMT = 0x7, + SIGFPE = 0x8, + SIGKILL = 0x9, + SIGBUS = 0xa, + SIGSEGV = 0xb, + SIGSYS = 0xc, + SIGPIPE = 0xd, + SIGALRM = 0xe, + SIGTERM = 0xf, + SIGURG = 0x10, + SIGSTOP = 0x11, + SIGTSTP = 0x12, + SIGCONT = 0x13, + SIGCHLD = 0x14, + SIGTTIN = 0x15, + SIGTTOU = 0x16, + SIGIO = 0x17, + SIGXCPU = 0x18, + SIGXFSZ = 0x19, + SIGVTALRM = 0x1a, + SIGPROF = 0x1b, + SIGWINCH = 0x1c, + SIGINFO = 0x1d, + SIGUSR1 = 0x1e, + SIGUSR2 = 0x1f, + + FPE_INTDIV = 0x7, + FPE_INTOVF = 0x8, + FPE_FLTDIV = 0x1, + FPE_FLTOVF = 0x2, + FPE_FLTUND = 0x3, + FPE_FLTRES = 0x4, + FPE_FLTINV = 0x5, + FPE_FLTSUB = 0x6, + + BUS_ADRALN = 0x1, + BUS_ADRERR = 0x2, + BUS_OBJERR = 0x3, + + SEGV_MAPERR = 0x1, + SEGV_ACCERR = 0x2, + + ITIMER_REAL = 0x0, + ITIMER_VIRTUAL = 0x1, + ITIMER_PROF = 0x2, + + EV_ADD = 0x1, + EV_DELETE = 0x2, + EV_CLEAR = 0x20, + EV_RECEIPT = 0x40, + EV_ERROR = 0x4000, + EVFILT_READ = -0x1, + EVFILT_WRITE = -0x2, 
+}; + +typedef struct MachBody MachBody; +typedef struct MachHeader MachHeader; +typedef struct MachNDR MachNDR; +typedef struct MachPort MachPort; +typedef struct StackT StackT; +typedef struct SigactionT SigactionT; +typedef struct Siginfo Siginfo; +typedef struct Timeval Timeval; +typedef struct Itimerval Itimerval; +typedef struct Timespec Timespec; +typedef struct FPControl FPControl; +typedef struct FPStatus FPStatus; +typedef struct RegMMST RegMMST; +typedef struct RegXMM RegXMM; +typedef struct Regs64 Regs64; +typedef struct FloatState64 FloatState64; +typedef struct ExceptionState64 ExceptionState64; +typedef struct Mcontext64 Mcontext64; +typedef struct Regs32 Regs32; +typedef struct FloatState32 FloatState32; +typedef struct ExceptionState32 ExceptionState32; +typedef struct Mcontext32 Mcontext32; +typedef struct Ucontext Ucontext; +typedef struct KeventT KeventT; + +#pragma pack on + +struct MachBody { + uint32 msgh_descriptor_count; +}; +struct MachHeader { + uint32 msgh_bits; + uint32 msgh_size; + uint32 msgh_remote_port; + uint32 msgh_local_port; + uint32 msgh_reserved; + int32 msgh_id; +}; +struct MachNDR { + uint8 mig_vers; + uint8 if_vers; + uint8 reserved1; + uint8 mig_encoding; + uint8 int_rep; + uint8 char_rep; + uint8 float_rep; + uint8 reserved2; +}; +struct MachPort { + uint32 name; + uint32 pad1; + uint16 pad2; + uint8 disposition; + uint8 type; +}; + +struct StackT { + byte *ss_sp; + uint32 ss_size; + int32 ss_flags; +}; +typedef byte Sighandler[4]; + +struct SigactionT { + byte __sigaction_u[4]; + void *sa_tramp; + uint32 sa_mask; + int32 sa_flags; +}; + +typedef byte Sigval[4]; +struct Siginfo { + int32 si_signo; + int32 si_errno; + int32 si_code; + int32 si_pid; + uint32 si_uid; + int32 si_status; + byte *si_addr; + byte si_value[4]; + int32 si_band; + uint32 __pad[7]; +}; +struct Timeval { + int32 tv_sec; + int32 tv_usec; +}; +struct Itimerval { + Timeval it_interval; + Timeval it_value; +}; +struct Timespec { + int32 tv_sec; + int32 tv_nsec; +}; + +struct FPControl { + byte Pad_cgo_0[2]; +}; +struct FPStatus { + byte Pad_cgo_0[2]; +}; +struct RegMMST { + int8 mmst_reg[10]; + int8 mmst_rsrv[6]; +}; +struct RegXMM { + int8 xmm_reg[16]; +}; + +struct Regs64 { + uint64 rax; + uint64 rbx; + uint64 rcx; + uint64 rdx; + uint64 rdi; + uint64 rsi; + uint64 rbp; + uint64 rsp; + uint64 r8; + uint64 r9; + uint64 r10; + uint64 r11; + uint64 r12; + uint64 r13; + uint64 r14; + uint64 r15; + uint64 rip; + uint64 rflags; + uint64 cs; + uint64 fs; + uint64 gs; +}; +struct FloatState64 { + int32 fpu_reserved[2]; + FPControl fpu_fcw; + FPStatus fpu_fsw; + uint8 fpu_ftw; + uint8 fpu_rsrv1; + uint16 fpu_fop; + uint32 fpu_ip; + uint16 fpu_cs; + uint16 fpu_rsrv2; + uint32 fpu_dp; + uint16 fpu_ds; + uint16 fpu_rsrv3; + uint32 fpu_mxcsr; + uint32 fpu_mxcsrmask; + RegMMST fpu_stmm0; + RegMMST fpu_stmm1; + RegMMST fpu_stmm2; + RegMMST fpu_stmm3; + RegMMST fpu_stmm4; + RegMMST fpu_stmm5; + RegMMST fpu_stmm6; + RegMMST fpu_stmm7; + RegXMM fpu_xmm0; + RegXMM fpu_xmm1; + RegXMM fpu_xmm2; + RegXMM fpu_xmm3; + RegXMM fpu_xmm4; + RegXMM fpu_xmm5; + RegXMM fpu_xmm6; + RegXMM fpu_xmm7; + RegXMM fpu_xmm8; + RegXMM fpu_xmm9; + RegXMM fpu_xmm10; + RegXMM fpu_xmm11; + RegXMM fpu_xmm12; + RegXMM fpu_xmm13; + RegXMM fpu_xmm14; + RegXMM fpu_xmm15; + int8 fpu_rsrv4[96]; + int32 fpu_reserved1; +}; +struct ExceptionState64 { + uint16 trapno; + uint16 cpu; + uint32 err; + uint64 faultvaddr; +}; +struct Mcontext64 { + ExceptionState64 es; + Regs64 ss; + FloatState64 fs; +}; + +struct Regs32 { + uint32 eax; 
+ uint32 ebx; + uint32 ecx; + uint32 edx; + uint32 edi; + uint32 esi; + uint32 ebp; + uint32 esp; + uint32 ss; + uint32 eflags; + uint32 eip; + uint32 cs; + uint32 ds; + uint32 es; + uint32 fs; + uint32 gs; +}; +struct FloatState32 { + int32 fpu_reserved[2]; + FPControl fpu_fcw; + FPStatus fpu_fsw; + uint8 fpu_ftw; + uint8 fpu_rsrv1; + uint16 fpu_fop; + uint32 fpu_ip; + uint16 fpu_cs; + uint16 fpu_rsrv2; + uint32 fpu_dp; + uint16 fpu_ds; + uint16 fpu_rsrv3; + uint32 fpu_mxcsr; + uint32 fpu_mxcsrmask; + RegMMST fpu_stmm0; + RegMMST fpu_stmm1; + RegMMST fpu_stmm2; + RegMMST fpu_stmm3; + RegMMST fpu_stmm4; + RegMMST fpu_stmm5; + RegMMST fpu_stmm6; + RegMMST fpu_stmm7; + RegXMM fpu_xmm0; + RegXMM fpu_xmm1; + RegXMM fpu_xmm2; + RegXMM fpu_xmm3; + RegXMM fpu_xmm4; + RegXMM fpu_xmm5; + RegXMM fpu_xmm6; + RegXMM fpu_xmm7; + int8 fpu_rsrv4[224]; + int32 fpu_reserved1; +}; +struct ExceptionState32 { + uint16 trapno; + uint16 cpu; + uint32 err; + uint32 faultvaddr; +}; +struct Mcontext32 { + ExceptionState32 es; + Regs32 ss; + FloatState32 fs; +}; + +struct Ucontext { + int32 uc_onstack; + uint32 uc_sigmask; + StackT uc_stack; + Ucontext *uc_link; + uint32 uc_mcsize; + Mcontext32 *uc_mcontext; +}; + +struct KeventT { + uint32 ident; + int16 filter; + uint16 flags; + uint32 fflags; + int32 data; + byte *udata; +}; + + +#pragma pack off diff --git a/src/runtime/defs_darwin_amd64.h b/src/runtime/defs_darwin_amd64.h new file mode 100644 index 000000000..4bf83c1cb --- /dev/null +++ b/src/runtime/defs_darwin_amd64.h @@ -0,0 +1,395 @@ +// Created by cgo -cdefs - DO NOT EDIT +// cgo -cdefs defs_darwin.go + + +enum { + EINTR = 0x4, + EFAULT = 0xe, + + PROT_NONE = 0x0, + PROT_READ = 0x1, + PROT_WRITE = 0x2, + PROT_EXEC = 0x4, + + MAP_ANON = 0x1000, + MAP_PRIVATE = 0x2, + MAP_FIXED = 0x10, + + MADV_DONTNEED = 0x4, + MADV_FREE = 0x5, + + MACH_MSG_TYPE_MOVE_RECEIVE = 0x10, + MACH_MSG_TYPE_MOVE_SEND = 0x11, + MACH_MSG_TYPE_MOVE_SEND_ONCE = 0x12, + MACH_MSG_TYPE_COPY_SEND = 0x13, + MACH_MSG_TYPE_MAKE_SEND = 0x14, + MACH_MSG_TYPE_MAKE_SEND_ONCE = 0x15, + MACH_MSG_TYPE_COPY_RECEIVE = 0x16, + + MACH_MSG_PORT_DESCRIPTOR = 0x0, + MACH_MSG_OOL_DESCRIPTOR = 0x1, + MACH_MSG_OOL_PORTS_DESCRIPTOR = 0x2, + MACH_MSG_OOL_VOLATILE_DESCRIPTOR = 0x3, + + MACH_MSGH_BITS_COMPLEX = 0x80000000, + + MACH_SEND_MSG = 0x1, + MACH_RCV_MSG = 0x2, + MACH_RCV_LARGE = 0x4, + + MACH_SEND_TIMEOUT = 0x10, + MACH_SEND_INTERRUPT = 0x40, + MACH_SEND_ALWAYS = 0x10000, + MACH_SEND_TRAILER = 0x20000, + MACH_RCV_TIMEOUT = 0x100, + MACH_RCV_NOTIFY = 0x200, + MACH_RCV_INTERRUPT = 0x400, + MACH_RCV_OVERWRITE = 0x1000, + + NDR_PROTOCOL_2_0 = 0x0, + NDR_INT_BIG_ENDIAN = 0x0, + NDR_INT_LITTLE_ENDIAN = 0x1, + NDR_FLOAT_IEEE = 0x0, + NDR_CHAR_ASCII = 0x0, + + SA_SIGINFO = 0x40, + SA_RESTART = 0x2, + SA_ONSTACK = 0x1, + SA_USERTRAMP = 0x100, + SA_64REGSET = 0x200, + + SIGHUP = 0x1, + SIGINT = 0x2, + SIGQUIT = 0x3, + SIGILL = 0x4, + SIGTRAP = 0x5, + SIGABRT = 0x6, + SIGEMT = 0x7, + SIGFPE = 0x8, + SIGKILL = 0x9, + SIGBUS = 0xa, + SIGSEGV = 0xb, + SIGSYS = 0xc, + SIGPIPE = 0xd, + SIGALRM = 0xe, + SIGTERM = 0xf, + SIGURG = 0x10, + SIGSTOP = 0x11, + SIGTSTP = 0x12, + SIGCONT = 0x13, + SIGCHLD = 0x14, + SIGTTIN = 0x15, + SIGTTOU = 0x16, + SIGIO = 0x17, + SIGXCPU = 0x18, + SIGXFSZ = 0x19, + SIGVTALRM = 0x1a, + SIGPROF = 0x1b, + SIGWINCH = 0x1c, + SIGINFO = 0x1d, + SIGUSR1 = 0x1e, + SIGUSR2 = 0x1f, + + FPE_INTDIV = 0x7, + FPE_INTOVF = 0x8, + FPE_FLTDIV = 0x1, + FPE_FLTOVF = 0x2, + FPE_FLTUND = 0x3, + FPE_FLTRES = 0x4, + FPE_FLTINV = 0x5, + FPE_FLTSUB = 0x6, + + 
BUS_ADRALN = 0x1, + BUS_ADRERR = 0x2, + BUS_OBJERR = 0x3, + + SEGV_MAPERR = 0x1, + SEGV_ACCERR = 0x2, + + ITIMER_REAL = 0x0, + ITIMER_VIRTUAL = 0x1, + ITIMER_PROF = 0x2, + + EV_ADD = 0x1, + EV_DELETE = 0x2, + EV_CLEAR = 0x20, + EV_RECEIPT = 0x40, + EV_ERROR = 0x4000, + EVFILT_READ = -0x1, + EVFILT_WRITE = -0x2, +}; + +typedef struct MachBody MachBody; +typedef struct MachHeader MachHeader; +typedef struct MachNDR MachNDR; +typedef struct MachPort MachPort; +typedef struct StackT StackT; +typedef struct SigactionT SigactionT; +typedef struct Siginfo Siginfo; +typedef struct Timeval Timeval; +typedef struct Itimerval Itimerval; +typedef struct Timespec Timespec; +typedef struct FPControl FPControl; +typedef struct FPStatus FPStatus; +typedef struct RegMMST RegMMST; +typedef struct RegXMM RegXMM; +typedef struct Regs64 Regs64; +typedef struct FloatState64 FloatState64; +typedef struct ExceptionState64 ExceptionState64; +typedef struct Mcontext64 Mcontext64; +typedef struct Regs32 Regs32; +typedef struct FloatState32 FloatState32; +typedef struct ExceptionState32 ExceptionState32; +typedef struct Mcontext32 Mcontext32; +typedef struct Ucontext Ucontext; +typedef struct KeventT KeventT; + +#pragma pack on + +struct MachBody { + uint32 msgh_descriptor_count; +}; +struct MachHeader { + uint32 msgh_bits; + uint32 msgh_size; + uint32 msgh_remote_port; + uint32 msgh_local_port; + uint32 msgh_reserved; + int32 msgh_id; +}; +struct MachNDR { + uint8 mig_vers; + uint8 if_vers; + uint8 reserved1; + uint8 mig_encoding; + uint8 int_rep; + uint8 char_rep; + uint8 float_rep; + uint8 reserved2; +}; +struct MachPort { + uint32 name; + uint32 pad1; + uint16 pad2; + uint8 disposition; + uint8 type; +}; + +struct StackT { + byte *ss_sp; + uint64 ss_size; + int32 ss_flags; + byte Pad_cgo_0[4]; +}; +typedef byte Sighandler[8]; + +struct SigactionT { + byte __sigaction_u[8]; + void *sa_tramp; + uint32 sa_mask; + int32 sa_flags; +}; + +typedef byte Sigval[8]; +struct Siginfo { + int32 si_signo; + int32 si_errno; + int32 si_code; + int32 si_pid; + uint32 si_uid; + int32 si_status; + byte *si_addr; + byte si_value[8]; + int64 si_band; + uint64 __pad[7]; +}; +struct Timeval { + int64 tv_sec; + int32 tv_usec; + byte Pad_cgo_0[4]; +}; +struct Itimerval { + Timeval it_interval; + Timeval it_value; +}; +struct Timespec { + int64 tv_sec; + int64 tv_nsec; +}; + +struct FPControl { + byte Pad_cgo_0[2]; +}; +struct FPStatus { + byte Pad_cgo_0[2]; +}; +struct RegMMST { + int8 mmst_reg[10]; + int8 mmst_rsrv[6]; +}; +struct RegXMM { + int8 xmm_reg[16]; +}; + +struct Regs64 { + uint64 rax; + uint64 rbx; + uint64 rcx; + uint64 rdx; + uint64 rdi; + uint64 rsi; + uint64 rbp; + uint64 rsp; + uint64 r8; + uint64 r9; + uint64 r10; + uint64 r11; + uint64 r12; + uint64 r13; + uint64 r14; + uint64 r15; + uint64 rip; + uint64 rflags; + uint64 cs; + uint64 fs; + uint64 gs; +}; +struct FloatState64 { + int32 fpu_reserved[2]; + FPControl fpu_fcw; + FPStatus fpu_fsw; + uint8 fpu_ftw; + uint8 fpu_rsrv1; + uint16 fpu_fop; + uint32 fpu_ip; + uint16 fpu_cs; + uint16 fpu_rsrv2; + uint32 fpu_dp; + uint16 fpu_ds; + uint16 fpu_rsrv3; + uint32 fpu_mxcsr; + uint32 fpu_mxcsrmask; + RegMMST fpu_stmm0; + RegMMST fpu_stmm1; + RegMMST fpu_stmm2; + RegMMST fpu_stmm3; + RegMMST fpu_stmm4; + RegMMST fpu_stmm5; + RegMMST fpu_stmm6; + RegMMST fpu_stmm7; + RegXMM fpu_xmm0; + RegXMM fpu_xmm1; + RegXMM fpu_xmm2; + RegXMM fpu_xmm3; + RegXMM fpu_xmm4; + RegXMM fpu_xmm5; + RegXMM fpu_xmm6; + RegXMM fpu_xmm7; + RegXMM fpu_xmm8; + RegXMM fpu_xmm9; + RegXMM fpu_xmm10; + 
RegXMM fpu_xmm11; + RegXMM fpu_xmm12; + RegXMM fpu_xmm13; + RegXMM fpu_xmm14; + RegXMM fpu_xmm15; + int8 fpu_rsrv4[96]; + int32 fpu_reserved1; +}; +struct ExceptionState64 { + uint16 trapno; + uint16 cpu; + uint32 err; + uint64 faultvaddr; +}; +struct Mcontext64 { + ExceptionState64 es; + Regs64 ss; + FloatState64 fs; + byte Pad_cgo_0[4]; +}; + +struct Regs32 { + uint32 eax; + uint32 ebx; + uint32 ecx; + uint32 edx; + uint32 edi; + uint32 esi; + uint32 ebp; + uint32 esp; + uint32 ss; + uint32 eflags; + uint32 eip; + uint32 cs; + uint32 ds; + uint32 es; + uint32 fs; + uint32 gs; +}; +struct FloatState32 { + int32 fpu_reserved[2]; + FPControl fpu_fcw; + FPStatus fpu_fsw; + uint8 fpu_ftw; + uint8 fpu_rsrv1; + uint16 fpu_fop; + uint32 fpu_ip; + uint16 fpu_cs; + uint16 fpu_rsrv2; + uint32 fpu_dp; + uint16 fpu_ds; + uint16 fpu_rsrv3; + uint32 fpu_mxcsr; + uint32 fpu_mxcsrmask; + RegMMST fpu_stmm0; + RegMMST fpu_stmm1; + RegMMST fpu_stmm2; + RegMMST fpu_stmm3; + RegMMST fpu_stmm4; + RegMMST fpu_stmm5; + RegMMST fpu_stmm6; + RegMMST fpu_stmm7; + RegXMM fpu_xmm0; + RegXMM fpu_xmm1; + RegXMM fpu_xmm2; + RegXMM fpu_xmm3; + RegXMM fpu_xmm4; + RegXMM fpu_xmm5; + RegXMM fpu_xmm6; + RegXMM fpu_xmm7; + int8 fpu_rsrv4[224]; + int32 fpu_reserved1; +}; +struct ExceptionState32 { + uint16 trapno; + uint16 cpu; + uint32 err; + uint32 faultvaddr; +}; +struct Mcontext32 { + ExceptionState32 es; + Regs32 ss; + FloatState32 fs; +}; + +struct Ucontext { + int32 uc_onstack; + uint32 uc_sigmask; + StackT uc_stack; + Ucontext *uc_link; + uint64 uc_mcsize; + Mcontext64 *uc_mcontext; +}; + +struct KeventT { + uint64 ident; + int16 filter; + uint16 flags; + uint32 fflags; + int64 data; + byte *udata; +}; + + +#pragma pack off diff --git a/src/runtime/defs_dragonfly.go b/src/runtime/defs_dragonfly.go new file mode 100644 index 000000000..555b8f595 --- /dev/null +++ b/src/runtime/defs_dragonfly.go @@ -0,0 +1,126 @@ +// Copyright 2009 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// +build ignore + +/* +Input to cgo. 
+ +GOARCH=amd64 go tool cgo -cdefs defs_dragonfly.go >defs_dragonfly_amd64.h +GOARCH=386 go tool cgo -cdefs defs_dragonfly.go >defs_dragonfly_386.h +*/ + +package runtime + +/* +#include <sys/user.h> +#include <sys/time.h> +#include <sys/event.h> +#include <sys/mman.h> +#include <sys/ucontext.h> +#include <sys/rtprio.h> +#include <sys/signal.h> +#include <sys/unistd.h> +#include <errno.h> +#include <signal.h> +*/ +import "C" + +const ( + EINTR = C.EINTR + EFAULT = C.EFAULT + EBUSY = C.EBUSY + EAGAIN = C.EAGAIN + + PROT_NONE = C.PROT_NONE + PROT_READ = C.PROT_READ + PROT_WRITE = C.PROT_WRITE + PROT_EXEC = C.PROT_EXEC + + MAP_ANON = C.MAP_ANON + MAP_PRIVATE = C.MAP_PRIVATE + MAP_FIXED = C.MAP_FIXED + + MADV_FREE = C.MADV_FREE + + SA_SIGINFO = C.SA_SIGINFO + SA_RESTART = C.SA_RESTART + SA_ONSTACK = C.SA_ONSTACK + + SIGHUP = C.SIGHUP + SIGINT = C.SIGINT + SIGQUIT = C.SIGQUIT + SIGILL = C.SIGILL + SIGTRAP = C.SIGTRAP + SIGABRT = C.SIGABRT + SIGEMT = C.SIGEMT + SIGFPE = C.SIGFPE + SIGKILL = C.SIGKILL + SIGBUS = C.SIGBUS + SIGSEGV = C.SIGSEGV + SIGSYS = C.SIGSYS + SIGPIPE = C.SIGPIPE + SIGALRM = C.SIGALRM + SIGTERM = C.SIGTERM + SIGURG = C.SIGURG + SIGSTOP = C.SIGSTOP + SIGTSTP = C.SIGTSTP + SIGCONT = C.SIGCONT + SIGCHLD = C.SIGCHLD + SIGTTIN = C.SIGTTIN + SIGTTOU = C.SIGTTOU + SIGIO = C.SIGIO + SIGXCPU = C.SIGXCPU + SIGXFSZ = C.SIGXFSZ + SIGVTALRM = C.SIGVTALRM + SIGPROF = C.SIGPROF + SIGWINCH = C.SIGWINCH + SIGINFO = C.SIGINFO + SIGUSR1 = C.SIGUSR1 + SIGUSR2 = C.SIGUSR2 + + FPE_INTDIV = C.FPE_INTDIV + FPE_INTOVF = C.FPE_INTOVF + FPE_FLTDIV = C.FPE_FLTDIV + FPE_FLTOVF = C.FPE_FLTOVF + FPE_FLTUND = C.FPE_FLTUND + FPE_FLTRES = C.FPE_FLTRES + FPE_FLTINV = C.FPE_FLTINV + FPE_FLTSUB = C.FPE_FLTSUB + + BUS_ADRALN = C.BUS_ADRALN + BUS_ADRERR = C.BUS_ADRERR + BUS_OBJERR = C.BUS_OBJERR + + SEGV_MAPERR = C.SEGV_MAPERR + SEGV_ACCERR = C.SEGV_ACCERR + + ITIMER_REAL = C.ITIMER_REAL + ITIMER_VIRTUAL = C.ITIMER_VIRTUAL + ITIMER_PROF = C.ITIMER_PROF + + EV_ADD = C.EV_ADD + EV_DELETE = C.EV_DELETE + EV_CLEAR = C.EV_CLEAR + EV_ERROR = C.EV_ERROR + EVFILT_READ = C.EVFILT_READ + EVFILT_WRITE = C.EVFILT_WRITE +) + +type Rtprio C.struct_rtprio +type Lwpparams C.struct_lwp_params +type SigaltstackT C.struct_sigaltstack +type Sigset C.struct___sigset +type StackT C.stack_t + +type Siginfo C.siginfo_t + +type Mcontext C.mcontext_t +type Ucontext C.ucontext_t + +type Timespec C.struct_timespec +type Timeval C.struct_timeval +type Itimerval C.struct_itimerval + +type Kevent C.struct_kevent diff --git a/src/runtime/defs_dragonfly_386.h b/src/runtime/defs_dragonfly_386.h new file mode 100644 index 000000000..f86b9c6b9 --- /dev/null +++ b/src/runtime/defs_dragonfly_386.h @@ -0,0 +1,198 @@ +// Created by cgo -cdefs - DO NOT EDIT +// cgo -cdefs defs_dragonfly.go + + +enum { + EINTR = 0x4, + EFAULT = 0xe, + EBUSY = 0x10, + EAGAIN = 0x23, + + PROT_NONE = 0x0, + PROT_READ = 0x1, + PROT_WRITE = 0x2, + PROT_EXEC = 0x4, + + MAP_ANON = 0x1000, + MAP_PRIVATE = 0x2, + MAP_FIXED = 0x10, + + MADV_FREE = 0x5, + + SA_SIGINFO = 0x40, + SA_RESTART = 0x2, + SA_ONSTACK = 0x1, + + SIGHUP = 0x1, + SIGINT = 0x2, + SIGQUIT = 0x3, + SIGILL = 0x4, + SIGTRAP = 0x5, + SIGABRT = 0x6, + SIGEMT = 0x7, + SIGFPE = 0x8, + SIGKILL = 0x9, + SIGBUS = 0xa, + SIGSEGV = 0xb, + SIGSYS = 0xc, + SIGPIPE = 0xd, + SIGALRM = 0xe, + SIGTERM = 0xf, + SIGURG = 0x10, + SIGSTOP = 0x11, + SIGTSTP = 0x12, + SIGCONT = 0x13, + SIGCHLD = 0x14, + SIGTTIN = 0x15, + SIGTTOU = 0x16, + SIGIO = 0x17, + SIGXCPU = 0x18, + SIGXFSZ = 0x19, + SIGVTALRM = 0x1a, + SIGPROF = 0x1b, + SIGWINCH 
= 0x1c, + SIGINFO = 0x1d, + SIGUSR1 = 0x1e, + SIGUSR2 = 0x1f, + + FPE_INTDIV = 0x2, + FPE_INTOVF = 0x1, + FPE_FLTDIV = 0x3, + FPE_FLTOVF = 0x4, + FPE_FLTUND = 0x5, + FPE_FLTRES = 0x6, + FPE_FLTINV = 0x7, + FPE_FLTSUB = 0x8, + + BUS_ADRALN = 0x1, + BUS_ADRERR = 0x2, + BUS_OBJERR = 0x3, + + SEGV_MAPERR = 0x1, + SEGV_ACCERR = 0x2, + + ITIMER_REAL = 0x0, + ITIMER_VIRTUAL = 0x1, + ITIMER_PROF = 0x2, + + EV_ADD = 0x1, + EV_DELETE = 0x2, + EV_CLEAR = 0x20, + EV_ERROR = 0x4000, + EVFILT_READ = -0x1, + EVFILT_WRITE = -0x2, +}; + +typedef struct Rtprio Rtprio; +typedef struct Lwpparams Lwpparams; +typedef struct SigaltstackT SigaltstackT; +typedef struct Sigset Sigset; +typedef struct StackT StackT; +typedef struct Siginfo Siginfo; +typedef struct Mcontext Mcontext; +typedef struct Ucontext Ucontext; +typedef struct Timespec Timespec; +typedef struct Timeval Timeval; +typedef struct Itimerval Itimerval; +typedef struct KeventT KeventT; + +#pragma pack on + +struct Rtprio { + uint16 type; + uint16 prio; +}; +struct Lwpparams { + void *func; + byte *arg; + byte *stack; + int32 *tid1; + int32 *tid2; +}; +struct SigaltstackT { + int8 *ss_sp; + uint32 ss_size; + int32 ss_flags; +}; +struct Sigset { + uint32 __bits[4]; +}; +struct StackT { + int8 *ss_sp; + uint32 ss_size; + int32 ss_flags; +}; + +struct Siginfo { + int32 si_signo; + int32 si_errno; + int32 si_code; + int32 si_pid; + uint32 si_uid; + int32 si_status; + byte *si_addr; + byte si_value[4]; + int32 si_band; + int32 __spare__[7]; +}; + +struct Mcontext { + int32 mc_onstack; + int32 mc_gs; + int32 mc_fs; + int32 mc_es; + int32 mc_ds; + int32 mc_edi; + int32 mc_esi; + int32 mc_ebp; + int32 mc_isp; + int32 mc_ebx; + int32 mc_edx; + int32 mc_ecx; + int32 mc_eax; + int32 mc_xflags; + int32 mc_trapno; + int32 mc_err; + int32 mc_eip; + int32 mc_cs; + int32 mc_eflags; + int32 mc_esp; + int32 mc_ss; + int32 mc_len; + int32 mc_fpformat; + int32 mc_ownedfp; + int32 mc_fpregs[128]; + int32 __spare__[16]; +}; +struct Ucontext { + Sigset uc_sigmask; + Mcontext uc_mcontext; + Ucontext *uc_link; + StackT uc_stack; + int32 __spare__[8]; +}; + +struct Timespec { + int32 tv_sec; + int32 tv_nsec; +}; +struct Timeval { + int32 tv_sec; + int32 tv_usec; +}; +struct Itimerval { + Timeval it_interval; + Timeval it_value; +}; + +struct KeventT { + uint32 ident; + int16 filter; + uint16 flags; + uint32 fflags; + int32 data; + byte *udata; +}; + + +#pragma pack off diff --git a/src/runtime/defs_dragonfly_amd64.h b/src/runtime/defs_dragonfly_amd64.h new file mode 100644 index 000000000..671555241 --- /dev/null +++ b/src/runtime/defs_dragonfly_amd64.h @@ -0,0 +1,208 @@ +// Created by cgo -cdefs - DO NOT EDIT +// cgo -cdefs defs_dragonfly.go + + +enum { + EINTR = 0x4, + EFAULT = 0xe, + EBUSY = 0x10, + EAGAIN = 0x23, + + PROT_NONE = 0x0, + PROT_READ = 0x1, + PROT_WRITE = 0x2, + PROT_EXEC = 0x4, + + MAP_ANON = 0x1000, + MAP_PRIVATE = 0x2, + MAP_FIXED = 0x10, + + MADV_FREE = 0x5, + + SA_SIGINFO = 0x40, + SA_RESTART = 0x2, + SA_ONSTACK = 0x1, + + SIGHUP = 0x1, + SIGINT = 0x2, + SIGQUIT = 0x3, + SIGILL = 0x4, + SIGTRAP = 0x5, + SIGABRT = 0x6, + SIGEMT = 0x7, + SIGFPE = 0x8, + SIGKILL = 0x9, + SIGBUS = 0xa, + SIGSEGV = 0xb, + SIGSYS = 0xc, + SIGPIPE = 0xd, + SIGALRM = 0xe, + SIGTERM = 0xf, + SIGURG = 0x10, + SIGSTOP = 0x11, + SIGTSTP = 0x12, + SIGCONT = 0x13, + SIGCHLD = 0x14, + SIGTTIN = 0x15, + SIGTTOU = 0x16, + SIGIO = 0x17, + SIGXCPU = 0x18, + SIGXFSZ = 0x19, + SIGVTALRM = 0x1a, + SIGPROF = 0x1b, + SIGWINCH = 0x1c, + SIGINFO = 0x1d, + SIGUSR1 = 0x1e, + SIGUSR2 = 0x1f, + + 
FPE_INTDIV = 0x2, + FPE_INTOVF = 0x1, + FPE_FLTDIV = 0x3, + FPE_FLTOVF = 0x4, + FPE_FLTUND = 0x5, + FPE_FLTRES = 0x6, + FPE_FLTINV = 0x7, + FPE_FLTSUB = 0x8, + + BUS_ADRALN = 0x1, + BUS_ADRERR = 0x2, + BUS_OBJERR = 0x3, + + SEGV_MAPERR = 0x1, + SEGV_ACCERR = 0x2, + + ITIMER_REAL = 0x0, + ITIMER_VIRTUAL = 0x1, + ITIMER_PROF = 0x2, + + EV_ADD = 0x1, + EV_DELETE = 0x2, + EV_CLEAR = 0x20, + EV_ERROR = 0x4000, + EVFILT_READ = -0x1, + EVFILT_WRITE = -0x2, +}; + +typedef struct Rtprio Rtprio; +typedef struct Lwpparams Lwpparams; +typedef struct SigaltstackT SigaltstackT; +typedef struct Sigset Sigset; +typedef struct StackT StackT; +typedef struct Siginfo Siginfo; +typedef struct Mcontext Mcontext; +typedef struct Ucontext Ucontext; +typedef struct Timespec Timespec; +typedef struct Timeval Timeval; +typedef struct Itimerval Itimerval; +typedef struct KeventT KeventT; + +#pragma pack on + +struct Rtprio { + uint16 type; + uint16 prio; +}; +struct Lwpparams { + void *func; + byte *arg; + byte *stack; + int32 *tid1; + int32 *tid2; +}; +struct SigaltstackT { + int8 *ss_sp; + uint64 ss_size; + int32 ss_flags; + byte Pad_cgo_0[4]; +}; +struct Sigset { + uint32 __bits[4]; +}; +struct StackT { + int8 *ss_sp; + uint64 ss_size; + int32 ss_flags; + byte Pad_cgo_0[4]; +}; + +struct Siginfo { + int32 si_signo; + int32 si_errno; + int32 si_code; + int32 si_pid; + uint32 si_uid; + int32 si_status; + byte *si_addr; + byte si_value[8]; + int64 si_band; + int32 __spare__[7]; + byte Pad_cgo_0[4]; +}; + +struct Mcontext { + int64 mc_onstack; + int64 mc_rdi; + int64 mc_rsi; + int64 mc_rdx; + int64 mc_rcx; + int64 mc_r8; + int64 mc_r9; + int64 mc_rax; + int64 mc_rbx; + int64 mc_rbp; + int64 mc_r10; + int64 mc_r11; + int64 mc_r12; + int64 mc_r13; + int64 mc_r14; + int64 mc_r15; + int64 mc_xflags; + int64 mc_trapno; + int64 mc_addr; + int64 mc_flags; + int64 mc_err; + int64 mc_rip; + int64 mc_cs; + int64 mc_rflags; + int64 mc_rsp; + int64 mc_ss; + uint32 mc_len; + uint32 mc_fpformat; + uint32 mc_ownedfp; + uint32 mc_reserved; + uint32 mc_unused[8]; + int32 mc_fpregs[256]; +}; +struct Ucontext { + Sigset uc_sigmask; + byte Pad_cgo_0[48]; + Mcontext uc_mcontext; + Ucontext *uc_link; + StackT uc_stack; + int32 __spare__[8]; +}; + +struct Timespec { + int64 tv_sec; + int64 tv_nsec; +}; +struct Timeval { + int64 tv_sec; + int64 tv_usec; +}; +struct Itimerval { + Timeval it_interval; + Timeval it_value; +}; + +struct KeventT { + uint64 ident; + int16 filter; + uint16 flags; + uint32 fflags; + int64 data; + byte *udata; +}; + + +#pragma pack off diff --git a/src/runtime/defs_freebsd.go b/src/runtime/defs_freebsd.go new file mode 100644 index 000000000..0253685aa --- /dev/null +++ b/src/runtime/defs_freebsd.go @@ -0,0 +1,133 @@ +// Copyright 2009 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// +build ignore + +/* +Input to cgo. 
+ +GOARCH=amd64 go tool cgo -cdefs defs_freebsd.go >defs_freebsd_amd64.h +GOARCH=386 go tool cgo -cdefs defs_freebsd.go >defs_freebsd_386.h +GOARCH=arm go tool cgo -cdefs defs_freebsd.go >defs_freebsd_arm.h +*/ + +package runtime + +/* +#include <sys/types.h> +#include <sys/time.h> +#include <signal.h> +#include <errno.h> +#include <sys/event.h> +#include <sys/mman.h> +#include <sys/ucontext.h> +#include <sys/umtx.h> +#include <sys/rtprio.h> +#include <sys/thr.h> +#include <sys/_sigset.h> +#include <sys/unistd.h> +*/ +import "C" + +const ( + EINTR = C.EINTR + EFAULT = C.EFAULT + + PROT_NONE = C.PROT_NONE + PROT_READ = C.PROT_READ + PROT_WRITE = C.PROT_WRITE + PROT_EXEC = C.PROT_EXEC + + MAP_ANON = C.MAP_ANON + MAP_PRIVATE = C.MAP_PRIVATE + MAP_FIXED = C.MAP_FIXED + + MADV_FREE = C.MADV_FREE + + SA_SIGINFO = C.SA_SIGINFO + SA_RESTART = C.SA_RESTART + SA_ONSTACK = C.SA_ONSTACK + + UMTX_OP_WAIT_UINT = C.UMTX_OP_WAIT_UINT + UMTX_OP_WAIT_UINT_PRIVATE = C.UMTX_OP_WAIT_UINT_PRIVATE + UMTX_OP_WAKE = C.UMTX_OP_WAKE + UMTX_OP_WAKE_PRIVATE = C.UMTX_OP_WAKE_PRIVATE + + SIGHUP = C.SIGHUP + SIGINT = C.SIGINT + SIGQUIT = C.SIGQUIT + SIGILL = C.SIGILL + SIGTRAP = C.SIGTRAP + SIGABRT = C.SIGABRT + SIGEMT = C.SIGEMT + SIGFPE = C.SIGFPE + SIGKILL = C.SIGKILL + SIGBUS = C.SIGBUS + SIGSEGV = C.SIGSEGV + SIGSYS = C.SIGSYS + SIGPIPE = C.SIGPIPE + SIGALRM = C.SIGALRM + SIGTERM = C.SIGTERM + SIGURG = C.SIGURG + SIGSTOP = C.SIGSTOP + SIGTSTP = C.SIGTSTP + SIGCONT = C.SIGCONT + SIGCHLD = C.SIGCHLD + SIGTTIN = C.SIGTTIN + SIGTTOU = C.SIGTTOU + SIGIO = C.SIGIO + SIGXCPU = C.SIGXCPU + SIGXFSZ = C.SIGXFSZ + SIGVTALRM = C.SIGVTALRM + SIGPROF = C.SIGPROF + SIGWINCH = C.SIGWINCH + SIGINFO = C.SIGINFO + SIGUSR1 = C.SIGUSR1 + SIGUSR2 = C.SIGUSR2 + + FPE_INTDIV = C.FPE_INTDIV + FPE_INTOVF = C.FPE_INTOVF + FPE_FLTDIV = C.FPE_FLTDIV + FPE_FLTOVF = C.FPE_FLTOVF + FPE_FLTUND = C.FPE_FLTUND + FPE_FLTRES = C.FPE_FLTRES + FPE_FLTINV = C.FPE_FLTINV + FPE_FLTSUB = C.FPE_FLTSUB + + BUS_ADRALN = C.BUS_ADRALN + BUS_ADRERR = C.BUS_ADRERR + BUS_OBJERR = C.BUS_OBJERR + + SEGV_MAPERR = C.SEGV_MAPERR + SEGV_ACCERR = C.SEGV_ACCERR + + ITIMER_REAL = C.ITIMER_REAL + ITIMER_VIRTUAL = C.ITIMER_VIRTUAL + ITIMER_PROF = C.ITIMER_PROF + + EV_ADD = C.EV_ADD + EV_DELETE = C.EV_DELETE + EV_CLEAR = C.EV_CLEAR + EV_RECEIPT = C.EV_RECEIPT + EV_ERROR = C.EV_ERROR + EVFILT_READ = C.EVFILT_READ + EVFILT_WRITE = C.EVFILT_WRITE +) + +type Rtprio C.struct_rtprio +type ThrParam C.struct_thr_param +type SigaltstackT C.struct_sigaltstack +type Sigset C.struct___sigset +type StackT C.stack_t + +type Siginfo C.siginfo_t + +type Mcontext C.mcontext_t +type Ucontext C.ucontext_t + +type Timespec C.struct_timespec +type Timeval C.struct_timeval +type Itimerval C.struct_itimerval + +type Kevent C.struct_kevent diff --git a/src/runtime/defs_freebsd_386.h b/src/runtime/defs_freebsd_386.h new file mode 100644 index 000000000..156dccba4 --- /dev/null +++ b/src/runtime/defs_freebsd_386.h @@ -0,0 +1,213 @@ +// Created by cgo -cdefs - DO NOT EDIT +// cgo -cdefs defs_freebsd.go + + +enum { + EINTR = 0x4, + EFAULT = 0xe, + + PROT_NONE = 0x0, + PROT_READ = 0x1, + PROT_WRITE = 0x2, + PROT_EXEC = 0x4, + + MAP_ANON = 0x1000, + MAP_PRIVATE = 0x2, + MAP_FIXED = 0x10, + + MADV_FREE = 0x5, + + SA_SIGINFO = 0x40, + SA_RESTART = 0x2, + SA_ONSTACK = 0x1, + + UMTX_OP_WAIT_UINT = 0xb, + UMTX_OP_WAIT_UINT_PRIVATE = 0xf, + UMTX_OP_WAKE = 0x3, + UMTX_OP_WAKE_PRIVATE = 0x10, + + SIGHUP = 0x1, + SIGINT = 0x2, + SIGQUIT = 0x3, + SIGILL = 0x4, + SIGTRAP = 0x5, + SIGABRT = 0x6, + SIGEMT = 0x7, + 
SIGFPE = 0x8, + SIGKILL = 0x9, + SIGBUS = 0xa, + SIGSEGV = 0xb, + SIGSYS = 0xc, + SIGPIPE = 0xd, + SIGALRM = 0xe, + SIGTERM = 0xf, + SIGURG = 0x10, + SIGSTOP = 0x11, + SIGTSTP = 0x12, + SIGCONT = 0x13, + SIGCHLD = 0x14, + SIGTTIN = 0x15, + SIGTTOU = 0x16, + SIGIO = 0x17, + SIGXCPU = 0x18, + SIGXFSZ = 0x19, + SIGVTALRM = 0x1a, + SIGPROF = 0x1b, + SIGWINCH = 0x1c, + SIGINFO = 0x1d, + SIGUSR1 = 0x1e, + SIGUSR2 = 0x1f, + + FPE_INTDIV = 0x2, + FPE_INTOVF = 0x1, + FPE_FLTDIV = 0x3, + FPE_FLTOVF = 0x4, + FPE_FLTUND = 0x5, + FPE_FLTRES = 0x6, + FPE_FLTINV = 0x7, + FPE_FLTSUB = 0x8, + + BUS_ADRALN = 0x1, + BUS_ADRERR = 0x2, + BUS_OBJERR = 0x3, + + SEGV_MAPERR = 0x1, + SEGV_ACCERR = 0x2, + + ITIMER_REAL = 0x0, + ITIMER_VIRTUAL = 0x1, + ITIMER_PROF = 0x2, + + EV_ADD = 0x1, + EV_DELETE = 0x2, + EV_CLEAR = 0x20, + EV_RECEIPT = 0x40, + EV_ERROR = 0x4000, + EVFILT_READ = -0x1, + EVFILT_WRITE = -0x2, +}; + +typedef struct Rtprio Rtprio; +typedef struct ThrParam ThrParam; +typedef struct SigaltstackT SigaltstackT; +typedef struct Sigset Sigset; +typedef struct StackT StackT; +typedef struct Siginfo Siginfo; +typedef struct Mcontext Mcontext; +typedef struct Ucontext Ucontext; +typedef struct Timespec Timespec; +typedef struct Timeval Timeval; +typedef struct Itimerval Itimerval; +typedef struct KeventT KeventT; + +#pragma pack on + +struct Rtprio { + uint16 type; + uint16 prio; +}; +struct ThrParam { + void *start_func; + byte *arg; + int8 *stack_base; + uint32 stack_size; + int8 *tls_base; + uint32 tls_size; + int32 *child_tid; + int32 *parent_tid; + int32 flags; + Rtprio *rtp; + void *spare[3]; +}; +struct SigaltstackT { + int8 *ss_sp; + uint32 ss_size; + int32 ss_flags; +}; +struct Sigset { + uint32 __bits[4]; +}; +struct StackT { + int8 *ss_sp; + uint32 ss_size; + int32 ss_flags; +}; + +struct Siginfo { + int32 si_signo; + int32 si_errno; + int32 si_code; + int32 si_pid; + uint32 si_uid; + int32 si_status; + byte *si_addr; + byte si_value[4]; + byte _reason[32]; +}; + +struct Mcontext { + int32 mc_onstack; + int32 mc_gs; + int32 mc_fs; + int32 mc_es; + int32 mc_ds; + int32 mc_edi; + int32 mc_esi; + int32 mc_ebp; + int32 mc_isp; + int32 mc_ebx; + int32 mc_edx; + int32 mc_ecx; + int32 mc_eax; + int32 mc_trapno; + int32 mc_err; + int32 mc_eip; + int32 mc_cs; + int32 mc_eflags; + int32 mc_esp; + int32 mc_ss; + int32 mc_len; + int32 mc_fpformat; + int32 mc_ownedfp; + int32 mc_flags; + int32 mc_fpstate[128]; + int32 mc_fsbase; + int32 mc_gsbase; + int32 mc_xfpustate; + int32 mc_xfpustate_len; + int32 mc_spare2[4]; +}; +struct Ucontext { + Sigset uc_sigmask; + Mcontext uc_mcontext; + Ucontext *uc_link; + StackT uc_stack; + int32 uc_flags; + int32 __spare__[4]; + byte Pad_cgo_0[12]; +}; + +struct Timespec { + int32 tv_sec; + int32 tv_nsec; +}; +struct Timeval { + int32 tv_sec; + int32 tv_usec; +}; +struct Itimerval { + Timeval it_interval; + Timeval it_value; +}; + +struct KeventT { + uint32 ident; + int16 filter; + uint16 flags; + uint32 fflags; + int32 data; + byte *udata; +}; + + +#pragma pack off diff --git a/src/runtime/defs_freebsd_amd64.h b/src/runtime/defs_freebsd_amd64.h new file mode 100644 index 000000000..4ba8956a2 --- /dev/null +++ b/src/runtime/defs_freebsd_amd64.h @@ -0,0 +1,224 @@ +// Created by cgo -cdefs - DO NOT EDIT +// cgo -cdefs defs_freebsd.go + + +enum { + EINTR = 0x4, + EFAULT = 0xe, + + PROT_NONE = 0x0, + PROT_READ = 0x1, + PROT_WRITE = 0x2, + PROT_EXEC = 0x4, + + MAP_ANON = 0x1000, + MAP_PRIVATE = 0x2, + MAP_FIXED = 0x10, + + MADV_FREE = 0x5, + + SA_SIGINFO = 0x40, + SA_RESTART = 
0x2, + SA_ONSTACK = 0x1, + + UMTX_OP_WAIT_UINT = 0xb, + UMTX_OP_WAIT_UINT_PRIVATE = 0xf, + UMTX_OP_WAKE = 0x3, + UMTX_OP_WAKE_PRIVATE = 0x10, + + SIGHUP = 0x1, + SIGINT = 0x2, + SIGQUIT = 0x3, + SIGILL = 0x4, + SIGTRAP = 0x5, + SIGABRT = 0x6, + SIGEMT = 0x7, + SIGFPE = 0x8, + SIGKILL = 0x9, + SIGBUS = 0xa, + SIGSEGV = 0xb, + SIGSYS = 0xc, + SIGPIPE = 0xd, + SIGALRM = 0xe, + SIGTERM = 0xf, + SIGURG = 0x10, + SIGSTOP = 0x11, + SIGTSTP = 0x12, + SIGCONT = 0x13, + SIGCHLD = 0x14, + SIGTTIN = 0x15, + SIGTTOU = 0x16, + SIGIO = 0x17, + SIGXCPU = 0x18, + SIGXFSZ = 0x19, + SIGVTALRM = 0x1a, + SIGPROF = 0x1b, + SIGWINCH = 0x1c, + SIGINFO = 0x1d, + SIGUSR1 = 0x1e, + SIGUSR2 = 0x1f, + + FPE_INTDIV = 0x2, + FPE_INTOVF = 0x1, + FPE_FLTDIV = 0x3, + FPE_FLTOVF = 0x4, + FPE_FLTUND = 0x5, + FPE_FLTRES = 0x6, + FPE_FLTINV = 0x7, + FPE_FLTSUB = 0x8, + + BUS_ADRALN = 0x1, + BUS_ADRERR = 0x2, + BUS_OBJERR = 0x3, + + SEGV_MAPERR = 0x1, + SEGV_ACCERR = 0x2, + + ITIMER_REAL = 0x0, + ITIMER_VIRTUAL = 0x1, + ITIMER_PROF = 0x2, + + EV_ADD = 0x1, + EV_DELETE = 0x2, + EV_CLEAR = 0x20, + EV_RECEIPT = 0x40, + EV_ERROR = 0x4000, + EVFILT_READ = -0x1, + EVFILT_WRITE = -0x2, +}; + +typedef struct Rtprio Rtprio; +typedef struct ThrParam ThrParam; +typedef struct SigaltstackT SigaltstackT; +typedef struct Sigset Sigset; +typedef struct StackT StackT; +typedef struct Siginfo Siginfo; +typedef struct Mcontext Mcontext; +typedef struct Ucontext Ucontext; +typedef struct Timespec Timespec; +typedef struct Timeval Timeval; +typedef struct Itimerval Itimerval; +typedef struct KeventT KeventT; + +#pragma pack on + +struct Rtprio { + uint16 type; + uint16 prio; +}; +struct ThrParam { + void *start_func; + byte *arg; + int8 *stack_base; + uint64 stack_size; + int8 *tls_base; + uint64 tls_size; + int64 *child_tid; + int64 *parent_tid; + int32 flags; + byte Pad_cgo_0[4]; + Rtprio *rtp; + void *spare[3]; +}; +struct SigaltstackT { + int8 *ss_sp; + uint64 ss_size; + int32 ss_flags; + byte Pad_cgo_0[4]; +}; +struct Sigset { + uint32 __bits[4]; +}; +struct StackT { + int8 *ss_sp; + uint64 ss_size; + int32 ss_flags; + byte Pad_cgo_0[4]; +}; + +struct Siginfo { + int32 si_signo; + int32 si_errno; + int32 si_code; + int32 si_pid; + uint32 si_uid; + int32 si_status; + byte *si_addr; + byte si_value[8]; + byte _reason[40]; +}; + +struct Mcontext { + int64 mc_onstack; + int64 mc_rdi; + int64 mc_rsi; + int64 mc_rdx; + int64 mc_rcx; + int64 mc_r8; + int64 mc_r9; + int64 mc_rax; + int64 mc_rbx; + int64 mc_rbp; + int64 mc_r10; + int64 mc_r11; + int64 mc_r12; + int64 mc_r13; + int64 mc_r14; + int64 mc_r15; + uint32 mc_trapno; + uint16 mc_fs; + uint16 mc_gs; + int64 mc_addr; + uint32 mc_flags; + uint16 mc_es; + uint16 mc_ds; + int64 mc_err; + int64 mc_rip; + int64 mc_cs; + int64 mc_rflags; + int64 mc_rsp; + int64 mc_ss; + int64 mc_len; + int64 mc_fpformat; + int64 mc_ownedfp; + int64 mc_fpstate[64]; + int64 mc_fsbase; + int64 mc_gsbase; + int64 mc_xfpustate; + int64 mc_xfpustate_len; + int64 mc_spare[4]; +}; +struct Ucontext { + Sigset uc_sigmask; + Mcontext uc_mcontext; + Ucontext *uc_link; + StackT uc_stack; + int32 uc_flags; + int32 __spare__[4]; + byte Pad_cgo_0[12]; +}; + +struct Timespec { + int64 tv_sec; + int64 tv_nsec; +}; +struct Timeval { + int64 tv_sec; + int64 tv_usec; +}; +struct Itimerval { + Timeval it_interval; + Timeval it_value; +}; + +struct KeventT { + uint64 ident; + int16 filter; + uint16 flags; + uint32 fflags; + int64 data; + byte *udata; +}; + + +#pragma pack off diff --git a/src/runtime/defs_freebsd_arm.h 
b/src/runtime/defs_freebsd_arm.h new file mode 100644 index 000000000..17deba68d --- /dev/null +++ b/src/runtime/defs_freebsd_arm.h @@ -0,0 +1,186 @@ +// Created by cgo -cdefs - DO NOT EDIT +// cgo -cdefs defs_freebsd.go + + +enum { + EINTR = 0x4, + EFAULT = 0xe, + + PROT_NONE = 0x0, + PROT_READ = 0x1, + PROT_WRITE = 0x2, + PROT_EXEC = 0x4, + + MAP_ANON = 0x1000, + MAP_PRIVATE = 0x2, + MAP_FIXED = 0x10, + + MADV_FREE = 0x5, + + SA_SIGINFO = 0x40, + SA_RESTART = 0x2, + SA_ONSTACK = 0x1, + + UMTX_OP_WAIT_UINT = 0xb, + UMTX_OP_WAIT_UINT_PRIVATE = 0xf, + UMTX_OP_WAKE = 0x3, + UMTX_OP_WAKE_PRIVATE = 0x10, + + SIGHUP = 0x1, + SIGINT = 0x2, + SIGQUIT = 0x3, + SIGILL = 0x4, + SIGTRAP = 0x5, + SIGABRT = 0x6, + SIGEMT = 0x7, + SIGFPE = 0x8, + SIGKILL = 0x9, + SIGBUS = 0xa, + SIGSEGV = 0xb, + SIGSYS = 0xc, + SIGPIPE = 0xd, + SIGALRM = 0xe, + SIGTERM = 0xf, + SIGURG = 0x10, + SIGSTOP = 0x11, + SIGTSTP = 0x12, + SIGCONT = 0x13, + SIGCHLD = 0x14, + SIGTTIN = 0x15, + SIGTTOU = 0x16, + SIGIO = 0x17, + SIGXCPU = 0x18, + SIGXFSZ = 0x19, + SIGVTALRM = 0x1a, + SIGPROF = 0x1b, + SIGWINCH = 0x1c, + SIGINFO = 0x1d, + SIGUSR1 = 0x1e, + SIGUSR2 = 0x1f, + + FPE_INTDIV = 0x2, + FPE_INTOVF = 0x1, + FPE_FLTDIV = 0x3, + FPE_FLTOVF = 0x4, + FPE_FLTUND = 0x5, + FPE_FLTRES = 0x6, + FPE_FLTINV = 0x7, + FPE_FLTSUB = 0x8, + + BUS_ADRALN = 0x1, + BUS_ADRERR = 0x2, + BUS_OBJERR = 0x3, + + SEGV_MAPERR = 0x1, + SEGV_ACCERR = 0x2, + + ITIMER_REAL = 0x0, + ITIMER_VIRTUAL = 0x1, + ITIMER_PROF = 0x2, + + EV_ADD = 0x1, + EV_DELETE = 0x2, + EV_CLEAR = 0x20, + EV_RECEIPT = 0x40, + EV_ERROR = 0x4000, + EVFILT_READ = -0x1, + EVFILT_WRITE = -0x2, +}; + +typedef struct Rtprio Rtprio; +typedef struct ThrParam ThrParam; +typedef struct SigaltstackT SigaltstackT; +typedef struct Sigset Sigset; +typedef struct StackT StackT; +typedef struct Siginfo Siginfo; +typedef struct Mcontext Mcontext; +typedef struct Ucontext Ucontext; +typedef struct Timespec Timespec; +typedef struct Timeval Timeval; +typedef struct Itimerval Itimerval; +typedef struct KeventT KeventT; + +#pragma pack on + +struct Rtprio { + uint16 type; + uint16 prio; +}; +struct ThrParam { + void *start_func; + byte *arg; + uint8 *stack_base; + uint32 stack_size; + uint8 *tls_base; + uint32 tls_size; + int32 *child_tid; + int32 *parent_tid; + int32 flags; + Rtprio *rtp; + void *spare[3]; +}; +struct SigaltstackT { + uint8 *ss_sp; + uint32 ss_size; + int32 ss_flags; +}; +struct Sigset { + uint32 __bits[4]; +}; +struct StackT { + uint8 *ss_sp; + uint32 ss_size; + int32 ss_flags; +}; + +struct Siginfo { + int32 si_signo; + int32 si_errno; + int32 si_code; + int32 si_pid; + uint32 si_uid; + int32 si_status; + byte *si_addr; + byte si_value[4]; + byte _reason[32]; +}; + +struct Mcontext { + uint32 __gregs[17]; + byte __fpu[140]; +}; +struct Ucontext { + Sigset uc_sigmask; + Mcontext uc_mcontext; + Ucontext *uc_link; + StackT uc_stack; + int32 uc_flags; + int32 __spare__[4]; +}; + +struct Timespec { + int64 tv_sec; + int32 tv_nsec; + byte Pad_cgo_0[4]; +}; +struct Timeval { + int64 tv_sec; + int32 tv_usec; + byte Pad_cgo_0[4]; +}; +struct Itimerval { + Timeval it_interval; + Timeval it_value; +}; + +struct KeventT { + uint32 ident; + int16 filter; + uint16 flags; + uint32 fflags; + int32 data; + byte *udata; +}; + + +#pragma pack off diff --git a/src/runtime/defs_linux.go b/src/runtime/defs_linux.go new file mode 100644 index 000000000..8657dbb0e --- /dev/null +++ b/src/runtime/defs_linux.go @@ -0,0 +1,124 @@ +// Copyright 2009 The Go Authors. All rights reserved. 
+// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// +build ignore + +/* +Input to cgo -cdefs + +GOARCH=amd64 go tool cgo -cdefs defs_linux.go defs1_linux.go >defs_linux_amd64.h +*/ + +package runtime + +/* +// Linux glibc and Linux kernel define different and conflicting +// definitions for struct sigaction, struct timespec, etc. +// We want the kernel ones, which are in the asm/* headers. +// But then we'd get conflicts when we include the system +// headers for things like ucontext_t, so that happens in +// a separate file, defs1.go. + +#include <asm/posix_types.h> +#define size_t __kernel_size_t +#include <asm/signal.h> +#include <asm/siginfo.h> +#include <asm/mman.h> +#include <asm-generic/errno.h> +#include <asm-generic/poll.h> +#include <linux/eventpoll.h> +#undef size_t +*/ +import "C" + +const ( + EINTR = C.EINTR + EAGAIN = C.EAGAIN + ENOMEM = C.ENOMEM + + PROT_NONE = C.PROT_NONE + PROT_READ = C.PROT_READ + PROT_WRITE = C.PROT_WRITE + PROT_EXEC = C.PROT_EXEC + + MAP_ANON = C.MAP_ANONYMOUS + MAP_PRIVATE = C.MAP_PRIVATE + MAP_FIXED = C.MAP_FIXED + + MADV_DONTNEED = C.MADV_DONTNEED + + SA_RESTART = C.SA_RESTART + SA_ONSTACK = C.SA_ONSTACK + SA_RESTORER = C.SA_RESTORER + SA_SIGINFO = C.SA_SIGINFO + + SIGHUP = C.SIGHUP + SIGINT = C.SIGINT + SIGQUIT = C.SIGQUIT + SIGILL = C.SIGILL + SIGTRAP = C.SIGTRAP + SIGABRT = C.SIGABRT + SIGBUS = C.SIGBUS + SIGFPE = C.SIGFPE + SIGKILL = C.SIGKILL + SIGUSR1 = C.SIGUSR1 + SIGSEGV = C.SIGSEGV + SIGUSR2 = C.SIGUSR2 + SIGPIPE = C.SIGPIPE + SIGALRM = C.SIGALRM + SIGSTKFLT = C.SIGSTKFLT + SIGCHLD = C.SIGCHLD + SIGCONT = C.SIGCONT + SIGSTOP = C.SIGSTOP + SIGTSTP = C.SIGTSTP + SIGTTIN = C.SIGTTIN + SIGTTOU = C.SIGTTOU + SIGURG = C.SIGURG + SIGXCPU = C.SIGXCPU + SIGXFSZ = C.SIGXFSZ + SIGVTALRM = C.SIGVTALRM + SIGPROF = C.SIGPROF + SIGWINCH = C.SIGWINCH + SIGIO = C.SIGIO + SIGPWR = C.SIGPWR + SIGSYS = C.SIGSYS + + FPE_INTDIV = C.FPE_INTDIV + FPE_INTOVF = C.FPE_INTOVF + FPE_FLTDIV = C.FPE_FLTDIV + FPE_FLTOVF = C.FPE_FLTOVF + FPE_FLTUND = C.FPE_FLTUND + FPE_FLTRES = C.FPE_FLTRES + FPE_FLTINV = C.FPE_FLTINV + FPE_FLTSUB = C.FPE_FLTSUB + + BUS_ADRALN = C.BUS_ADRALN + BUS_ADRERR = C.BUS_ADRERR + BUS_OBJERR = C.BUS_OBJERR + + SEGV_MAPERR = C.SEGV_MAPERR + SEGV_ACCERR = C.SEGV_ACCERR + + ITIMER_REAL = C.ITIMER_REAL + ITIMER_VIRTUAL = C.ITIMER_VIRTUAL + ITIMER_PROF = C.ITIMER_PROF + + EPOLLIN = C.POLLIN + EPOLLOUT = C.POLLOUT + EPOLLERR = C.POLLERR + EPOLLHUP = C.POLLHUP + EPOLLRDHUP = C.POLLRDHUP + EPOLLET = C.EPOLLET + EPOLL_CLOEXEC = C.EPOLL_CLOEXEC + EPOLL_CTL_ADD = C.EPOLL_CTL_ADD + EPOLL_CTL_DEL = C.EPOLL_CTL_DEL + EPOLL_CTL_MOD = C.EPOLL_CTL_MOD +) + +type Timespec C.struct_timespec +type Timeval C.struct_timeval +type Sigaction C.struct_sigaction +type Siginfo C.siginfo_t +type Itimerval C.struct_itimerval +type EpollEvent C.struct_epoll_event diff --git a/src/runtime/defs_linux_386.h b/src/runtime/defs_linux_386.h new file mode 100644 index 000000000..24a05d862 --- /dev/null +++ b/src/runtime/defs_linux_386.h @@ -0,0 +1,211 @@ +// Created by cgo -cdefs - DO NOT EDIT +// cgo -cdefs defs2_linux.go + + +enum { + EINTR = 0x4, + EAGAIN = 0xb, + ENOMEM = 0xc, + + PROT_NONE = 0x0, + PROT_READ = 0x1, + PROT_WRITE = 0x2, + PROT_EXEC = 0x4, + + MAP_ANON = 0x20, + MAP_PRIVATE = 0x2, + MAP_FIXED = 0x10, + + MADV_DONTNEED = 0x4, + + SA_RESTART = 0x10000000, + SA_ONSTACK = 0x8000000, + SA_RESTORER = 0x4000000, + SA_SIGINFO = 0x4, + + SIGHUP = 0x1, + SIGINT = 0x2, + SIGQUIT = 0x3, + SIGILL = 0x4, + SIGTRAP = 0x5, + 
SIGABRT = 0x6, + SIGBUS = 0x7, + SIGFPE = 0x8, + SIGKILL = 0x9, + SIGUSR1 = 0xa, + SIGSEGV = 0xb, + SIGUSR2 = 0xc, + SIGPIPE = 0xd, + SIGALRM = 0xe, + SIGSTKFLT = 0x10, + SIGCHLD = 0x11, + SIGCONT = 0x12, + SIGSTOP = 0x13, + SIGTSTP = 0x14, + SIGTTIN = 0x15, + SIGTTOU = 0x16, + SIGURG = 0x17, + SIGXCPU = 0x18, + SIGXFSZ = 0x19, + SIGVTALRM = 0x1a, + SIGPROF = 0x1b, + SIGWINCH = 0x1c, + SIGIO = 0x1d, + SIGPWR = 0x1e, + SIGSYS = 0x1f, + + FPE_INTDIV = 0x1, + FPE_INTOVF = 0x2, + FPE_FLTDIV = 0x3, + FPE_FLTOVF = 0x4, + FPE_FLTUND = 0x5, + FPE_FLTRES = 0x6, + FPE_FLTINV = 0x7, + FPE_FLTSUB = 0x8, + + BUS_ADRALN = 0x1, + BUS_ADRERR = 0x2, + BUS_OBJERR = 0x3, + + SEGV_MAPERR = 0x1, + SEGV_ACCERR = 0x2, + + ITIMER_REAL = 0x0, + ITIMER_VIRTUAL = 0x1, + ITIMER_PROF = 0x2, + + O_RDONLY = 0x0, + O_CLOEXEC = 0x80000, + + EPOLLIN = 0x1, + EPOLLOUT = 0x4, + EPOLLERR = 0x8, + EPOLLHUP = 0x10, + EPOLLRDHUP = 0x2000, + EPOLLET = -0x80000000, + EPOLL_CLOEXEC = 0x80000, + EPOLL_CTL_ADD = 0x1, + EPOLL_CTL_DEL = 0x2, + EPOLL_CTL_MOD = 0x3, +}; + +typedef struct Fpreg Fpreg; +typedef struct Fpxreg Fpxreg; +typedef struct Xmmreg Xmmreg; +typedef struct Fpstate Fpstate; +typedef struct Timespec Timespec; +typedef struct Timeval Timeval; +typedef struct SigactionT SigactionT; +typedef struct Siginfo Siginfo; +typedef struct SigaltstackT SigaltstackT; +typedef struct Sigcontext Sigcontext; +typedef struct Ucontext Ucontext; +typedef struct Itimerval Itimerval; +typedef struct EpollEvent EpollEvent; + +#pragma pack on + +struct Fpreg { + uint16 significand[4]; + uint16 exponent; +}; +struct Fpxreg { + uint16 significand[4]; + uint16 exponent; + uint16 padding[3]; +}; +struct Xmmreg { + uint32 element[4]; +}; +struct Fpstate { + uint32 cw; + uint32 sw; + uint32 tag; + uint32 ipoff; + uint32 cssel; + uint32 dataoff; + uint32 datasel; + Fpreg _st[8]; + uint16 status; + uint16 magic; + uint32 _fxsr_env[6]; + uint32 mxcsr; + uint32 reserved; + Fpxreg _fxsr_st[8]; + Xmmreg _xmm[8]; + uint32 padding1[44]; + byte anon0[48]; +}; +struct Timespec { + int32 tv_sec; + int32 tv_nsec; +}; +struct Timeval { + int32 tv_sec; + int32 tv_usec; +}; +struct SigactionT { + void *k_sa_handler; + uint32 sa_flags; + void *sa_restorer; + uint64 sa_mask; +}; +struct Siginfo { + int32 si_signo; + int32 si_errno; + int32 si_code; + byte _sifields[116]; +}; +struct SigaltstackT { + byte *ss_sp; + int32 ss_flags; + uint32 ss_size; +}; +struct Sigcontext { + uint16 gs; + uint16 __gsh; + uint16 fs; + uint16 __fsh; + uint16 es; + uint16 __esh; + uint16 ds; + uint16 __dsh; + uint32 edi; + uint32 esi; + uint32 ebp; + uint32 esp; + uint32 ebx; + uint32 edx; + uint32 ecx; + uint32 eax; + uint32 trapno; + uint32 err; + uint32 eip; + uint16 cs; + uint16 __csh; + uint32 eflags; + uint32 esp_at_signal; + uint16 ss; + uint16 __ssh; + Fpstate *fpstate; + uint32 oldmask; + uint32 cr2; +}; +struct Ucontext { + uint32 uc_flags; + Ucontext *uc_link; + SigaltstackT uc_stack; + Sigcontext uc_mcontext; + uint32 uc_sigmask; +}; +struct Itimerval { + Timeval it_interval; + Timeval it_value; +}; +struct EpollEvent { + uint32 events; + byte data[8]; // to match amd64 +}; + + +#pragma pack off diff --git a/src/runtime/defs_linux_amd64.h b/src/runtime/defs_linux_amd64.h new file mode 100644 index 000000000..14616dffe --- /dev/null +++ b/src/runtime/defs_linux_amd64.h @@ -0,0 +1,254 @@ +// Created by cgo -cdefs - DO NOT EDIT +// cgo -cdefs defs_linux.go defs1_linux.go + + +enum { + EINTR = 0x4, + EAGAIN = 0xb, + ENOMEM = 0xc, + + PROT_NONE = 0x0, + PROT_READ = 0x1, + 
PROT_WRITE = 0x2, + PROT_EXEC = 0x4, + + MAP_ANON = 0x20, + MAP_PRIVATE = 0x2, + MAP_FIXED = 0x10, + + MADV_DONTNEED = 0x4, + + SA_RESTART = 0x10000000, + SA_ONSTACK = 0x8000000, + SA_RESTORER = 0x4000000, + SA_SIGINFO = 0x4, + + SIGHUP = 0x1, + SIGINT = 0x2, + SIGQUIT = 0x3, + SIGILL = 0x4, + SIGTRAP = 0x5, + SIGABRT = 0x6, + SIGBUS = 0x7, + SIGFPE = 0x8, + SIGKILL = 0x9, + SIGUSR1 = 0xa, + SIGSEGV = 0xb, + SIGUSR2 = 0xc, + SIGPIPE = 0xd, + SIGALRM = 0xe, + SIGSTKFLT = 0x10, + SIGCHLD = 0x11, + SIGCONT = 0x12, + SIGSTOP = 0x13, + SIGTSTP = 0x14, + SIGTTIN = 0x15, + SIGTTOU = 0x16, + SIGURG = 0x17, + SIGXCPU = 0x18, + SIGXFSZ = 0x19, + SIGVTALRM = 0x1a, + SIGPROF = 0x1b, + SIGWINCH = 0x1c, + SIGIO = 0x1d, + SIGPWR = 0x1e, + SIGSYS = 0x1f, + + FPE_INTDIV = 0x1, + FPE_INTOVF = 0x2, + FPE_FLTDIV = 0x3, + FPE_FLTOVF = 0x4, + FPE_FLTUND = 0x5, + FPE_FLTRES = 0x6, + FPE_FLTINV = 0x7, + FPE_FLTSUB = 0x8, + + BUS_ADRALN = 0x1, + BUS_ADRERR = 0x2, + BUS_OBJERR = 0x3, + + SEGV_MAPERR = 0x1, + SEGV_ACCERR = 0x2, + + ITIMER_REAL = 0x0, + ITIMER_VIRTUAL = 0x1, + ITIMER_PROF = 0x2, + + EPOLLIN = 0x1, + EPOLLOUT = 0x4, + EPOLLERR = 0x8, + EPOLLHUP = 0x10, + EPOLLRDHUP = 0x2000, + EPOLLET = -0x80000000, + EPOLL_CLOEXEC = 0x80000, + EPOLL_CTL_ADD = 0x1, + EPOLL_CTL_DEL = 0x2, + EPOLL_CTL_MOD = 0x3, +}; + +typedef struct Timespec Timespec; +typedef struct Timeval Timeval; +typedef struct SigactionT SigactionT; +typedef struct Siginfo Siginfo; +typedef struct Itimerval Itimerval; +typedef struct EpollEvent EpollEvent; + +#pragma pack on + +struct Timespec { + int64 tv_sec; + int64 tv_nsec; +}; +struct Timeval { + int64 tv_sec; + int64 tv_usec; +}; +struct SigactionT { + void *sa_handler; + uint64 sa_flags; + void *sa_restorer; + uint64 sa_mask; +}; +struct Siginfo { + int32 si_signo; + int32 si_errno; + int32 si_code; + byte Pad_cgo_0[4]; + byte _sifields[112]; +}; +struct Itimerval { + Timeval it_interval; + Timeval it_value; +}; +struct EpollEvent { + uint32 events; + byte data[8]; // unaligned uintptr +}; + + +#pragma pack off +// Created by cgo -cdefs - DO NOT EDIT +// cgo -cdefs defs_linux.go defs1_linux.go + + +enum { + O_RDONLY = 0x0, + O_CLOEXEC = 0x80000, +}; + +typedef struct Usigset Usigset; +typedef struct Fpxreg Fpxreg; +typedef struct Xmmreg Xmmreg; +typedef struct Fpstate Fpstate; +typedef struct Fpxreg1 Fpxreg1; +typedef struct Xmmreg1 Xmmreg1; +typedef struct Fpstate1 Fpstate1; +typedef struct Fpreg1 Fpreg1; +typedef struct SigaltstackT SigaltstackT; +typedef struct Mcontext Mcontext; +typedef struct Ucontext Ucontext; +typedef struct Sigcontext Sigcontext; + +#pragma pack on + +struct Usigset { + uint64 __val[16]; +}; +struct Fpxreg { + uint16 significand[4]; + uint16 exponent; + uint16 padding[3]; +}; +struct Xmmreg { + uint32 element[4]; +}; +struct Fpstate { + uint16 cwd; + uint16 swd; + uint16 ftw; + uint16 fop; + uint64 rip; + uint64 rdp; + uint32 mxcsr; + uint32 mxcr_mask; + Fpxreg _st[8]; + Xmmreg _xmm[16]; + uint32 padding[24]; +}; +struct Fpxreg1 { + uint16 significand[4]; + uint16 exponent; + uint16 padding[3]; +}; +struct Xmmreg1 { + uint32 element[4]; +}; +struct Fpstate1 { + uint16 cwd; + uint16 swd; + uint16 ftw; + uint16 fop; + uint64 rip; + uint64 rdp; + uint32 mxcsr; + uint32 mxcr_mask; + Fpxreg1 _st[8]; + Xmmreg1 _xmm[16]; + uint32 padding[24]; +}; +struct Fpreg1 { + uint16 significand[4]; + uint16 exponent; +}; +struct SigaltstackT { + byte *ss_sp; + int32 ss_flags; + byte Pad_cgo_0[4]; + uint64 ss_size; +}; +struct Mcontext { + int64 gregs[23]; + Fpstate *fpregs; + 
uint64 __reserved1[8]; +}; +struct Ucontext { + uint64 uc_flags; + Ucontext *uc_link; + SigaltstackT uc_stack; + Mcontext uc_mcontext; + Usigset uc_sigmask; + Fpstate __fpregs_mem; +}; +struct Sigcontext { + uint64 r8; + uint64 r9; + uint64 r10; + uint64 r11; + uint64 r12; + uint64 r13; + uint64 r14; + uint64 r15; + uint64 rdi; + uint64 rsi; + uint64 rbp; + uint64 rbx; + uint64 rdx; + uint64 rax; + uint64 rcx; + uint64 rsp; + uint64 rip; + uint64 eflags; + uint16 cs; + uint16 gs; + uint16 fs; + uint16 __pad0; + uint64 err; + uint64 trapno; + uint64 oldmask; + uint64 cr2; + Fpstate1 *fpstate; + uint64 __reserved1[8]; +}; + + +#pragma pack off diff --git a/src/runtime/defs_linux_arm.h b/src/runtime/defs_linux_arm.h new file mode 100644 index 000000000..50b3c919e --- /dev/null +++ b/src/runtime/defs_linux_arm.h @@ -0,0 +1,168 @@ +// TODO: Generate using cgo like defs_linux_{386,amd64}.h + +// Constants +enum { + EINTR = 0x4, + ENOMEM = 0xc, + EAGAIN = 0xb, + + PROT_NONE = 0, + PROT_READ = 0x1, + PROT_WRITE = 0x2, + PROT_EXEC = 0x4, + MAP_ANON = 0x20, + MAP_PRIVATE = 0x2, + MAP_FIXED = 0x10, + MADV_DONTNEED = 0x4, + SA_RESTART = 0x10000000, + SA_ONSTACK = 0x8000000, + SA_RESTORER = 0, // unused on ARM + SA_SIGINFO = 0x4, + SIGHUP = 0x1, + SIGINT = 0x2, + SIGQUIT = 0x3, + SIGILL = 0x4, + SIGTRAP = 0x5, + SIGABRT = 0x6, + SIGBUS = 0x7, + SIGFPE = 0x8, + SIGKILL = 0x9, + SIGUSR1 = 0xa, + SIGSEGV = 0xb, + SIGUSR2 = 0xc, + SIGPIPE = 0xd, + SIGALRM = 0xe, + SIGSTKFLT = 0x10, + SIGCHLD = 0x11, + SIGCONT = 0x12, + SIGSTOP = 0x13, + SIGTSTP = 0x14, + SIGTTIN = 0x15, + SIGTTOU = 0x16, + SIGURG = 0x17, + SIGXCPU = 0x18, + SIGXFSZ = 0x19, + SIGVTALRM = 0x1a, + SIGPROF = 0x1b, + SIGWINCH = 0x1c, + SIGIO = 0x1d, + SIGPWR = 0x1e, + SIGSYS = 0x1f, + FPE_INTDIV = 0x1, + FPE_INTOVF = 0x2, + FPE_FLTDIV = 0x3, + FPE_FLTOVF = 0x4, + FPE_FLTUND = 0x5, + FPE_FLTRES = 0x6, + FPE_FLTINV = 0x7, + FPE_FLTSUB = 0x8, + BUS_ADRALN = 0x1, + BUS_ADRERR = 0x2, + BUS_OBJERR = 0x3, + SEGV_MAPERR = 0x1, + SEGV_ACCERR = 0x2, + ITIMER_REAL = 0, + ITIMER_PROF = 0x2, + ITIMER_VIRTUAL = 0x1, + O_RDONLY = 0, + O_CLOEXEC = 02000000, + + EPOLLIN = 0x1, + EPOLLOUT = 0x4, + EPOLLERR = 0x8, + EPOLLHUP = 0x10, + EPOLLRDHUP = 0x2000, + EPOLLET = -0x80000000, + EPOLL_CLOEXEC = 0x80000, + EPOLL_CTL_ADD = 0x1, + EPOLL_CTL_DEL = 0x2, + EPOLL_CTL_MOD = 0x3, +}; + +// Types +#pragma pack on + +typedef struct Timespec Timespec; +struct Timespec { + int32 tv_sec; + int32 tv_nsec; +}; + +typedef struct SigaltstackT SigaltstackT; +struct SigaltstackT { + void *ss_sp; + int32 ss_flags; + uint32 ss_size; +}; + +typedef struct Sigcontext Sigcontext; +struct Sigcontext { + uint32 trap_no; + uint32 error_code; + uint32 oldmask; + uint32 arm_r0; + uint32 arm_r1; + uint32 arm_r2; + uint32 arm_r3; + uint32 arm_r4; + uint32 arm_r5; + uint32 arm_r6; + uint32 arm_r7; + uint32 arm_r8; + uint32 arm_r9; + uint32 arm_r10; + uint32 arm_fp; + uint32 arm_ip; + uint32 arm_sp; + uint32 arm_lr; + uint32 arm_pc; + uint32 arm_cpsr; + uint32 fault_address; +}; + +typedef struct Ucontext Ucontext; +struct Ucontext { + uint32 uc_flags; + Ucontext *uc_link; + SigaltstackT uc_stack; + Sigcontext uc_mcontext; + uint32 uc_sigmask; + int32 __unused[31]; + uint32 uc_regspace[128]; +}; + +typedef struct Timeval Timeval; +struct Timeval { + int32 tv_sec; + int32 tv_usec; +}; + +typedef struct Itimerval Itimerval; +struct Itimerval { + Timeval it_interval; + Timeval it_value; +}; + +typedef struct Siginfo Siginfo; +struct Siginfo { + int32 si_signo; + int32 si_errno; + int32 si_code; + 
uint8 _sifields[4]; +}; + +typedef struct SigactionT SigactionT; +struct SigactionT { + void *sa_handler; + uint32 sa_flags; + void *sa_restorer; + uint64 sa_mask; +}; + +typedef struct EpollEvent EpollEvent; +struct EpollEvent { + uint32 events; + uint32 _pad; + byte data[8]; // to match amd64 +}; +#pragma pack off diff --git a/src/runtime/defs_nacl_386.h b/src/runtime/defs_nacl_386.h new file mode 100644 index 000000000..e8fbb38e1 --- /dev/null +++ b/src/runtime/defs_nacl_386.h @@ -0,0 +1,63 @@ +// Copyright 2013 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// Created by hand, not machine generated. + +enum +{ + // These values are referred to in the source code + // but really don't matter. Even so, use the standard numbers. + SIGSEGV = 11, + SIGPROF = 27, +}; + +typedef struct Siginfo Siginfo; + +// native_client/src/trusted/service_runtime/include/machine/_types.h +typedef struct Timespec Timespec; + +struct Timespec +{ + int64 tv_sec; + int32 tv_nsec; +}; + +// native_client/src/trusted/service_runtime/nacl_exception.h +// native_client/src/include/nacl/nacl_exception.h + +typedef struct ExcContext ExcContext; +typedef struct ExcPortable ExcPortable; +typedef struct ExcRegs386 ExcRegs386; + +struct ExcRegs386 +{ + uint32 eax; + uint32 ecx; + uint32 edx; + uint32 ebx; + uint32 esp; + uint32 ebp; + uint32 esi; + uint32 edi; + uint32 eip; + uint32 eflags; +}; + +struct ExcContext +{ + uint32 size; + uint32 portable_context_offset; + uint32 portable_context_size; + uint32 arch; + uint32 regs_size; + uint32 reserved[11]; + ExcRegs386 regs; +}; + +struct ExcPortableContext +{ + uint32 pc; + uint32 sp; + uint32 fp; +}; diff --git a/src/runtime/defs_nacl_amd64p32.h b/src/runtime/defs_nacl_amd64p32.h new file mode 100644 index 000000000..45663d40a --- /dev/null +++ b/src/runtime/defs_nacl_amd64p32.h @@ -0,0 +1,90 @@ +// Copyright 2013 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// Created by hand, not machine generated. + +enum +{ + // These values are referred to in the source code + // but really don't matter. Even so, use the standard numbers. 
+ SIGSEGV = 11, + SIGPROF = 27, +}; + +typedef struct Siginfo Siginfo; + + +// native_client/src/trusted/service_runtime/include/machine/_types.h +typedef struct Timespec Timespec; + +struct Timespec +{ + int64 tv_sec; + int32 tv_nsec; +}; + +// native_client/src/trusted/service_runtime/nacl_exception.h +// native_client/src/include/nacl/nacl_exception.h + +typedef struct ExcContext ExcContext; +typedef struct ExcPortable ExcPortable; +typedef struct ExcRegs386 ExcRegs386; +typedef struct ExcRegsAmd64 ExcRegsAmd64; + +struct ExcRegs386 +{ + uint32 eax; + uint32 ecx; + uint32 edx; + uint32 ebx; + uint32 esp; + uint32 ebp; + uint32 esi; + uint32 edi; + uint32 eip; + uint32 eflags; +}; + +struct ExcRegsAmd64 +{ + uint64 rax; + uint64 rcx; + uint64 rdx; + uint64 rbx; + uint64 rsp; + uint64 rbp; + uint64 rsi; + uint64 rdi; + uint64 r8; + uint64 r9; + uint64 r10; + uint64 r11; + uint64 r12; + uint64 r13; + uint64 r14; + uint64 r15; + uint64 rip; + uint32 rflags; +}; + +struct ExcContext +{ + uint32 size; + uint32 portable_context_offset; + uint32 portable_context_size; + uint32 arch; + uint32 regs_size; + uint32 reserved[11]; + union { + ExcRegs386 regs; + ExcRegsAmd64 regs64; + } regs; +}; + +struct ExcPortableContext +{ + uint32 pc; + uint32 sp; + uint32 fp; +}; diff --git a/src/runtime/defs_nacl_arm.h b/src/runtime/defs_nacl_arm.h new file mode 100644 index 000000000..9ce07ccb2 --- /dev/null +++ b/src/runtime/defs_nacl_arm.h @@ -0,0 +1,70 @@ +// Copyright 2014 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// Created by hand, not machine generated. + +enum +{ + // These values are referred to in the source code + // but really don't matter. Even so, use the standard numbers. + SIGSEGV = 11, + SIGPROF = 27, +}; + +typedef struct Siginfo Siginfo; + +// native_client/src/trusted/service_runtime/include/machine/_types.h +typedef struct Timespec Timespec; + +struct Timespec +{ + int64 tv_sec; + int32 tv_nsec; +}; + +// native_client/src/trusted/service_runtime/nacl_exception.h +// native_client/src/include/nacl/nacl_exception.h + +typedef struct ExcContext ExcContext; +typedef struct ExcPortable ExcPortable; +typedef struct ExcRegsARM ExcRegsARM; + +struct ExcRegsARM +{ + uint32 r0; + uint32 r1; + uint32 r2; + uint32 r3; + uint32 r4; + uint32 r5; + uint32 r6; + uint32 r7; + uint32 r8; + uint32 r9; // the value reported here is undefined. + uint32 r10; + uint32 r11; + uint32 r12; + uint32 sp; /* r13 */ + uint32 lr; /* r14 */ + uint32 pc; /* r15 */ + uint32 cpsr; +}; + +struct ExcContext +{ + uint32 size; + uint32 portable_context_offset; + uint32 portable_context_size; + uint32 arch; + uint32 regs_size; + uint32 reserved[11]; + ExcRegsARM regs; +}; + +struct ExcPortableContext +{ + uint32 pc; + uint32 sp; + uint32 fp; +}; diff --git a/src/runtime/defs_netbsd.go b/src/runtime/defs_netbsd.go new file mode 100644 index 000000000..b27949e42 --- /dev/null +++ b/src/runtime/defs_netbsd.go @@ -0,0 +1,125 @@ +// Copyright 2009 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// +build ignore + +/* +Input to cgo. 
+ +GOARCH=amd64 go tool cgo -cdefs defs_netbsd.go defs_netbsd_amd64.go >defs_netbsd_amd64.h +GOARCH=386 go tool cgo -cdefs defs_netbsd.go defs_netbsd_386.go >defs_netbsd_386.h +GOARCH=arm go tool cgo -cdefs defs_netbsd.go defs_netbsd_arm.go >defs_netbsd_arm.h +*/ + +// +godefs map __fpregset_t [644]byte + +package runtime + +/* +#include <sys/types.h> +#include <sys/mman.h> +#include <sys/signal.h> +#include <sys/event.h> +#include <sys/time.h> +#include <sys/ucontext.h> +#include <sys/unistd.h> +#include <errno.h> +#include <signal.h> +*/ +import "C" + +const ( + EINTR = C.EINTR + EFAULT = C.EFAULT + + PROT_NONE = C.PROT_NONE + PROT_READ = C.PROT_READ + PROT_WRITE = C.PROT_WRITE + PROT_EXEC = C.PROT_EXEC + + MAP_ANON = C.MAP_ANON + MAP_PRIVATE = C.MAP_PRIVATE + MAP_FIXED = C.MAP_FIXED + + MADV_FREE = C.MADV_FREE + + SA_SIGINFO = C.SA_SIGINFO + SA_RESTART = C.SA_RESTART + SA_ONSTACK = C.SA_ONSTACK + + SIGHUP = C.SIGHUP + SIGINT = C.SIGINT + SIGQUIT = C.SIGQUIT + SIGILL = C.SIGILL + SIGTRAP = C.SIGTRAP + SIGABRT = C.SIGABRT + SIGEMT = C.SIGEMT + SIGFPE = C.SIGFPE + SIGKILL = C.SIGKILL + SIGBUS = C.SIGBUS + SIGSEGV = C.SIGSEGV + SIGSYS = C.SIGSYS + SIGPIPE = C.SIGPIPE + SIGALRM = C.SIGALRM + SIGTERM = C.SIGTERM + SIGURG = C.SIGURG + SIGSTOP = C.SIGSTOP + SIGTSTP = C.SIGTSTP + SIGCONT = C.SIGCONT + SIGCHLD = C.SIGCHLD + SIGTTIN = C.SIGTTIN + SIGTTOU = C.SIGTTOU + SIGIO = C.SIGIO + SIGXCPU = C.SIGXCPU + SIGXFSZ = C.SIGXFSZ + SIGVTALRM = C.SIGVTALRM + SIGPROF = C.SIGPROF + SIGWINCH = C.SIGWINCH + SIGINFO = C.SIGINFO + SIGUSR1 = C.SIGUSR1 + SIGUSR2 = C.SIGUSR2 + + FPE_INTDIV = C.FPE_INTDIV + FPE_INTOVF = C.FPE_INTOVF + FPE_FLTDIV = C.FPE_FLTDIV + FPE_FLTOVF = C.FPE_FLTOVF + FPE_FLTUND = C.FPE_FLTUND + FPE_FLTRES = C.FPE_FLTRES + FPE_FLTINV = C.FPE_FLTINV + FPE_FLTSUB = C.FPE_FLTSUB + + BUS_ADRALN = C.BUS_ADRALN + BUS_ADRERR = C.BUS_ADRERR + BUS_OBJERR = C.BUS_OBJERR + + SEGV_MAPERR = C.SEGV_MAPERR + SEGV_ACCERR = C.SEGV_ACCERR + + ITIMER_REAL = C.ITIMER_REAL + ITIMER_VIRTUAL = C.ITIMER_VIRTUAL + ITIMER_PROF = C.ITIMER_PROF + + EV_ADD = C.EV_ADD + EV_DELETE = C.EV_DELETE + EV_CLEAR = C.EV_CLEAR + EV_RECEIPT = 0 + EV_ERROR = C.EV_ERROR + EVFILT_READ = C.EVFILT_READ + EVFILT_WRITE = C.EVFILT_WRITE +) + +type SigaltstackT C.struct_sigaltstack +type Sigset C.sigset_t +type Siginfo C.struct__ksiginfo + +type StackT C.stack_t + +type Timespec C.struct_timespec +type Timeval C.struct_timeval +type Itimerval C.struct_itimerval + +type McontextT C.mcontext_t +type UcontextT C.ucontext_t + +type Kevent C.struct_kevent diff --git a/src/runtime/defs_netbsd_386.go b/src/runtime/defs_netbsd_386.go new file mode 100644 index 000000000..c26f24607 --- /dev/null +++ b/src/runtime/defs_netbsd_386.go @@ -0,0 +1,41 @@ +// Copyright 2009 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// +build ignore + +/* +Input to cgo. 
+ +GOARCH=386 go tool cgo -cdefs defs_netbsd.go defs_netbsd_386.go >defs_netbsd_386.h +*/ + +package runtime + +/* +#include <sys/types.h> +#include <machine/mcontext.h> +*/ +import "C" + +const ( + REG_GS = C._REG_GS + REG_FS = C._REG_FS + REG_ES = C._REG_ES + REG_DS = C._REG_DS + REG_EDI = C._REG_EDI + REG_ESI = C._REG_ESI + REG_EBP = C._REG_EBP + REG_ESP = C._REG_ESP + REG_EBX = C._REG_EBX + REG_EDX = C._REG_EDX + REG_ECX = C._REG_ECX + REG_EAX = C._REG_EAX + REG_TRAPNO = C._REG_TRAPNO + REG_ERR = C._REG_ERR + REG_EIP = C._REG_EIP + REG_CS = C._REG_CS + REG_EFL = C._REG_EFL + REG_UESP = C._REG_UESP + REG_SS = C._REG_SS +) diff --git a/src/runtime/defs_netbsd_386.h b/src/runtime/defs_netbsd_386.h new file mode 100644 index 000000000..fd87804f9 --- /dev/null +++ b/src/runtime/defs_netbsd_386.h @@ -0,0 +1,182 @@ +// Created by cgo -cdefs - DO NOT EDIT +// cgo -cdefs defs_netbsd.go defs_netbsd_386.go + + +enum { + EINTR = 0x4, + EFAULT = 0xe, + + PROT_NONE = 0x0, + PROT_READ = 0x1, + PROT_WRITE = 0x2, + PROT_EXEC = 0x4, + + MAP_ANON = 0x1000, + MAP_PRIVATE = 0x2, + MAP_FIXED = 0x10, + + MADV_FREE = 0x6, + + SA_SIGINFO = 0x40, + SA_RESTART = 0x2, + SA_ONSTACK = 0x1, + + SIGHUP = 0x1, + SIGINT = 0x2, + SIGQUIT = 0x3, + SIGILL = 0x4, + SIGTRAP = 0x5, + SIGABRT = 0x6, + SIGEMT = 0x7, + SIGFPE = 0x8, + SIGKILL = 0x9, + SIGBUS = 0xa, + SIGSEGV = 0xb, + SIGSYS = 0xc, + SIGPIPE = 0xd, + SIGALRM = 0xe, + SIGTERM = 0xf, + SIGURG = 0x10, + SIGSTOP = 0x11, + SIGTSTP = 0x12, + SIGCONT = 0x13, + SIGCHLD = 0x14, + SIGTTIN = 0x15, + SIGTTOU = 0x16, + SIGIO = 0x17, + SIGXCPU = 0x18, + SIGXFSZ = 0x19, + SIGVTALRM = 0x1a, + SIGPROF = 0x1b, + SIGWINCH = 0x1c, + SIGINFO = 0x1d, + SIGUSR1 = 0x1e, + SIGUSR2 = 0x1f, + + FPE_INTDIV = 0x1, + FPE_INTOVF = 0x2, + FPE_FLTDIV = 0x3, + FPE_FLTOVF = 0x4, + FPE_FLTUND = 0x5, + FPE_FLTRES = 0x6, + FPE_FLTINV = 0x7, + FPE_FLTSUB = 0x8, + + BUS_ADRALN = 0x1, + BUS_ADRERR = 0x2, + BUS_OBJERR = 0x3, + + SEGV_MAPERR = 0x1, + SEGV_ACCERR = 0x2, + + ITIMER_REAL = 0x0, + ITIMER_VIRTUAL = 0x1, + ITIMER_PROF = 0x2, + + EV_ADD = 0x1, + EV_DELETE = 0x2, + EV_CLEAR = 0x20, + EV_RECEIPT = 0, + EV_ERROR = 0x4000, + EVFILT_READ = 0x0, + EVFILT_WRITE = 0x1, +}; + +typedef struct SigaltstackT SigaltstackT; +typedef struct Sigset Sigset; +typedef struct Siginfo Siginfo; +typedef struct StackT StackT; +typedef struct Timespec Timespec; +typedef struct Timeval Timeval; +typedef struct Itimerval Itimerval; +typedef struct McontextT McontextT; +typedef struct UcontextT UcontextT; +typedef struct KeventT KeventT; + +#pragma pack on + +struct SigaltstackT { + byte *ss_sp; + uint32 ss_size; + int32 ss_flags; +}; +struct Sigset { + uint32 __bits[4]; +}; +struct Siginfo { + int32 _signo; + int32 _code; + int32 _errno; + byte _reason[20]; +}; + +struct StackT { + byte *ss_sp; + uint32 ss_size; + int32 ss_flags; +}; + +struct Timespec { + int64 tv_sec; + int32 tv_nsec; +}; +struct Timeval { + int64 tv_sec; + int32 tv_usec; +}; +struct Itimerval { + Timeval it_interval; + Timeval it_value; +}; + +struct McontextT { + int32 __gregs[19]; + byte __fpregs[644]; + int32 _mc_tlsbase; +}; +struct UcontextT { + uint32 uc_flags; + UcontextT *uc_link; + Sigset uc_sigmask; + StackT uc_stack; + McontextT uc_mcontext; + int32 __uc_pad[4]; +}; + +struct KeventT { + uint32 ident; + uint32 filter; + uint32 flags; + uint32 fflags; + int64 data; + byte *udata; +}; + + +#pragma pack off +// Created by cgo -cdefs - DO NOT EDIT +// cgo -cdefs defs_netbsd.go defs_netbsd_386.go + + +enum { + REG_GS = 0x0, + REG_FS = 0x1, + 
REG_ES = 0x2, + REG_DS = 0x3, + REG_EDI = 0x4, + REG_ESI = 0x5, + REG_EBP = 0x6, + REG_ESP = 0x7, + REG_EBX = 0x8, + REG_EDX = 0x9, + REG_ECX = 0xa, + REG_EAX = 0xb, + REG_TRAPNO = 0xc, + REG_ERR = 0xd, + REG_EIP = 0xe, + REG_CS = 0xf, + REG_EFL = 0x10, + REG_UESP = 0x11, + REG_SS = 0x12, +}; + diff --git a/src/runtime/defs_netbsd_amd64.go b/src/runtime/defs_netbsd_amd64.go new file mode 100644 index 000000000..f18a7b1fe --- /dev/null +++ b/src/runtime/defs_netbsd_amd64.go @@ -0,0 +1,48 @@ +// Copyright 2009 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// +build ignore + +/* +Input to cgo. + +GOARCH=amd64 go tool cgo -cdefs defs_netbsd.go defs_netbsd_amd64.go >defs_netbsd_amd64.h +*/ + +package runtime + +/* +#include <sys/types.h> +#include <machine/mcontext.h> +*/ +import "C" + +const ( + REG_RDI = C._REG_RDI + REG_RSI = C._REG_RSI + REG_RDX = C._REG_RDX + REG_RCX = C._REG_RCX + REG_R8 = C._REG_R8 + REG_R9 = C._REG_R9 + REG_R10 = C._REG_R10 + REG_R11 = C._REG_R11 + REG_R12 = C._REG_R12 + REG_R13 = C._REG_R13 + REG_R14 = C._REG_R14 + REG_R15 = C._REG_R15 + REG_RBP = C._REG_RBP + REG_RBX = C._REG_RBX + REG_RAX = C._REG_RAX + REG_GS = C._REG_GS + REG_FS = C._REG_FS + REG_ES = C._REG_ES + REG_DS = C._REG_DS + REG_TRAPNO = C._REG_TRAPNO + REG_ERR = C._REG_ERR + REG_RIP = C._REG_RIP + REG_CS = C._REG_CS + REG_RFLAGS = C._REG_RFLAGS + REG_RSP = C._REG_RSP + REG_SS = C._REG_SS +) diff --git a/src/runtime/defs_netbsd_amd64.h b/src/runtime/defs_netbsd_amd64.h new file mode 100644 index 000000000..dac94b113 --- /dev/null +++ b/src/runtime/defs_netbsd_amd64.h @@ -0,0 +1,194 @@ +// Created by cgo -cdefs - DO NOT EDIT +// cgo -cdefs defs_netbsd.go defs_netbsd_amd64.go + + +enum { + EINTR = 0x4, + EFAULT = 0xe, + + PROT_NONE = 0x0, + PROT_READ = 0x1, + PROT_WRITE = 0x2, + PROT_EXEC = 0x4, + + MAP_ANON = 0x1000, + MAP_PRIVATE = 0x2, + MAP_FIXED = 0x10, + + MADV_FREE = 0x6, + + SA_SIGINFO = 0x40, + SA_RESTART = 0x2, + SA_ONSTACK = 0x1, + + SIGHUP = 0x1, + SIGINT = 0x2, + SIGQUIT = 0x3, + SIGILL = 0x4, + SIGTRAP = 0x5, + SIGABRT = 0x6, + SIGEMT = 0x7, + SIGFPE = 0x8, + SIGKILL = 0x9, + SIGBUS = 0xa, + SIGSEGV = 0xb, + SIGSYS = 0xc, + SIGPIPE = 0xd, + SIGALRM = 0xe, + SIGTERM = 0xf, + SIGURG = 0x10, + SIGSTOP = 0x11, + SIGTSTP = 0x12, + SIGCONT = 0x13, + SIGCHLD = 0x14, + SIGTTIN = 0x15, + SIGTTOU = 0x16, + SIGIO = 0x17, + SIGXCPU = 0x18, + SIGXFSZ = 0x19, + SIGVTALRM = 0x1a, + SIGPROF = 0x1b, + SIGWINCH = 0x1c, + SIGINFO = 0x1d, + SIGUSR1 = 0x1e, + SIGUSR2 = 0x1f, + + FPE_INTDIV = 0x1, + FPE_INTOVF = 0x2, + FPE_FLTDIV = 0x3, + FPE_FLTOVF = 0x4, + FPE_FLTUND = 0x5, + FPE_FLTRES = 0x6, + FPE_FLTINV = 0x7, + FPE_FLTSUB = 0x8, + + BUS_ADRALN = 0x1, + BUS_ADRERR = 0x2, + BUS_OBJERR = 0x3, + + SEGV_MAPERR = 0x1, + SEGV_ACCERR = 0x2, + + ITIMER_REAL = 0x0, + ITIMER_VIRTUAL = 0x1, + ITIMER_PROF = 0x2, + + EV_ADD = 0x1, + EV_DELETE = 0x2, + EV_CLEAR = 0x20, + EV_RECEIPT = 0, + EV_ERROR = 0x4000, + EVFILT_READ = 0x0, + EVFILT_WRITE = 0x1, +}; + +typedef struct SigaltstackT SigaltstackT; +typedef struct Sigset Sigset; +typedef struct Siginfo Siginfo; +typedef struct StackT StackT; +typedef struct Timespec Timespec; +typedef struct Timeval Timeval; +typedef struct Itimerval Itimerval; +typedef struct McontextT McontextT; +typedef struct UcontextT UcontextT; +typedef struct KeventT KeventT; + +#pragma pack on + +struct SigaltstackT { + byte *ss_sp; + uint64 ss_size; + int32 ss_flags; + byte Pad_cgo_0[4]; +}; +struct 
Sigset { + uint32 __bits[4]; +}; +struct Siginfo { + int32 _signo; + int32 _code; + int32 _errno; + int32 _pad; + byte _reason[24]; +}; + +struct StackT { + byte *ss_sp; + uint64 ss_size; + int32 ss_flags; + byte Pad_cgo_0[4]; +}; + +struct Timespec { + int64 tv_sec; + int64 tv_nsec; +}; +struct Timeval { + int64 tv_sec; + int32 tv_usec; + byte Pad_cgo_0[4]; +}; +struct Itimerval { + Timeval it_interval; + Timeval it_value; +}; + +struct McontextT { + uint64 __gregs[26]; + uint64 _mc_tlsbase; + int8 __fpregs[512]; +}; +struct UcontextT { + uint32 uc_flags; + byte Pad_cgo_0[4]; + UcontextT *uc_link; + Sigset uc_sigmask; + StackT uc_stack; + McontextT uc_mcontext; +}; + +struct KeventT { + uint64 ident; + uint32 filter; + uint32 flags; + uint32 fflags; + byte Pad_cgo_0[4]; + int64 data; + byte *udata; +}; + + +#pragma pack off +// Created by cgo -cdefs - DO NOT EDIT +// cgo -cdefs defs_netbsd.go defs_netbsd_amd64.go + + +enum { + REG_RDI = 0x0, + REG_RSI = 0x1, + REG_RDX = 0x2, + REG_RCX = 0x3, + REG_R8 = 0x4, + REG_R9 = 0x5, + REG_R10 = 0x6, + REG_R11 = 0x7, + REG_R12 = 0x8, + REG_R13 = 0x9, + REG_R14 = 0xa, + REG_R15 = 0xb, + REG_RBP = 0xc, + REG_RBX = 0xd, + REG_RAX = 0xe, + REG_GS = 0xf, + REG_FS = 0x10, + REG_ES = 0x11, + REG_DS = 0x12, + REG_TRAPNO = 0x13, + REG_ERR = 0x14, + REG_RIP = 0x15, + REG_CS = 0x16, + REG_RFLAGS = 0x17, + REG_RSP = 0x18, + REG_SS = 0x19, +}; + diff --git a/src/runtime/defs_netbsd_arm.go b/src/runtime/defs_netbsd_arm.go new file mode 100644 index 000000000..cb0dce66b --- /dev/null +++ b/src/runtime/defs_netbsd_arm.go @@ -0,0 +1,39 @@ +// Copyright 2013 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// +build ignore + +/* +Input to cgo. 
+ +GOARCH=arm go tool cgo -cdefs defs_netbsd.go defs_netbsd_arm.go >defs_netbsd_arm.h +*/ + +package runtime + +/* +#include <sys/types.h> +#include <machine/mcontext.h> +*/ +import "C" + +const ( + REG_R0 = C._REG_R0 + REG_R1 = C._REG_R1 + REG_R2 = C._REG_R2 + REG_R3 = C._REG_R3 + REG_R4 = C._REG_R4 + REG_R5 = C._REG_R5 + REG_R6 = C._REG_R6 + REG_R7 = C._REG_R7 + REG_R8 = C._REG_R8 + REG_R9 = C._REG_R9 + REG_R10 = C._REG_R10 + REG_R11 = C._REG_R11 + REG_R12 = C._REG_R12 + REG_R13 = C._REG_R13 + REG_R14 = C._REG_R14 + REG_R15 = C._REG_R15 + REG_CPSR = C._REG_CPSR +) diff --git a/src/runtime/defs_netbsd_arm.h b/src/runtime/defs_netbsd_arm.h new file mode 100644 index 000000000..70f34af47 --- /dev/null +++ b/src/runtime/defs_netbsd_arm.h @@ -0,0 +1,184 @@ +// Created by cgo -cdefs - DO NOT EDIT +// cgo -cdefs defs_netbsd.go defs_netbsd_arm.go + + +enum { + EINTR = 0x4, + EFAULT = 0xe, + + PROT_NONE = 0x0, + PROT_READ = 0x1, + PROT_WRITE = 0x2, + PROT_EXEC = 0x4, + + MAP_ANON = 0x1000, + MAP_PRIVATE = 0x2, + MAP_FIXED = 0x10, + + MADV_FREE = 0x6, + + SA_SIGINFO = 0x40, + SA_RESTART = 0x2, + SA_ONSTACK = 0x1, + + SIGHUP = 0x1, + SIGINT = 0x2, + SIGQUIT = 0x3, + SIGILL = 0x4, + SIGTRAP = 0x5, + SIGABRT = 0x6, + SIGEMT = 0x7, + SIGFPE = 0x8, + SIGKILL = 0x9, + SIGBUS = 0xa, + SIGSEGV = 0xb, + SIGSYS = 0xc, + SIGPIPE = 0xd, + SIGALRM = 0xe, + SIGTERM = 0xf, + SIGURG = 0x10, + SIGSTOP = 0x11, + SIGTSTP = 0x12, + SIGCONT = 0x13, + SIGCHLD = 0x14, + SIGTTIN = 0x15, + SIGTTOU = 0x16, + SIGIO = 0x17, + SIGXCPU = 0x18, + SIGXFSZ = 0x19, + SIGVTALRM = 0x1a, + SIGPROF = 0x1b, + SIGWINCH = 0x1c, + SIGINFO = 0x1d, + SIGUSR1 = 0x1e, + SIGUSR2 = 0x1f, + + FPE_INTDIV = 0x1, + FPE_INTOVF = 0x2, + FPE_FLTDIV = 0x3, + FPE_FLTOVF = 0x4, + FPE_FLTUND = 0x5, + FPE_FLTRES = 0x6, + FPE_FLTINV = 0x7, + FPE_FLTSUB = 0x8, + + BUS_ADRALN = 0x1, + BUS_ADRERR = 0x2, + BUS_OBJERR = 0x3, + + SEGV_MAPERR = 0x1, + SEGV_ACCERR = 0x2, + + ITIMER_REAL = 0x0, + ITIMER_VIRTUAL = 0x1, + ITIMER_PROF = 0x2, + + EV_ADD = 0x1, + EV_DELETE = 0x2, + EV_CLEAR = 0x20, + EV_RECEIPT = 0, + EV_ERROR = 0x4000, + EVFILT_READ = 0x0, + EVFILT_WRITE = 0x1, +}; + +typedef struct SigaltstackT SigaltstackT; +typedef struct Sigset Sigset; +typedef struct Siginfo Siginfo; +typedef struct StackT StackT; +typedef struct Timespec Timespec; +typedef struct Timeval Timeval; +typedef struct Itimerval Itimerval; +typedef struct McontextT McontextT; +typedef struct UcontextT UcontextT; +typedef struct KeventT KeventT; + +#pragma pack on + +struct SigaltstackT { + byte *ss_sp; + uint32 ss_size; + int32 ss_flags; +}; +struct Sigset { + uint32 __bits[4]; +}; +struct Siginfo { + int32 _signo; + int32 _code; + int32 _errno; + byte _reason[20]; +}; + +struct StackT { + byte *ss_sp; + uint32 ss_size; + int32 ss_flags; +}; + +struct Timespec { + int64 tv_sec; + int32 tv_nsec; +}; +struct Timeval { + int64 tv_sec; + int32 tv_usec; +}; +struct Itimerval { + Timeval it_interval; + Timeval it_value; +}; + +struct McontextT { + uint32 __gregs[17]; +#ifdef __ARM_EABI__ + byte __fpu[4+8*32+4]; +#else + byte __fpu[4+4*33+4]; +#endif + uint32 _mc_tlsbase; +}; +struct UcontextT { + uint32 uc_flags; + UcontextT *uc_link; + Sigset uc_sigmask; + StackT uc_stack; + McontextT uc_mcontext; + int32 __uc_pad[2]; +}; + +struct KeventT { + uint32 ident; + uint32 filter; + uint32 flags; + uint32 fflags; + int64 data; + byte *udata; +}; + + +#pragma pack off +// Created by cgo -cdefs - DO NOT EDIT +// cgo -cdefs defs_netbsd.go defs_netbsd_arm.go + + +enum { + REG_R0 = 0x0, + REG_R1 = 0x1, + 
REG_R2 = 0x2, + REG_R3 = 0x3, + REG_R4 = 0x4, + REG_R5 = 0x5, + REG_R6 = 0x6, + REG_R7 = 0x7, + REG_R8 = 0x8, + REG_R9 = 0x9, + REG_R10 = 0xa, + REG_R11 = 0xb, + REG_R12 = 0xc, + REG_R13 = 0xd, + REG_R14 = 0xe, + REG_R15 = 0xf, + REG_CPSR = 0x10, +}; + diff --git a/src/runtime/defs_openbsd.go b/src/runtime/defs_openbsd.go new file mode 100644 index 000000000..39224c988 --- /dev/null +++ b/src/runtime/defs_openbsd.go @@ -0,0 +1,121 @@ +// Copyright 2009 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// +build ignore + +/* +Input to cgo. + +GOARCH=amd64 go tool cgo -cdefs defs_openbsd.go >defs_openbsd_amd64.h +GOARCH=386 go tool cgo -cdefs defs_openbsd.go >defs_openbsd_386.h +*/ + +package runtime + +/* +#include <sys/types.h> +#include <sys/event.h> +#include <sys/mman.h> +#include <sys/time.h> +#include <sys/unistd.h> +#include <sys/signal.h> +#include <errno.h> +#include <signal.h> +*/ +import "C" + +const ( + EINTR = C.EINTR + EFAULT = C.EFAULT + + PROT_NONE = C.PROT_NONE + PROT_READ = C.PROT_READ + PROT_WRITE = C.PROT_WRITE + PROT_EXEC = C.PROT_EXEC + + MAP_ANON = C.MAP_ANON + MAP_PRIVATE = C.MAP_PRIVATE + MAP_FIXED = C.MAP_FIXED + + MADV_FREE = C.MADV_FREE + + SA_SIGINFO = C.SA_SIGINFO + SA_RESTART = C.SA_RESTART + SA_ONSTACK = C.SA_ONSTACK + + SIGHUP = C.SIGHUP + SIGINT = C.SIGINT + SIGQUIT = C.SIGQUIT + SIGILL = C.SIGILL + SIGTRAP = C.SIGTRAP + SIGABRT = C.SIGABRT + SIGEMT = C.SIGEMT + SIGFPE = C.SIGFPE + SIGKILL = C.SIGKILL + SIGBUS = C.SIGBUS + SIGSEGV = C.SIGSEGV + SIGSYS = C.SIGSYS + SIGPIPE = C.SIGPIPE + SIGALRM = C.SIGALRM + SIGTERM = C.SIGTERM + SIGURG = C.SIGURG + SIGSTOP = C.SIGSTOP + SIGTSTP = C.SIGTSTP + SIGCONT = C.SIGCONT + SIGCHLD = C.SIGCHLD + SIGTTIN = C.SIGTTIN + SIGTTOU = C.SIGTTOU + SIGIO = C.SIGIO + SIGXCPU = C.SIGXCPU + SIGXFSZ = C.SIGXFSZ + SIGVTALRM = C.SIGVTALRM + SIGPROF = C.SIGPROF + SIGWINCH = C.SIGWINCH + SIGINFO = C.SIGINFO + SIGUSR1 = C.SIGUSR1 + SIGUSR2 = C.SIGUSR2 + + FPE_INTDIV = C.FPE_INTDIV + FPE_INTOVF = C.FPE_INTOVF + FPE_FLTDIV = C.FPE_FLTDIV + FPE_FLTOVF = C.FPE_FLTOVF + FPE_FLTUND = C.FPE_FLTUND + FPE_FLTRES = C.FPE_FLTRES + FPE_FLTINV = C.FPE_FLTINV + FPE_FLTSUB = C.FPE_FLTSUB + + BUS_ADRALN = C.BUS_ADRALN + BUS_ADRERR = C.BUS_ADRERR + BUS_OBJERR = C.BUS_OBJERR + + SEGV_MAPERR = C.SEGV_MAPERR + SEGV_ACCERR = C.SEGV_ACCERR + + ITIMER_REAL = C.ITIMER_REAL + ITIMER_VIRTUAL = C.ITIMER_VIRTUAL + ITIMER_PROF = C.ITIMER_PROF + + EV_ADD = C.EV_ADD + EV_DELETE = C.EV_DELETE + EV_CLEAR = C.EV_CLEAR + EV_ERROR = C.EV_ERROR + EVFILT_READ = C.EVFILT_READ + EVFILT_WRITE = C.EVFILT_WRITE +) + +type TforkT C.struct___tfork + +type SigaltstackT C.struct_sigaltstack +type Sigcontext C.struct_sigcontext +type Siginfo C.siginfo_t +type Sigset C.sigset_t +type Sigval C.union_sigval + +type StackT C.stack_t + +type Timespec C.struct_timespec +type Timeval C.struct_timeval +type Itimerval C.struct_itimerval + +type KeventT C.struct_kevent diff --git a/src/runtime/defs_openbsd_386.h b/src/runtime/defs_openbsd_386.h new file mode 100644 index 000000000..6b77e0084 --- /dev/null +++ b/src/runtime/defs_openbsd_386.h @@ -0,0 +1,168 @@ +// Created by cgo -cdefs - DO NOT EDIT +// cgo -cdefs defs_openbsd.go + + +enum { + EINTR = 0x4, + EFAULT = 0xe, + + PROT_NONE = 0x0, + PROT_READ = 0x1, + PROT_WRITE = 0x2, + PROT_EXEC = 0x4, + + MAP_ANON = 0x1000, + MAP_PRIVATE = 0x2, + MAP_FIXED = 0x10, + + MADV_FREE = 0x6, + + SA_SIGINFO = 0x40, + SA_RESTART = 0x2, + SA_ONSTACK = 0x1, + + 
SIGHUP = 0x1, + SIGINT = 0x2, + SIGQUIT = 0x3, + SIGILL = 0x4, + SIGTRAP = 0x5, + SIGABRT = 0x6, + SIGEMT = 0x7, + SIGFPE = 0x8, + SIGKILL = 0x9, + SIGBUS = 0xa, + SIGSEGV = 0xb, + SIGSYS = 0xc, + SIGPIPE = 0xd, + SIGALRM = 0xe, + SIGTERM = 0xf, + SIGURG = 0x10, + SIGSTOP = 0x11, + SIGTSTP = 0x12, + SIGCONT = 0x13, + SIGCHLD = 0x14, + SIGTTIN = 0x15, + SIGTTOU = 0x16, + SIGIO = 0x17, + SIGXCPU = 0x18, + SIGXFSZ = 0x19, + SIGVTALRM = 0x1a, + SIGPROF = 0x1b, + SIGWINCH = 0x1c, + SIGINFO = 0x1d, + SIGUSR1 = 0x1e, + SIGUSR2 = 0x1f, + + FPE_INTDIV = 0x1, + FPE_INTOVF = 0x2, + FPE_FLTDIV = 0x3, + FPE_FLTOVF = 0x4, + FPE_FLTUND = 0x5, + FPE_FLTRES = 0x6, + FPE_FLTINV = 0x7, + FPE_FLTSUB = 0x8, + + BUS_ADRALN = 0x1, + BUS_ADRERR = 0x2, + BUS_OBJERR = 0x3, + + SEGV_MAPERR = 0x1, + SEGV_ACCERR = 0x2, + + ITIMER_REAL = 0x0, + ITIMER_VIRTUAL = 0x1, + ITIMER_PROF = 0x2, + + EV_ADD = 0x1, + EV_DELETE = 0x2, + EV_CLEAR = 0x20, + EV_ERROR = 0x4000, + EVFILT_READ = -0x1, + EVFILT_WRITE = -0x2, +}; + +typedef struct TforkT TforkT; +typedef struct SigaltstackT SigaltstackT; +typedef struct Sigcontext Sigcontext; +typedef struct Siginfo Siginfo; +typedef struct StackT StackT; +typedef struct Timespec Timespec; +typedef struct Timeval Timeval; +typedef struct Itimerval Itimerval; +typedef struct KeventT KeventT; + +#pragma pack on + +struct TforkT { + byte *tf_tcb; + int32 *tf_tid; + byte *tf_stack; +}; + +struct SigaltstackT { + byte *ss_sp; + uint32 ss_size; + int32 ss_flags; +}; +struct Sigcontext { + int32 sc_gs; + int32 sc_fs; + int32 sc_es; + int32 sc_ds; + int32 sc_edi; + int32 sc_esi; + int32 sc_ebp; + int32 sc_ebx; + int32 sc_edx; + int32 sc_ecx; + int32 sc_eax; + int32 sc_eip; + int32 sc_cs; + int32 sc_eflags; + int32 sc_esp; + int32 sc_ss; + int32 __sc_unused; + int32 sc_mask; + int32 sc_trapno; + int32 sc_err; + void *sc_fpstate; +}; +struct Siginfo { + int32 si_signo; + int32 si_code; + int32 si_errno; + byte _data[116]; +}; +typedef uint32 Sigset; +typedef byte Sigval[4]; + +struct StackT { + byte *ss_sp; + uint32 ss_size; + int32 ss_flags; +}; + +struct Timespec { + int64 tv_sec; + int32 tv_nsec; +}; +struct Timeval { + int64 tv_sec; + int32 tv_usec; +}; +struct Itimerval { + Timeval it_interval; + Timeval it_value; +}; + +struct KeventT { + uint32 ident; + int16 filter; + uint16 flags; + uint32 fflags; + int64 data; + byte *udata; +}; + + +#pragma pack off diff --git a/src/runtime/defs_openbsd_amd64.h b/src/runtime/defs_openbsd_amd64.h new file mode 100644 index 000000000..761e8e47d --- /dev/null +++ b/src/runtime/defs_openbsd_amd64.h @@ -0,0 +1,179 @@ +// Created by cgo -cdefs - DO NOT EDIT +// cgo -cdefs defs_openbsd.go + + +enum { + EINTR = 0x4, + EFAULT = 0xe, + + PROT_NONE = 0x0, + PROT_READ = 0x1, + PROT_WRITE = 0x2, + PROT_EXEC = 0x4, + + MAP_ANON = 0x1000, + MAP_PRIVATE = 0x2, + MAP_FIXED = 0x10, + + MADV_FREE = 0x6, + + SA_SIGINFO = 0x40, + SA_RESTART = 0x2, + SA_ONSTACK = 0x1, + + SIGHUP = 0x1, + SIGINT = 0x2, + SIGQUIT = 0x3, + SIGILL = 0x4, + SIGTRAP = 0x5, + SIGABRT = 0x6, + SIGEMT = 0x7, + SIGFPE = 0x8, + SIGKILL = 0x9, + SIGBUS = 0xa, + SIGSEGV = 0xb, + SIGSYS = 0xc, + SIGPIPE = 0xd, + SIGALRM = 0xe, + SIGTERM = 0xf, + SIGURG = 0x10, + SIGSTOP = 0x11, + SIGTSTP = 0x12, + SIGCONT = 0x13, + SIGCHLD = 0x14, + SIGTTIN = 0x15, + SIGTTOU = 0x16, + SIGIO = 0x17, + SIGXCPU = 0x18, + SIGXFSZ = 0x19, + SIGVTALRM = 0x1a, + SIGPROF = 0x1b, + SIGWINCH = 0x1c, + SIGINFO = 0x1d, + SIGUSR1 = 0x1e, + SIGUSR2 = 0x1f, + + FPE_INTDIV = 0x1, + FPE_INTOVF = 0x2, + FPE_FLTDIV = 0x3, + FPE_FLTOVF = 0x4, 
+ FPE_FLTUND = 0x5, + FPE_FLTRES = 0x6, + FPE_FLTINV = 0x7, + FPE_FLTSUB = 0x8, + + BUS_ADRALN = 0x1, + BUS_ADRERR = 0x2, + BUS_OBJERR = 0x3, + + SEGV_MAPERR = 0x1, + SEGV_ACCERR = 0x2, + + ITIMER_REAL = 0x0, + ITIMER_VIRTUAL = 0x1, + ITIMER_PROF = 0x2, + + EV_ADD = 0x1, + EV_DELETE = 0x2, + EV_CLEAR = 0x20, + EV_ERROR = 0x4000, + EVFILT_READ = -0x1, + EVFILT_WRITE = -0x2, +}; + +typedef struct TforkT TforkT; +typedef struct SigaltstackT SigaltstackT; +typedef struct Sigcontext Sigcontext; +typedef struct Siginfo Siginfo; +typedef struct StackT StackT; +typedef struct Timespec Timespec; +typedef struct Timeval Timeval; +typedef struct Itimerval Itimerval; +typedef struct KeventT KeventT; + +#pragma pack on + +struct TforkT { + byte *tf_tcb; + int32 *tf_tid; + byte *tf_stack; +}; + +struct SigaltstackT { + byte *ss_sp; + uint64 ss_size; + int32 ss_flags; + byte Pad_cgo_0[4]; +}; +struct Sigcontext { + int64 sc_rdi; + int64 sc_rsi; + int64 sc_rdx; + int64 sc_rcx; + int64 sc_r8; + int64 sc_r9; + int64 sc_r10; + int64 sc_r11; + int64 sc_r12; + int64 sc_r13; + int64 sc_r14; + int64 sc_r15; + int64 sc_rbp; + int64 sc_rbx; + int64 sc_rax; + int64 sc_gs; + int64 sc_fs; + int64 sc_es; + int64 sc_ds; + int64 sc_trapno; + int64 sc_err; + int64 sc_rip; + int64 sc_cs; + int64 sc_rflags; + int64 sc_rsp; + int64 sc_ss; + void *sc_fpstate; + int32 __sc_unused; + int32 sc_mask; +}; +struct Siginfo { + int32 si_signo; + int32 si_code; + int32 si_errno; + byte Pad_cgo_0[4]; + byte _data[120]; +}; +typedef uint32 Sigset; +typedef byte Sigval[8]; + +struct StackT { + byte *ss_sp; + uint64 ss_size; + int32 ss_flags; + byte Pad_cgo_0[4]; +}; + +struct Timespec { + int64 tv_sec; + int64 tv_nsec; +}; +struct Timeval { + int64 tv_sec; + int64 tv_usec; +}; +struct Itimerval { + Timeval it_interval; + Timeval it_value; +}; + +struct KeventT { + uint64 ident; + int16 filter; + uint16 flags; + uint32 fflags; + int64 data; + byte *udata; +}; + + +#pragma pack off diff --git a/src/runtime/defs_plan9_386.h b/src/runtime/defs_plan9_386.h new file mode 100644 index 000000000..a762b8589 --- /dev/null +++ b/src/runtime/defs_plan9_386.h @@ -0,0 +1,26 @@ +#define PAGESIZE 0x1000 + +typedef struct Ureg Ureg; + +struct Ureg +{ + uint32 di; /* general registers */ + uint32 si; /* ... */ + uint32 bp; /* ... */ + uint32 nsp; + uint32 bx; /* ... */ + uint32 dx; /* ... */ + uint32 cx; /* ... */ + uint32 ax; /* ... */ + uint32 gs; /* data segments */ + uint32 fs; /* ... */ + uint32 es; /* ... */ + uint32 ds; /* ... 
*/ + uint32 trap; /* trap type */ + uint32 ecode; /* error code (or zero) */ + uint32 pc; /* pc */ + uint32 cs; /* old context */ + uint32 flags; /* old flags */ + uint32 sp; + uint32 ss; /* old stack segment */ +}; diff --git a/src/runtime/defs_plan9_amd64.h b/src/runtime/defs_plan9_amd64.h new file mode 100644 index 000000000..20bca479c --- /dev/null +++ b/src/runtime/defs_plan9_amd64.h @@ -0,0 +1,34 @@ +#define PAGESIZE 0x1000 + +typedef struct Ureg Ureg; + +struct Ureg { + uint64 ax; + uint64 bx; + uint64 cx; + uint64 dx; + uint64 si; + uint64 di; + uint64 bp; + uint64 r8; + uint64 r9; + uint64 r10; + uint64 r11; + uint64 r12; + uint64 r13; + uint64 r14; + uint64 r15; + + uint16 ds; + uint16 es; + uint16 fs; + uint16 gs; + + uint64 type; + uint64 error; /* error code (or zero) */ + uint64 ip; /* pc */ + uint64 cs; /* old context */ + uint64 flags; /* old flags */ + uint64 sp; /* sp */ + uint64 ss; /* old stack segment */ +}; diff --git a/src/runtime/defs_solaris.go b/src/runtime/defs_solaris.go new file mode 100644 index 000000000..ba44e5fd4 --- /dev/null +++ b/src/runtime/defs_solaris.go @@ -0,0 +1,156 @@ +// Copyright 2009 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// +build ignore + +/* +Input to cgo. + +GOARCH=amd64 go tool cgo -cdefs defs_solaris.go >defs_solaris_amd64.h +*/ + +package runtime + +/* +#include <sys/types.h> +#include <sys/mman.h> +#include <sys/select.h> +#include <sys/siginfo.h> +#include <sys/signal.h> +#include <sys/stat.h> +#include <sys/time.h> +#include <sys/ucontext.h> +#include <sys/regset.h> +#include <sys/unistd.h> +#include <sys/fork.h> +#include <sys/port.h> +#include <semaphore.h> +#include <errno.h> +#include <signal.h> +#include <pthread.h> +#include <netdb.h> +*/ +import "C" + +const ( + EINTR = C.EINTR + EBADF = C.EBADF + EFAULT = C.EFAULT + EAGAIN = C.EAGAIN + ETIMEDOUT = C.ETIMEDOUT + EWOULDBLOCK = C.EWOULDBLOCK + EINPROGRESS = C.EINPROGRESS + + PROT_NONE = C.PROT_NONE + PROT_READ = C.PROT_READ + PROT_WRITE = C.PROT_WRITE + PROT_EXEC = C.PROT_EXEC + + MAP_ANON = C.MAP_ANON + MAP_PRIVATE = C.MAP_PRIVATE + MAP_FIXED = C.MAP_FIXED + + MADV_FREE = C.MADV_FREE + + SA_SIGINFO = C.SA_SIGINFO + SA_RESTART = C.SA_RESTART + SA_ONSTACK = C.SA_ONSTACK + + SIGHUP = C.SIGHUP + SIGINT = C.SIGINT + SIGQUIT = C.SIGQUIT + SIGILL = C.SIGILL + SIGTRAP = C.SIGTRAP + SIGABRT = C.SIGABRT + SIGEMT = C.SIGEMT + SIGFPE = C.SIGFPE + SIGKILL = C.SIGKILL + SIGBUS = C.SIGBUS + SIGSEGV = C.SIGSEGV + SIGSYS = C.SIGSYS + SIGPIPE = C.SIGPIPE + SIGALRM = C.SIGALRM + SIGTERM = C.SIGTERM + SIGURG = C.SIGURG + SIGSTOP = C.SIGSTOP + SIGTSTP = C.SIGTSTP + SIGCONT = C.SIGCONT + SIGCHLD = C.SIGCHLD + SIGTTIN = C.SIGTTIN + SIGTTOU = C.SIGTTOU + SIGIO = C.SIGIO + SIGXCPU = C.SIGXCPU + SIGXFSZ = C.SIGXFSZ + SIGVTALRM = C.SIGVTALRM + SIGPROF = C.SIGPROF + SIGWINCH = C.SIGWINCH + SIGUSR1 = C.SIGUSR1 + SIGUSR2 = C.SIGUSR2 + + FPE_INTDIV = C.FPE_INTDIV + FPE_INTOVF = C.FPE_INTOVF + FPE_FLTDIV = C.FPE_FLTDIV + FPE_FLTOVF = C.FPE_FLTOVF + FPE_FLTUND = C.FPE_FLTUND + FPE_FLTRES = C.FPE_FLTRES + FPE_FLTINV = C.FPE_FLTINV + FPE_FLTSUB = C.FPE_FLTSUB + + BUS_ADRALN = C.BUS_ADRALN + BUS_ADRERR = C.BUS_ADRERR + BUS_OBJERR = C.BUS_OBJERR + + SEGV_MAPERR = C.SEGV_MAPERR + SEGV_ACCERR = C.SEGV_ACCERR + + ITIMER_REAL = C.ITIMER_REAL + ITIMER_VIRTUAL = C.ITIMER_VIRTUAL + ITIMER_PROF = C.ITIMER_PROF + + _SC_NPROCESSORS_ONLN = C._SC_NPROCESSORS_ONLN + + PTHREAD_CREATE_DETACHED = C.PTHREAD_CREATE_DETACHED + + 
FORK_NOSIGCHLD = C.FORK_NOSIGCHLD + FORK_WAITPID = C.FORK_WAITPID + + MAXHOSTNAMELEN = C.MAXHOSTNAMELEN + + O_NONBLOCK = C.O_NONBLOCK + FD_CLOEXEC = C.FD_CLOEXEC + F_GETFL = C.F_GETFL + F_SETFL = C.F_SETFL + F_SETFD = C.F_SETFD + + POLLIN = C.POLLIN + POLLOUT = C.POLLOUT + POLLHUP = C.POLLHUP + POLLERR = C.POLLERR + + PORT_SOURCE_FD = C.PORT_SOURCE_FD +) + +type SemT C.sem_t + +type SigaltstackT C.struct_sigaltstack +type Sigset C.sigset_t +type StackT C.stack_t + +type Siginfo C.siginfo_t +type Sigaction C.struct_sigaction + +type Fpregset C.fpregset_t +type Mcontext C.mcontext_t +type Ucontext C.ucontext_t + +type Timespec C.struct_timespec +type Timeval C.struct_timeval +type Itimerval C.struct_itimerval + +type PortEvent C.port_event_t +type Pthread C.pthread_t +type PthreadAttr C.pthread_attr_t + +// depends on Timespec, must appear below +type Stat C.struct_stat diff --git a/src/runtime/defs_solaris_amd64.go b/src/runtime/defs_solaris_amd64.go new file mode 100644 index 000000000..049317888 --- /dev/null +++ b/src/runtime/defs_solaris_amd64.go @@ -0,0 +1,48 @@ +// Copyright 2014 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// +build ignore + +/* +Input to cgo. + +GOARCH=amd64 go tool cgo -cdefs defs_solaris.go defs_solaris_amd64.go >defs_solaris_amd64.h +*/ + +package runtime + +/* +#include <sys/types.h> +#include <sys/regset.h> +*/ +import "C" + +const ( + REG_RDI = C.REG_RDI + REG_RSI = C.REG_RSI + REG_RDX = C.REG_RDX + REG_RCX = C.REG_RCX + REG_R8 = C.REG_R8 + REG_R9 = C.REG_R9 + REG_R10 = C.REG_R10 + REG_R11 = C.REG_R11 + REG_R12 = C.REG_R12 + REG_R13 = C.REG_R13 + REG_R14 = C.REG_R14 + REG_R15 = C.REG_R15 + REG_RBP = C.REG_RBP + REG_RBX = C.REG_RBX + REG_RAX = C.REG_RAX + REG_GS = C.REG_GS + REG_FS = C.REG_FS + REG_ES = C.REG_ES + REG_DS = C.REG_DS + REG_TRAPNO = C.REG_TRAPNO + REG_ERR = C.REG_ERR + REG_RIP = C.REG_RIP + REG_CS = C.REG_CS + REG_RFLAGS = C.REG_RFL + REG_RSP = C.REG_RSP + REG_SS = C.REG_SS +) diff --git a/src/runtime/defs_solaris_amd64.h b/src/runtime/defs_solaris_amd64.h new file mode 100644 index 000000000..cb1cfeadc --- /dev/null +++ b/src/runtime/defs_solaris_amd64.h @@ -0,0 +1,254 @@ +// Created by cgo -cdefs - DO NOT EDIT +// cgo -cdefs defs_solaris.go defs_solaris_amd64.go + + +enum { + EINTR = 0x4, + EBADF = 0x9, + EFAULT = 0xe, + EAGAIN = 0xb, + ETIMEDOUT = 0x91, + EWOULDBLOCK = 0xb, + EINPROGRESS = 0x96, + + PROT_NONE = 0x0, + PROT_READ = 0x1, + PROT_WRITE = 0x2, + PROT_EXEC = 0x4, + + MAP_ANON = 0x100, + MAP_PRIVATE = 0x2, + MAP_FIXED = 0x10, + + MADV_FREE = 0x5, + + SA_SIGINFO = 0x8, + SA_RESTART = 0x4, + SA_ONSTACK = 0x1, + + SIGHUP = 0x1, + SIGINT = 0x2, + SIGQUIT = 0x3, + SIGILL = 0x4, + SIGTRAP = 0x5, + SIGABRT = 0x6, + SIGEMT = 0x7, + SIGFPE = 0x8, + SIGKILL = 0x9, + SIGBUS = 0xa, + SIGSEGV = 0xb, + SIGSYS = 0xc, + SIGPIPE = 0xd, + SIGALRM = 0xe, + SIGTERM = 0xf, + SIGURG = 0x15, + SIGSTOP = 0x17, + SIGTSTP = 0x18, + SIGCONT = 0x19, + SIGCHLD = 0x12, + SIGTTIN = 0x1a, + SIGTTOU = 0x1b, + SIGIO = 0x16, + SIGXCPU = 0x1e, + SIGXFSZ = 0x1f, + SIGVTALRM = 0x1c, + SIGPROF = 0x1d, + SIGWINCH = 0x14, + SIGUSR1 = 0x10, + SIGUSR2 = 0x11, + + FPE_INTDIV = 0x1, + FPE_INTOVF = 0x2, + FPE_FLTDIV = 0x3, + FPE_FLTOVF = 0x4, + FPE_FLTUND = 0x5, + FPE_FLTRES = 0x6, + FPE_FLTINV = 0x7, + FPE_FLTSUB = 0x8, + + BUS_ADRALN = 0x1, + BUS_ADRERR = 0x2, + BUS_OBJERR = 0x3, + + SEGV_MAPERR = 0x1, + SEGV_ACCERR = 0x2, + + ITIMER_REAL = 0x0, + ITIMER_VIRTUAL = 0x1, + 
ITIMER_PROF = 0x2, + + _SC_NPROCESSORS_ONLN = 0xf, + + PTHREAD_CREATE_DETACHED = 0x40, + + FORK_NOSIGCHLD = 0x1, + FORK_WAITPID = 0x2, + + MAXHOSTNAMELEN = 0x100, + + O_NONBLOCK = 0x80, + FD_CLOEXEC = 0x1, + F_GETFL = 0x3, + F_SETFL = 0x4, + F_SETFD = 0x2, + + POLLIN = 0x1, + POLLOUT = 0x4, + POLLHUP = 0x10, + POLLERR = 0x8, + + PORT_SOURCE_FD = 0x4, +}; + +typedef struct SemT SemT; +typedef struct SigaltstackT SigaltstackT; +typedef struct Sigset Sigset; +typedef struct StackT StackT; +typedef struct Siginfo Siginfo; +typedef struct SigactionT SigactionT; +typedef struct Fpregset Fpregset; +typedef struct Mcontext Mcontext; +typedef struct Ucontext Ucontext; +typedef struct Timespec Timespec; +typedef struct Timeval Timeval; +typedef struct Itimerval Itimerval; +typedef struct PortEvent PortEvent; +typedef struct PthreadAttr PthreadAttr; +typedef struct Stat Stat; + +#pragma pack on + +struct SemT { + uint32 sem_count; + uint16 sem_type; + uint16 sem_magic; + uint64 sem_pad1[3]; + uint64 sem_pad2[2]; +}; + +struct SigaltstackT { + byte *ss_sp; + uint64 ss_size; + int32 ss_flags; + byte Pad_cgo_0[4]; +}; +struct Sigset { + uint32 __sigbits[4]; +}; +struct StackT { + byte *ss_sp; + uint64 ss_size; + int32 ss_flags; + byte Pad_cgo_0[4]; +}; + +struct Siginfo { + int32 si_signo; + int32 si_code; + int32 si_errno; + int32 si_pad; + byte __data[240]; +}; +struct SigactionT { + int32 sa_flags; + byte Pad_cgo_0[4]; + byte _funcptr[8]; + Sigset sa_mask; +}; + +struct Fpregset { + byte fp_reg_set[528]; +}; +struct Mcontext { + int64 gregs[28]; + Fpregset fpregs; +}; +struct Ucontext { + uint64 uc_flags; + Ucontext *uc_link; + Sigset uc_sigmask; + StackT uc_stack; + byte Pad_cgo_0[8]; + Mcontext uc_mcontext; + int64 uc_filler[5]; + byte Pad_cgo_1[8]; +}; + +struct Timespec { + int64 tv_sec; + int64 tv_nsec; +}; +struct Timeval { + int64 tv_sec; + int64 tv_usec; +}; +struct Itimerval { + Timeval it_interval; + Timeval it_value; +}; + +struct PortEvent { + int32 portev_events; + uint16 portev_source; + uint16 portev_pad; + uint64 portev_object; + byte *portev_user; +}; +typedef uint32 Pthread; +struct PthreadAttr { + byte *__pthread_attrp; +}; + +struct Stat { + uint64 st_dev; + uint64 st_ino; + uint32 st_mode; + uint32 st_nlink; + uint32 st_uid; + uint32 st_gid; + uint64 st_rdev; + int64 st_size; + Timespec st_atim; + Timespec st_mtim; + Timespec st_ctim; + int32 st_blksize; + byte Pad_cgo_0[4]; + int64 st_blocks; + int8 st_fstype[16]; +}; + + +#pragma pack off +// Created by cgo -cdefs - DO NOT EDIT +// cgo -cdefs defs_solaris.go defs_solaris_amd64.go + + +enum { + REG_RDI = 0x8, + REG_RSI = 0x9, + REG_RDX = 0xc, + REG_RCX = 0xd, + REG_R8 = 0x7, + REG_R9 = 0x6, + REG_R10 = 0x5, + REG_R11 = 0x4, + REG_R12 = 0x3, + REG_R13 = 0x2, + REG_R14 = 0x1, + REG_R15 = 0x0, + REG_RBP = 0xa, + REG_RBX = 0xb, + REG_RAX = 0xe, + REG_GS = 0x17, + REG_FS = 0x16, + REG_ES = 0x18, + REG_DS = 0x19, + REG_TRAPNO = 0xf, + REG_ERR = 0x10, + REG_RIP = 0x11, + REG_CS = 0x12, + REG_RFLAGS = 0x13, + REG_RSP = 0x14, + REG_SS = 0x15, +}; + diff --git a/src/runtime/defs_windows.go b/src/runtime/defs_windows.go new file mode 100644 index 000000000..7ce679741 --- /dev/null +++ b/src/runtime/defs_windows.go @@ -0,0 +1,73 @@ +// Copyright 2009 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// +build ignore + +/* +Input to cgo. 
+ +GOARCH=amd64 go tool cgo -cdefs defs_windows.go > defs_windows_amd64.h +GOARCH=386 go tool cgo -cdefs defs_windows.go > defs_windows_386.h +*/ + +package runtime + +/* +#include <signal.h> +#include <stdarg.h> +#include <windef.h> +#include <winbase.h> +#include <wincon.h> + +#ifndef _X86_ +typedef struct {} FLOATING_SAVE_AREA; +#endif +#ifndef _AMD64_ +typedef struct {} M128A; +#endif +*/ +import "C" + +const ( + PROT_NONE = 0 + PROT_READ = 1 + PROT_WRITE = 2 + PROT_EXEC = 4 + + MAP_ANON = 1 + MAP_PRIVATE = 2 + + DUPLICATE_SAME_ACCESS = C.DUPLICATE_SAME_ACCESS + THREAD_PRIORITY_HIGHEST = C.THREAD_PRIORITY_HIGHEST + + SIGINT = C.SIGINT + CTRL_C_EVENT = C.CTRL_C_EVENT + CTRL_BREAK_EVENT = C.CTRL_BREAK_EVENT + + CONTEXT_CONTROL = C.CONTEXT_CONTROL + CONTEXT_FULL = C.CONTEXT_FULL + + EXCEPTION_ACCESS_VIOLATION = C.STATUS_ACCESS_VIOLATION + EXCEPTION_BREAKPOINT = C.STATUS_BREAKPOINT + EXCEPTION_FLT_DENORMAL_OPERAND = C.STATUS_FLOAT_DENORMAL_OPERAND + EXCEPTION_FLT_DIVIDE_BY_ZERO = C.STATUS_FLOAT_DIVIDE_BY_ZERO + EXCEPTION_FLT_INEXACT_RESULT = C.STATUS_FLOAT_INEXACT_RESULT + EXCEPTION_FLT_OVERFLOW = C.STATUS_FLOAT_OVERFLOW + EXCEPTION_FLT_UNDERFLOW = C.STATUS_FLOAT_UNDERFLOW + EXCEPTION_INT_DIVIDE_BY_ZERO = C.STATUS_INTEGER_DIVIDE_BY_ZERO + EXCEPTION_INT_OVERFLOW = C.STATUS_INTEGER_OVERFLOW + + INFINITE = C.INFINITE + WAIT_TIMEOUT = C.WAIT_TIMEOUT + + EXCEPTION_CONTINUE_EXECUTION = C.EXCEPTION_CONTINUE_EXECUTION + EXCEPTION_CONTINUE_SEARCH = C.EXCEPTION_CONTINUE_SEARCH +) + +type SystemInfo C.SYSTEM_INFO +type ExceptionRecord C.EXCEPTION_RECORD +type FloatingSaveArea C.FLOATING_SAVE_AREA +type M128a C.M128A +type Context C.CONTEXT +type Overlapped C.OVERLAPPED diff --git a/src/runtime/defs_windows_386.h b/src/runtime/defs_windows_386.h new file mode 100644 index 000000000..2317c04f6 --- /dev/null +++ b/src/runtime/defs_windows_386.h @@ -0,0 +1,116 @@ +// Created by cgo -cdefs - DO NOT EDIT +// cgo -cdefs defs_windows.go + + +enum { + PROT_NONE = 0, + PROT_READ = 1, + PROT_WRITE = 2, + PROT_EXEC = 4, + + MAP_ANON = 1, + MAP_PRIVATE = 2, + + DUPLICATE_SAME_ACCESS = 0x2, + THREAD_PRIORITY_HIGHEST = 0x2, + + SIGINT = 0x2, + CTRL_C_EVENT = 0x0, + CTRL_BREAK_EVENT = 0x1, + + CONTEXT_CONTROL = 0x10001, + CONTEXT_FULL = 0x10007, + + EXCEPTION_ACCESS_VIOLATION = 0xc0000005, + EXCEPTION_BREAKPOINT = 0x80000003, + EXCEPTION_FLT_DENORMAL_OPERAND = 0xc000008d, + EXCEPTION_FLT_DIVIDE_BY_ZERO = 0xc000008e, + EXCEPTION_FLT_INEXACT_RESULT = 0xc000008f, + EXCEPTION_FLT_OVERFLOW = 0xc0000091, + EXCEPTION_FLT_UNDERFLOW = 0xc0000093, + EXCEPTION_INT_DIVIDE_BY_ZERO = 0xc0000094, + EXCEPTION_INT_OVERFLOW = 0xc0000095, + + INFINITE = 0xffffffff, + WAIT_TIMEOUT = 0x102, + + EXCEPTION_CONTINUE_EXECUTION = -0x1, + EXCEPTION_CONTINUE_SEARCH = 0x0, +}; + +typedef struct SystemInfo SystemInfo; +typedef struct ExceptionRecord ExceptionRecord; +typedef struct FloatingSaveArea FloatingSaveArea; +typedef struct M128a M128a; +typedef struct Context Context; +typedef struct Overlapped Overlapped; + +#pragma pack on + +struct SystemInfo { + byte anon0[4]; + uint32 dwPageSize; + byte *lpMinimumApplicationAddress; + byte *lpMaximumApplicationAddress; + uint32 dwActiveProcessorMask; + uint32 dwNumberOfProcessors; + uint32 dwProcessorType; + uint32 dwAllocationGranularity; + uint16 wProcessorLevel; + uint16 wProcessorRevision; +}; +struct ExceptionRecord { + uint32 ExceptionCode; + uint32 ExceptionFlags; + ExceptionRecord *ExceptionRecord; + byte *ExceptionAddress; + uint32 NumberParameters; + uint32 ExceptionInformation[15]; +}; 
+struct FloatingSaveArea { + uint32 ControlWord; + uint32 StatusWord; + uint32 TagWord; + uint32 ErrorOffset; + uint32 ErrorSelector; + uint32 DataOffset; + uint32 DataSelector; + uint8 RegisterArea[80]; + uint32 Cr0NpxState; +}; +struct Context { + uint32 ContextFlags; + uint32 Dr0; + uint32 Dr1; + uint32 Dr2; + uint32 Dr3; + uint32 Dr6; + uint32 Dr7; + FloatingSaveArea FloatSave; + uint32 SegGs; + uint32 SegFs; + uint32 SegEs; + uint32 SegDs; + uint32 Edi; + uint32 Esi; + uint32 Ebx; + uint32 Edx; + uint32 Ecx; + uint32 Eax; + uint32 Ebp; + uint32 Eip; + uint32 SegCs; + uint32 EFlags; + uint32 Esp; + uint32 SegSs; + uint8 ExtendedRegisters[512]; +}; +struct Overlapped { + uint32 Internal; + uint32 InternalHigh; + byte anon0[8]; + byte *hEvent; +}; + + +#pragma pack off diff --git a/src/runtime/defs_windows_amd64.h b/src/runtime/defs_windows_amd64.h new file mode 100644 index 000000000..7f37a7a8c --- /dev/null +++ b/src/runtime/defs_windows_amd64.h @@ -0,0 +1,131 @@ +// Created by cgo -cdefs - DO NOT EDIT +// cgo -cdefs defs_windows.go + + +enum { + PROT_NONE = 0, + PROT_READ = 1, + PROT_WRITE = 2, + PROT_EXEC = 4, + + MAP_ANON = 1, + MAP_PRIVATE = 2, + + DUPLICATE_SAME_ACCESS = 0x2, + THREAD_PRIORITY_HIGHEST = 0x2, + + SIGINT = 0x2, + CTRL_C_EVENT = 0x0, + CTRL_BREAK_EVENT = 0x1, + + CONTEXT_CONTROL = 0x100001, + CONTEXT_FULL = 0x10000b, + + EXCEPTION_ACCESS_VIOLATION = 0xc0000005, + EXCEPTION_BREAKPOINT = 0x80000003, + EXCEPTION_FLT_DENORMAL_OPERAND = 0xc000008d, + EXCEPTION_FLT_DIVIDE_BY_ZERO = 0xc000008e, + EXCEPTION_FLT_INEXACT_RESULT = 0xc000008f, + EXCEPTION_FLT_OVERFLOW = 0xc0000091, + EXCEPTION_FLT_UNDERFLOW = 0xc0000093, + EXCEPTION_INT_DIVIDE_BY_ZERO = 0xc0000094, + EXCEPTION_INT_OVERFLOW = 0xc0000095, + + INFINITE = 0xffffffff, + WAIT_TIMEOUT = 0x102, + + EXCEPTION_CONTINUE_EXECUTION = -0x1, + EXCEPTION_CONTINUE_SEARCH = 0x0, +}; + +typedef struct SystemInfo SystemInfo; +typedef struct ExceptionRecord ExceptionRecord; +typedef struct FloatingSaveArea FloatingSaveArea; +typedef struct M128a M128a; +typedef struct Context Context; +typedef struct Overlapped Overlapped; + +#pragma pack on + +struct SystemInfo { + byte anon0[4]; + uint32 dwPageSize; + byte *lpMinimumApplicationAddress; + byte *lpMaximumApplicationAddress; + uint64 dwActiveProcessorMask; + uint32 dwNumberOfProcessors; + uint32 dwProcessorType; + uint32 dwAllocationGranularity; + uint16 wProcessorLevel; + uint16 wProcessorRevision; +}; +struct ExceptionRecord { + uint32 ExceptionCode; + uint32 ExceptionFlags; + ExceptionRecord *ExceptionRecord; + byte *ExceptionAddress; + uint32 NumberParameters; + byte Pad_cgo_0[4]; + uint64 ExceptionInformation[15]; +}; +struct M128a { + uint64 Low; + int64 High; +}; +struct Context { + uint64 P1Home; + uint64 P2Home; + uint64 P3Home; + uint64 P4Home; + uint64 P5Home; + uint64 P6Home; + uint32 ContextFlags; + uint32 MxCsr; + uint16 SegCs; + uint16 SegDs; + uint16 SegEs; + uint16 SegFs; + uint16 SegGs; + uint16 SegSs; + uint32 EFlags; + uint64 Dr0; + uint64 Dr1; + uint64 Dr2; + uint64 Dr3; + uint64 Dr6; + uint64 Dr7; + uint64 Rax; + uint64 Rcx; + uint64 Rdx; + uint64 Rbx; + uint64 Rsp; + uint64 Rbp; + uint64 Rsi; + uint64 Rdi; + uint64 R8; + uint64 R9; + uint64 R10; + uint64 R11; + uint64 R12; + uint64 R13; + uint64 R14; + uint64 R15; + uint64 Rip; + byte anon0[512]; + M128a VectorRegister[26]; + uint64 VectorControl; + uint64 DebugControl; + uint64 LastBranchToRip; + uint64 LastBranchFromRip; + uint64 LastExceptionToRip; + uint64 LastExceptionFromRip; +}; +struct Overlapped { + 
uint64 Internal; + uint64 InternalHigh; + byte anon0[8]; + byte *hEvent; +}; + + +#pragma pack off diff --git a/src/runtime/env_plan9.go b/src/runtime/env_plan9.go new file mode 100644 index 000000000..e442c3483 --- /dev/null +++ b/src/runtime/env_plan9.go @@ -0,0 +1,56 @@ +// Copyright 2012 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package runtime + +import "unsafe" + +func getenv(s *byte) *byte { + val := gogetenv(gostringnocopy(s)) + if val == "" { + return nil + } + // Strings found in environment are NUL-terminated. + return &bytes(val)[0] +} + +var tracebackbuf [128]byte + +func gogetenv(key string) string { + var file [128]byte + if len(key) > len(file)-6 { + return "" + } + + copy(file[:], "/env/") + copy(file[5:], key) + + fd := open(&file[0], _OREAD, 0) + if fd < 0 { + return "" + } + n := seek(fd, 0, 2) + if n <= 0 { + close(fd) + return "" + } + + p := make([]byte, n) + + r := pread(fd, unsafe.Pointer(&p[0]), int32(n), 0) + close(fd) + if r < 0 { + return "" + } + + if p[r-1] == 0 { + r-- + } + + var s string + sp := (*_string)(unsafe.Pointer(&s)) + sp.str = &p[0] + sp.len = int(r) + return s +} diff --git a/src/runtime/env_posix.go b/src/runtime/env_posix.go new file mode 100644 index 000000000..dd57872d7 --- /dev/null +++ b/src/runtime/env_posix.go @@ -0,0 +1,63 @@ +// Copyright 2012 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// +build darwin dragonfly freebsd linux nacl netbsd openbsd solaris windows + +package runtime + +import "unsafe" + +func environ() []string + +func getenv(s *byte) *byte { + val := gogetenv(gostringnocopy(s)) + if val == "" { + return nil + } + // Strings found in environment are NUL-terminated. + return &bytes(val)[0] +} + +func gogetenv(key string) string { + env := environ() + if env == nil { + gothrow("getenv before env init") + } + for _, s := range environ() { + if len(s) > len(key) && s[len(key)] == '=' && s[:len(key)] == key { + return s[len(key)+1:] + } + } + return "" +} + +var _cgo_setenv uintptr // pointer to C function +var _cgo_unsetenv uintptr // pointer to C function + +// Update the C environment if cgo is loaded. +// Called from syscall.Setenv. +func syscall_setenv_c(k string, v string) { + if _cgo_setenv == 0 { + return + } + arg := [2]unsafe.Pointer{cstring(k), cstring(v)} + asmcgocall(unsafe.Pointer(_cgo_setenv), unsafe.Pointer(&arg)) +} + +// Update the C environment if cgo is loaded. +// Called from syscall.unsetenv. +func syscall_unsetenv_c(k string) { + if _cgo_unsetenv == 0 { + return + } + arg := [1]unsafe.Pointer{cstring(k)} + asmcgocall(unsafe.Pointer(_cgo_unsetenv), unsafe.Pointer(&arg)) +} + +func cstring(s string) unsafe.Pointer { + p := make([]byte, len(s)+1) + sp := (*_string)(unsafe.Pointer(&s)) + memmove(unsafe.Pointer(&p[0]), unsafe.Pointer(sp.str), uintptr(len(s))) + return unsafe.Pointer(&p[0]) +} diff --git a/src/runtime/error.go b/src/runtime/error.go new file mode 100644 index 000000000..0b40c702b --- /dev/null +++ b/src/runtime/error.go @@ -0,0 +1,107 @@ +// Copyright 2010 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package runtime + +import "unsafe" + +// The Error interface identifies a run time error. 
+type Error interface { + error + + // RuntimeError is a no-op function but + // serves to distinguish types that are runtime + // errors from ordinary errors: a type is a + // runtime error if it has a RuntimeError method. + RuntimeError() +} + +// A TypeAssertionError explains a failed type assertion. +type TypeAssertionError struct { + interfaceString string + concreteString string + assertedString string + missingMethod string // one method needed by Interface, missing from Concrete +} + +func (*TypeAssertionError) RuntimeError() {} + +func (e *TypeAssertionError) Error() string { + inter := e.interfaceString + if inter == "" { + inter = "interface" + } + if e.concreteString == "" { + return "interface conversion: " + inter + " is nil, not " + e.assertedString + } + if e.missingMethod == "" { + return "interface conversion: " + inter + " is " + e.concreteString + + ", not " + e.assertedString + } + return "interface conversion: " + e.concreteString + " is not " + e.assertedString + + ": missing method " + e.missingMethod +} + +// For calling from C. +func newTypeAssertionError(ps1, ps2, ps3 *string, pmeth *string, ret *interface{}) { + var s1, s2, s3, meth string + + if ps1 != nil { + s1 = *ps1 + } + if ps2 != nil { + s2 = *ps2 + } + if ps3 != nil { + s3 = *ps3 + } + if pmeth != nil { + meth = *pmeth + } + *ret = &TypeAssertionError{s1, s2, s3, meth} +} + +// An errorString represents a runtime error described by a single string. +type errorString string + +func (e errorString) RuntimeError() {} + +func (e errorString) Error() string { + return "runtime error: " + string(e) +} + +type stringer interface { + String() string +} + +func typestring(x interface{}) string { + e := (*eface)(unsafe.Pointer(&x)) + return *e._type._string +} + +// For calling from C. +// Prints an argument passed to panic. +// There's room for arbitrary complexity here, but we keep it +// simple and handle just a few important cases: int, string, and Stringer. +func printany(i interface{}) { + switch v := i.(type) { + case nil: + print("nil") + case stringer: + print(v.String()) + case error: + print(v.Error()) + case int: + print(v) + case string: + print(v) + default: + print("(", typestring(i), ") ", i) + } +} + +// called from generated code +func panicwrap(pkg, typ, meth string) { + panic("value method " + pkg + "." + typ + "." + meth + " called using nil *" + typ + " pointer") +} diff --git a/src/runtime/export_futex_test.go b/src/runtime/export_futex_test.go new file mode 100644 index 000000000..96281f650 --- /dev/null +++ b/src/runtime/export_futex_test.go @@ -0,0 +1,10 @@ +// Copyright 2013 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// +build dragonfly freebsd linux + +package runtime + +var Futexsleep = futexsleep +var Futexwakeup = futexwakeup diff --git a/src/runtime/export_test.go b/src/runtime/export_test.go new file mode 100644 index 000000000..be352557f --- /dev/null +++ b/src/runtime/export_test.go @@ -0,0 +1,165 @@ +// Copyright 2010 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// Export guts for testing. 
+ +package runtime + +import "unsafe" + +var Fadd64 = fadd64 +var Fsub64 = fsub64 +var Fmul64 = fmul64 +var Fdiv64 = fdiv64 +var F64to32 = f64to32 +var F32to64 = f32to64 +var Fcmp64 = fcmp64 +var Fintto64 = fintto64 +var F64toint = f64toint + +// in asm_*.s +func stackguard() (sp, limit uintptr) + +var Entersyscall = entersyscall +var Exitsyscall = exitsyscall +var LockedOSThread = lockedOSThread + +type LFNode struct { + Next *LFNode + Pushcnt uintptr +} + +func lfstackpush_m() +func lfstackpop_m() + +func LFStackPush(head *uint64, node *LFNode) { + mp := acquirem() + mp.ptrarg[0] = unsafe.Pointer(head) + mp.ptrarg[1] = unsafe.Pointer(node) + onM(lfstackpush_m) + releasem(mp) +} + +func LFStackPop(head *uint64) *LFNode { + mp := acquirem() + mp.ptrarg[0] = unsafe.Pointer(head) + onM(lfstackpop_m) + node := (*LFNode)(unsafe.Pointer(mp.ptrarg[0])) + mp.ptrarg[0] = nil + releasem(mp) + return node +} + +type ParFor struct { + body *byte + done uint32 + Nthr uint32 + nthrmax uint32 + thrseq uint32 + Cnt uint32 + Ctx *byte + wait bool +} + +func newparfor_m() +func parforsetup_m() +func parfordo_m() +func parforiters_m() + +func NewParFor(nthrmax uint32) *ParFor { + mp := acquirem() + mp.scalararg[0] = uintptr(nthrmax) + onM(newparfor_m) + desc := (*ParFor)(mp.ptrarg[0]) + mp.ptrarg[0] = nil + releasem(mp) + return desc +} + +func ParForSetup(desc *ParFor, nthr, n uint32, ctx *byte, wait bool, body func(*ParFor, uint32)) { + mp := acquirem() + mp.ptrarg[0] = unsafe.Pointer(desc) + mp.ptrarg[1] = unsafe.Pointer(ctx) + mp.ptrarg[2] = unsafe.Pointer(funcPC(body)) // TODO(rsc): Should be a scalar. + mp.scalararg[0] = uintptr(nthr) + mp.scalararg[1] = uintptr(n) + mp.scalararg[2] = 0 + if wait { + mp.scalararg[2] = 1 + } + onM(parforsetup_m) + releasem(mp) +} + +func ParForDo(desc *ParFor) { + mp := acquirem() + mp.ptrarg[0] = unsafe.Pointer(desc) + onM(parfordo_m) + releasem(mp) +} + +func ParForIters(desc *ParFor, tid uint32) (uint32, uint32) { + mp := acquirem() + mp.ptrarg[0] = unsafe.Pointer(desc) + mp.scalararg[0] = uintptr(tid) + onM(parforiters_m) + begin := uint32(mp.scalararg[0]) + end := uint32(mp.scalararg[1]) + releasem(mp) + return begin, end +} + +// in mgc0.c +//go:noescape +func getgcmask(data unsafe.Pointer, typ *_type, array **byte, len *uint) + +func GCMask(x interface{}) (ret []byte) { + e := (*eface)(unsafe.Pointer(&x)) + s := (*slice)(unsafe.Pointer(&ret)) + onM(func() { + getgcmask(e.data, e._type, &s.array, &s.len) + s.cap = s.len + }) + return +} + +func testSchedLocalQueue() +func testSchedLocalQueueSteal() +func RunSchedLocalQueueTest() { + onM(testSchedLocalQueue) +} +func RunSchedLocalQueueStealTest() { + onM(testSchedLocalQueueSteal) +} + +var HaveGoodHash = haveGoodHash +var StringHash = stringHash +var BytesHash = bytesHash +var Int32Hash = int32Hash +var Int64Hash = int64Hash +var EfaceHash = efaceHash +var IfaceHash = ifaceHash +var MemclrBytes = memclrBytes + +var HashLoad = &hashLoad + +// For testing. +func GogoBytes() int32 { + return _RuntimeGogoBytes +} + +// in string.c +//go:noescape +func gostringw(w *uint16) string + +// entry point for testing +func GostringW(w []uint16) (s string) { + onM(func() { + s = gostringw(&w[0]) + }) + return +} + +var Gostringnocopy = gostringnocopy +var Maxstring = &maxstring diff --git a/src/runtime/extern.go b/src/runtime/extern.go new file mode 100644 index 000000000..6cc5df810 --- /dev/null +++ b/src/runtime/extern.go @@ -0,0 +1,168 @@ +// Copyright 2009 The Go Authors. All rights reserved. 
+// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +/* +Package runtime contains operations that interact with Go's runtime system, +such as functions to control goroutines. It also includes the low-level type information +used by the reflect package; see reflect's documentation for the programmable +interface to the run-time type system. + +Environment Variables + +The following environment variables ($name or %name%, depending on the host +operating system) control the run-time behavior of Go programs. The meanings +and use may change from release to release. + +The GOGC variable sets the initial garbage collection target percentage. +A collection is triggered when the ratio of freshly allocated data to live data +remaining after the previous collection reaches this percentage. The default +is GOGC=100. Setting GOGC=off disables the garbage collector entirely. +The runtime/debug package's SetGCPercent function allows changing this +percentage at run time. See http://golang.org/pkg/runtime/debug/#SetGCPercent. + +The GODEBUG variable controls debug output from the runtime. GODEBUG value is +a comma-separated list of name=val pairs. Supported names are: + + allocfreetrace: setting allocfreetrace=1 causes every allocation to be + profiled and a stack trace printed on each object's allocation and free. + + efence: setting efence=1 causes the allocator to run in a mode + where each object is allocated on a unique page and addresses are + never recycled. + + gctrace: setting gctrace=1 causes the garbage collector to emit a single line to standard + error at each collection, summarizing the amount of memory collected and the + length of the pause. Setting gctrace=2 emits the same summary but also + repeats each collection. + + gcdead: setting gcdead=1 causes the garbage collector to clobber all stack slots + that it thinks are dead. + + invalidptr: defaults to invalidptr=1, causing the garbage collector and stack + copier to crash the program if an invalid pointer value (for example, 1) + is found in a pointer-typed location. Setting invalidptr=0 disables this check. + This should only be used as a temporary workaround to diagnose buggy code. + The real fix is to not store integers in pointer-typed locations. + + scheddetail: setting schedtrace=X and scheddetail=1 causes the scheduler to emit + detailed multiline info every X milliseconds, describing state of the scheduler, + processors, threads and goroutines. + + schedtrace: setting schedtrace=X causes the scheduler to emit a single line to standard + error every X milliseconds, summarizing the scheduler state. + + scavenge: scavenge=1 enables debugging mode of heap scavenger. + +The GOMAXPROCS variable limits the number of operating system threads that +can execute user-level Go code simultaneously. There is no limit to the number of threads +that can be blocked in system calls on behalf of Go code; those do not count against +the GOMAXPROCS limit. This package's GOMAXPROCS function queries and changes +the limit. + +The GOTRACEBACK variable controls the amount of output generated when a Go +program fails due to an unrecovered panic or an unexpected runtime condition. +By default, a failure prints a stack trace for every extant goroutine, eliding functions +internal to the run-time system, and then exits with exit code 2. +If GOTRACEBACK=0, the per-goroutine stack traces are omitted entirely. +If GOTRACEBACK=1, the default behavior is used. 
+If GOTRACEBACK=2, the per-goroutine stack traces include run-time functions. +If GOTRACEBACK=crash, the per-goroutine stack traces include run-time functions, +and if possible the program crashes in an operating-specific manner instead of +exiting. For example, on Unix systems, the program raises SIGABRT to trigger a +core dump. + +The GOARCH, GOOS, GOPATH, and GOROOT environment variables complete +the set of Go environment variables. They influence the building of Go programs +(see http://golang.org/cmd/go and http://golang.org/pkg/go/build). +GOARCH, GOOS, and GOROOT are recorded at compile time and made available by +constants or functions in this package, but they do not influence the execution +of the run-time system. +*/ +package runtime + +// Caller reports file and line number information about function invocations on +// the calling goroutine's stack. The argument skip is the number of stack frames +// to ascend, with 0 identifying the caller of Caller. (For historical reasons the +// meaning of skip differs between Caller and Callers.) The return values report the +// program counter, file name, and line number within the file of the corresponding +// call. The boolean ok is false if it was not possible to recover the information. +func Caller(skip int) (pc uintptr, file string, line int, ok bool) { + // Ask for two PCs: the one we were asked for + // and what it called, so that we can see if it + // "called" sigpanic. + var rpc [2]uintptr + if callers(1+skip-1, &rpc[0], 2) < 2 { + return + } + f := findfunc(rpc[1]) + if f == nil { + // TODO(rsc): Probably a bug? + // The C version said "have retpc at least" + // but actually returned pc=0. + ok = true + return + } + pc = rpc[1] + xpc := pc + g := findfunc(rpc[0]) + // All architectures turn faults into apparent calls to sigpanic. + // If we see a call to sigpanic, we do not back up the PC to find + // the line number of the call instruction, because there is no call. + if xpc > f.entry && (g == nil || g.entry != funcPC(sigpanic)) { + xpc-- + } + line = int(funcline(f, xpc, &file)) + ok = true + return +} + +// Callers fills the slice pc with the return program counters of function invocations +// on the calling goroutine's stack. The argument skip is the number of stack frames +// to skip before recording in pc, with 0 identifying the frame for Callers itself and +// 1 identifying the caller of Callers. +// It returns the number of entries written to pc. +// +// Note that since each slice entry pc[i] is a return program counter, +// looking up the file and line for pc[i] (for example, using (*Func).FileLine) +// will return the file and line number of the instruction immediately +// following the call. +// To look up the file and line number of the call itself, use pc[i]-1. +// As an exception to this rule, if pc[i-1] corresponds to the function +// runtime.sigpanic, then pc[i] is the program counter of a faulting +// instruction and should be used without any subtraction. +func Callers(skip int, pc []uintptr) int { + // runtime.callers uses pc.array==nil as a signal + // to print a stack trace. Pick off 0-length pc here + // so that we don't let a nil pc slice get to it. + if len(pc) == 0 { + return 0 + } + return callers(skip, &pc[0], len(pc)) +} + +// GOROOT returns the root of the Go tree. +// It uses the GOROOT environment variable, if set, +// or else the root used during the Go build. 
+func GOROOT() string { + s := gogetenv("GOROOT") + if s != "" { + return s + } + return defaultGoroot +} + +// Version returns the Go tree's version string. +// It is either the commit hash and date at the time of the build or, +// when possible, a release tag like "go1.3". +func Version() string { + return theVersion +} + +// GOOS is the running program's operating system target: +// one of darwin, freebsd, linux, and so on. +const GOOS string = theGoos + +// GOARCH is the running program's architecture target: +// 386, amd64, or arm. +const GOARCH string = theGoarch diff --git a/src/runtime/float.c b/src/runtime/float.c new file mode 100644 index 000000000..42082e434 --- /dev/null +++ b/src/runtime/float.c @@ -0,0 +1,10 @@ +// Copyright 2009 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#include "runtime.h" + +// used as float64 via runtime· names +uint64 ·nan = 0x7FF8000000000001ULL; +uint64 ·posinf = 0x7FF0000000000000ULL; +uint64 ·neginf = 0xFFF0000000000000ULL; diff --git a/src/runtime/funcdata.h b/src/runtime/funcdata.h new file mode 100644 index 000000000..d6c14fcb4 --- /dev/null +++ b/src/runtime/funcdata.h @@ -0,0 +1,60 @@ +// Copyright 2013 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// This file defines the IDs for PCDATA and FUNCDATA instructions +// in Go binaries. It is included by both C and assembly, so it must +// be written using #defines. It is included by the runtime package +// as well as the compilers. +// +// symtab.go also contains a copy of these constants. + +#define PCDATA_StackMapIndex 0 + +#define FUNCDATA_ArgsPointerMaps 0 /* garbage collector blocks */ +#define FUNCDATA_LocalsPointerMaps 1 +#define FUNCDATA_DeadValueMaps 2 + +// Pseudo-assembly statements. + +// GO_ARGS, GO_RESULTS_INITIALIZED, and NO_LOCAL_POINTERS are macros +// that communicate to the runtime information about the location and liveness +// of pointers in an assembly function's arguments, results, and stack frame. +// This communication is only required in assembly functions that make calls +// to other functions that might be preempted or grow the stack. +// NOSPLIT functions that make no calls do not need to use these macros. + +// GO_ARGS indicates that the Go prototype for this assembly function +// defines the pointer map for the function's arguments. +// GO_ARGS should be the first instruction in a function that uses it. +// It can be omitted if there are no arguments at all. +// GO_ARGS is inserted implicitly by the linker for any function +// that also has a Go prototype and therefore is usually not necessary +// to write explicitly. +#define GO_ARGS FUNCDATA $FUNCDATA_ArgsPointerMaps, go_args_stackmap(SB) + +// GO_RESULTS_INITIALIZED indicates that the assembly function +// has initialized the stack space for its results and that those results +// should be considered live for the remainder of the function. +#define GO_RESULTS_INITIALIZED FUNCDATA PCDATA $PCDATA_StackMapIndex, 1 + +// NO_LOCAL_POINTERS indicates that the assembly function stores +// no pointers to heap objects in its local stack variables. +#define NO_LOCAL_POINTERS FUNCDATA $FUNCDATA_LocalsPointerMaps, runtime·no_pointers_stackmap(SB) + +// ArgsSizeUnknown is set in Func.argsize to mark all functions +// whose argument size is unknown (C vararg functions, and +// assembly code without an explicit specification). 
+// This value is generated by the compiler, assembler, or linker. +#define ArgsSizeUnknown 0x80000000 + +/*c2go +enum { + PCDATA_ArgSize = 0, + PCDATA_StackMapIndex = 1, + FUNCDATA_ArgsPointerMaps = 0, + FUNCDATA_LocalsPointerMaps = 1, + FUNCDATA_DeadValueMaps = 2, + ArgsSizeUnknown = 0x80000000, +}; +*/ diff --git a/src/runtime/futex_test.go b/src/runtime/futex_test.go new file mode 100644 index 000000000..f57fc52b8 --- /dev/null +++ b/src/runtime/futex_test.go @@ -0,0 +1,77 @@ +// Copyright 2013 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// Futex is only available on DragonFly BSD, FreeBSD and Linux. +// The race detector emits calls to split stack functions so it breaks +// the test. + +// +build dragonfly freebsd linux +// +build !race + +package runtime_test + +import ( + "runtime" + "testing" + "time" +) + +type futexsleepTest struct { + mtx uint32 + ns int64 + msg string + ch chan futexsleepTest +} + +var futexsleepTests = []futexsleepTest{ + beforeY2038: {mtx: 0, ns: 86400 * 1e9, msg: "before the year 2038", ch: make(chan futexsleepTest, 1)}, + afterY2038: {mtx: 0, ns: (1<<31 + 100) * 1e9, msg: "after the year 2038", ch: make(chan futexsleepTest, 1)}, +} + +const ( + beforeY2038 = iota + afterY2038 +) + +func TestFutexsleep(t *testing.T) { + if runtime.GOMAXPROCS(0) > 1 { + // futexsleep doesn't handle EINTR or other signals, + // so spurious wakeups may happen. + t.Skip("skipping; GOMAXPROCS>1") + } + + start := time.Now() + for _, tt := range futexsleepTests { + go func(tt futexsleepTest) { + runtime.Entersyscall() + runtime.Futexsleep(&tt.mtx, tt.mtx, tt.ns) + runtime.Exitsyscall() + tt.ch <- tt + }(tt) + } +loop: + for { + select { + case tt := <-futexsleepTests[beforeY2038].ch: + t.Errorf("futexsleep test %q finished early after %s", tt.msg, time.Since(start)) + break loop + case tt := <-futexsleepTests[afterY2038].ch: + // Looks like FreeBSD 10 kernel has changed + // the semantics of timedwait on userspace + // mutex to make broken stuff look broken. + switch { + case runtime.GOOS == "freebsd" && runtime.GOARCH == "386": + t.Log("freebsd/386 may not work correctly after the year 2038, see golang.org/issue/7194") + default: + t.Errorf("futexsleep test %q finished early after %s", tt.msg, time.Since(start)) + break loop + } + case <-time.After(time.Second): + break loop + } + } + for _, tt := range futexsleepTests { + runtime.Futexwakeup(&tt.mtx, 1) + } +} diff --git a/src/runtime/gc_test.go b/src/runtime/gc_test.go new file mode 100644 index 000000000..6abec4cca --- /dev/null +++ b/src/runtime/gc_test.go @@ -0,0 +1,292 @@ +// Copyright 2011 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. 
+ +package runtime_test + +import ( + "os" + "runtime" + "runtime/debug" + "testing" + "time" + "unsafe" +) + +func TestGcSys(t *testing.T) { + if os.Getenv("GOGC") == "off" { + t.Skip("skipping test; GOGC=off in environment") + } + data := struct{ Short bool }{testing.Short()} + got := executeTest(t, testGCSysSource, &data) + want := "OK\n" + if got != want { + t.Fatalf("expected %q, but got %q", want, got) + } +} + +const testGCSysSource = ` +package main + +import ( + "fmt" + "runtime" +) + +func main() { + runtime.GOMAXPROCS(1) + memstats := new(runtime.MemStats) + runtime.GC() + runtime.ReadMemStats(memstats) + sys := memstats.Sys + + runtime.MemProfileRate = 0 // disable profiler + + itercount := 1000000 +{{if .Short}} + itercount = 100000 +{{end}} + for i := 0; i < itercount; i++ { + workthegc() + } + + // Should only be using a few MB. + // We allocated 100 MB or (if not short) 1 GB. + runtime.ReadMemStats(memstats) + if sys > memstats.Sys { + sys = 0 + } else { + sys = memstats.Sys - sys + } + if sys > 16<<20 { + fmt.Printf("using too much memory: %d bytes\n", sys) + return + } + fmt.Printf("OK\n") +} + +func workthegc() []byte { + return make([]byte, 1029) +} +` + +func TestGcDeepNesting(t *testing.T) { + type T [2][2][2][2][2][2][2][2][2][2]*int + a := new(T) + + // Prevent the compiler from applying escape analysis. + // This makes sure new(T) is allocated on heap, not on the stack. + t.Logf("%p", a) + + a[0][0][0][0][0][0][0][0][0][0] = new(int) + *a[0][0][0][0][0][0][0][0][0][0] = 13 + runtime.GC() + if *a[0][0][0][0][0][0][0][0][0][0] != 13 { + t.Fail() + } +} + +func TestGcHashmapIndirection(t *testing.T) { + defer debug.SetGCPercent(debug.SetGCPercent(1)) + runtime.GC() + type T struct { + a [256]int + } + m := make(map[T]T) + for i := 0; i < 2000; i++ { + var a T + a.a[0] = i + m[a] = T{} + } +} + +func TestGcArraySlice(t *testing.T) { + type X struct { + buf [1]byte + nextbuf []byte + next *X + } + var head *X + for i := 0; i < 10; i++ { + p := &X{} + p.buf[0] = 42 + p.next = head + if head != nil { + p.nextbuf = head.buf[:] + } + head = p + runtime.GC() + } + for p := head; p != nil; p = p.next { + if p.buf[0] != 42 { + t.Fatal("corrupted heap") + } + } +} + +func TestGcRescan(t *testing.T) { + type X struct { + c chan error + nextx *X + } + type Y struct { + X + nexty *Y + p *int + } + var head *Y + for i := 0; i < 10; i++ { + p := &Y{} + p.c = make(chan error) + if head != nil { + p.nextx = &head.X + } + p.nexty = head + p.p = new(int) + *p.p = 42 + head = p + runtime.GC() + } + for p := head; p != nil; p = p.nexty { + if *p.p != 42 { + t.Fatal("corrupted heap") + } + } +} + +func TestGcLastTime(t *testing.T) { + ms := new(runtime.MemStats) + t0 := time.Now().UnixNano() + runtime.GC() + t1 := time.Now().UnixNano() + runtime.ReadMemStats(ms) + last := int64(ms.LastGC) + if t0 > last || last > t1 { + t.Fatalf("bad last GC time: got %v, want [%v, %v]", last, t0, t1) + } + pause := ms.PauseNs[(ms.NumGC+255)%256] + // Due to timer granularity, pause can actually be 0 on windows + // or on virtualized environments. + if pause == 0 { + t.Logf("last GC pause was 0") + } else if pause > 10e9 { + t.Logf("bad last GC pause: got %v, want [0, 10e9]", pause) + } +} + +var hugeSink interface{} + +func TestHugeGCInfo(t *testing.T) { + // The test ensures that compiler can chew these huge types even on weakest machines. + // The types are not allocated at runtime. + if hugeSink != nil { + // 400MB on 32 bots, 4TB on 64-bits. 
+ const n = (400 << 20) + (unsafe.Sizeof(uintptr(0))-4)<<40 + hugeSink = new([n]*byte) + hugeSink = new([n]uintptr) + hugeSink = new(struct { + x float64 + y [n]*byte + z []string + }) + hugeSink = new(struct { + x float64 + y [n]uintptr + z []string + }) + } +} + +func BenchmarkSetTypeNoPtr1(b *testing.B) { + type NoPtr1 struct { + p uintptr + } + var p *NoPtr1 + for i := 0; i < b.N; i++ { + p = &NoPtr1{} + } + _ = p +} +func BenchmarkSetTypeNoPtr2(b *testing.B) { + type NoPtr2 struct { + p, q uintptr + } + var p *NoPtr2 + for i := 0; i < b.N; i++ { + p = &NoPtr2{} + } + _ = p +} +func BenchmarkSetTypePtr1(b *testing.B) { + type Ptr1 struct { + p *byte + } + var p *Ptr1 + for i := 0; i < b.N; i++ { + p = &Ptr1{} + } + _ = p +} +func BenchmarkSetTypePtr2(b *testing.B) { + type Ptr2 struct { + p, q *byte + } + var p *Ptr2 + for i := 0; i < b.N; i++ { + p = &Ptr2{} + } + _ = p +} + +func BenchmarkAllocation(b *testing.B) { + type T struct { + x, y *byte + } + ngo := runtime.GOMAXPROCS(0) + work := make(chan bool, b.N+ngo) + result := make(chan *T) + for i := 0; i < b.N; i++ { + work <- true + } + for i := 0; i < ngo; i++ { + work <- false + } + for i := 0; i < ngo; i++ { + go func() { + var x *T + for <-work { + for i := 0; i < 1000; i++ { + x = &T{} + } + } + result <- x + }() + } + for i := 0; i < ngo; i++ { + <-result + } +} + +func TestPrintGC(t *testing.T) { + if testing.Short() { + t.Skip("Skipping in short mode") + } + defer runtime.GOMAXPROCS(runtime.GOMAXPROCS(2)) + done := make(chan bool) + go func() { + for { + select { + case <-done: + return + default: + runtime.GC() + } + } + }() + for i := 0; i < 1e4; i++ { + func() { + defer print("") + }() + } + close(done) +} diff --git a/src/runtime/gcinfo_test.go b/src/runtime/gcinfo_test.go new file mode 100644 index 000000000..88f6703f9 --- /dev/null +++ b/src/runtime/gcinfo_test.go @@ -0,0 +1,193 @@ +// Copyright 2014 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package runtime_test + +import ( + "bytes" + "runtime" + "testing" +) + +// TestGCInfo tests that various objects in heap, data and bss receive correct GC pointer type info. 
+func TestGCInfo(t *testing.T) { + verifyGCInfo(t, "bss ScalarPtr", &bssScalarPtr, nonStackInfo(infoScalarPtr)) + verifyGCInfo(t, "bss PtrScalar", &bssPtrScalar, nonStackInfo(infoPtrScalar)) + verifyGCInfo(t, "bss BigStruct", &bssBigStruct, nonStackInfo(infoBigStruct())) + verifyGCInfo(t, "bss string", &bssString, nonStackInfo(infoString)) + verifyGCInfo(t, "bss slice", &bssSlice, nonStackInfo(infoSlice)) + verifyGCInfo(t, "bss eface", &bssEface, nonStackInfo(infoEface)) + verifyGCInfo(t, "bss iface", &bssIface, nonStackInfo(infoIface)) + + verifyGCInfo(t, "data ScalarPtr", &dataScalarPtr, nonStackInfo(infoScalarPtr)) + verifyGCInfo(t, "data PtrScalar", &dataPtrScalar, nonStackInfo(infoPtrScalar)) + verifyGCInfo(t, "data BigStruct", &dataBigStruct, nonStackInfo(infoBigStruct())) + verifyGCInfo(t, "data string", &dataString, nonStackInfo(infoString)) + verifyGCInfo(t, "data slice", &dataSlice, nonStackInfo(infoSlice)) + verifyGCInfo(t, "data eface", &dataEface, nonStackInfo(infoEface)) + verifyGCInfo(t, "data iface", &dataIface, nonStackInfo(infoIface)) + + verifyGCInfo(t, "stack ScalarPtr", new(ScalarPtr), infoScalarPtr) + verifyGCInfo(t, "stack PtrScalar", new(PtrScalar), infoPtrScalar) + verifyGCInfo(t, "stack BigStruct", new(BigStruct), infoBigStruct()) + verifyGCInfo(t, "stack string", new(string), infoString) + verifyGCInfo(t, "stack slice", new([]string), infoSlice) + verifyGCInfo(t, "stack eface", new(interface{}), infoEface) + verifyGCInfo(t, "stack iface", new(Iface), infoIface) + + for i := 0; i < 10; i++ { + verifyGCInfo(t, "heap ScalarPtr", escape(new(ScalarPtr)), nonStackInfo(infoScalarPtr)) + verifyGCInfo(t, "heap PtrScalar", escape(new(PtrScalar)), nonStackInfo(infoPtrScalar)) + verifyGCInfo(t, "heap BigStruct", escape(new(BigStruct)), nonStackInfo(infoBigStruct())) + verifyGCInfo(t, "heap string", escape(new(string)), nonStackInfo(infoString)) + verifyGCInfo(t, "heap eface", escape(new(interface{})), nonStackInfo(infoEface)) + verifyGCInfo(t, "heap iface", escape(new(Iface)), nonStackInfo(infoIface)) + } + +} + +func verifyGCInfo(t *testing.T, name string, p interface{}, mask0 []byte) { + mask := runtime.GCMask(p) + if len(mask) > len(mask0) { + mask0 = append(mask0, BitsDead) + mask = mask[:len(mask0)] + } + if bytes.Compare(mask, mask0) != 0 { + t.Errorf("bad GC program for %v:\nwant %+v\ngot %+v", name, mask0, mask) + return + } +} + +func nonStackInfo(mask []byte) []byte { + // BitsDead is replaced with BitsScalar everywhere except stacks. 
+ mask1 := make([]byte, len(mask)) + mw := false + for i, v := range mask { + if !mw && v == BitsDead { + v = BitsScalar + } + mw = !mw && v == BitsMultiWord + mask1[i] = v + } + return mask1 +} + +var gcinfoSink interface{} + +func escape(p interface{}) interface{} { + gcinfoSink = p + return p +} + +const ( + BitsDead = iota + BitsScalar + BitsPointer + BitsMultiWord +) + +const ( + BitsString = iota // unused + BitsSlice // unused + BitsIface + BitsEface +) + +type ScalarPtr struct { + q int + w *int + e int + r *int + t int + y *int +} + +var infoScalarPtr = []byte{BitsScalar, BitsPointer, BitsScalar, BitsPointer, BitsScalar, BitsPointer} + +type PtrScalar struct { + q *int + w int + e *int + r int + t *int + y int +} + +var infoPtrScalar = []byte{BitsPointer, BitsScalar, BitsPointer, BitsScalar, BitsPointer, BitsScalar} + +type BigStruct struct { + q *int + w byte + e [17]byte + r []byte + t int + y uint16 + u uint64 + i string +} + +func infoBigStruct() []byte { + switch runtime.GOARCH { + case "386", "arm": + return []byte{ + BitsPointer, // q *int + BitsScalar, BitsScalar, BitsScalar, BitsScalar, BitsScalar, // w byte; e [17]byte + BitsPointer, BitsDead, BitsDead, // r []byte + BitsScalar, BitsScalar, BitsScalar, BitsScalar, // t int; y uint16; u uint64 + BitsPointer, BitsDead, // i string + } + case "amd64": + return []byte{ + BitsPointer, // q *int + BitsScalar, BitsScalar, BitsScalar, // w byte; e [17]byte + BitsPointer, BitsDead, BitsDead, // r []byte + BitsScalar, BitsScalar, BitsScalar, // t int; y uint16; u uint64 + BitsPointer, BitsDead, // i string + } + case "amd64p32": + return []byte{ + BitsPointer, // q *int + BitsScalar, BitsScalar, BitsScalar, BitsScalar, BitsScalar, // w byte; e [17]byte + BitsPointer, BitsDead, BitsDead, // r []byte + BitsScalar, BitsScalar, BitsDead, BitsScalar, BitsScalar, // t int; y uint16; u uint64 + BitsPointer, BitsDead, // i string + } + default: + panic("unknown arch") + } +} + +type Iface interface { + f() +} + +type IfaceImpl int + +func (IfaceImpl) f() { +} + +var ( + // BSS + bssScalarPtr ScalarPtr + bssPtrScalar PtrScalar + bssBigStruct BigStruct + bssString string + bssSlice []string + bssEface interface{} + bssIface Iface + + // DATA + dataScalarPtr = ScalarPtr{q: 1} + dataPtrScalar = PtrScalar{w: 1} + dataBigStruct = BigStruct{w: 1} + dataString = "foo" + dataSlice = []string{"foo"} + dataEface interface{} = 42 + dataIface Iface = IfaceImpl(42) + + infoString = []byte{BitsPointer, BitsDead} + infoSlice = []byte{BitsPointer, BitsDead, BitsDead} + infoEface = []byte{BitsMultiWord, BitsEface} + infoIface = []byte{BitsMultiWord, BitsIface} +) diff --git a/src/runtime/hash_test.go b/src/runtime/hash_test.go new file mode 100644 index 000000000..41fff98eb --- /dev/null +++ b/src/runtime/hash_test.go @@ -0,0 +1,572 @@ +// Copyright 2013 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package runtime_test + +import ( + "fmt" + "math" + "math/rand" + . "runtime" + "strings" + "testing" +) + +// Smhasher is a torture test for hash functions. +// https://code.google.com/p/smhasher/ +// This code is a port of some of the Smhasher tests to Go. +// +// The current AES hash function passes Smhasher. Our fallback +// hash functions don't, so we only enable the difficult tests when +// we know the AES implementation is available. + +// Sanity checks. +// hash should not depend on values outside key. +// hash should not depend on alignment. 
+func TestSmhasherSanity(t *testing.T) { + r := rand.New(rand.NewSource(1234)) + const REP = 10 + const KEYMAX = 128 + const PAD = 16 + const OFFMAX = 16 + for k := 0; k < REP; k++ { + for n := 0; n < KEYMAX; n++ { + for i := 0; i < OFFMAX; i++ { + var b [KEYMAX + OFFMAX + 2*PAD]byte + var c [KEYMAX + OFFMAX + 2*PAD]byte + randBytes(r, b[:]) + randBytes(r, c[:]) + copy(c[PAD+i:PAD+i+n], b[PAD:PAD+n]) + if BytesHash(b[PAD:PAD+n], 0) != BytesHash(c[PAD+i:PAD+i+n], 0) { + t.Errorf("hash depends on bytes outside key") + } + } + } + } +} + +type HashSet struct { + m map[uintptr]struct{} // set of hashes added + n int // number of hashes added +} + +func newHashSet() *HashSet { + return &HashSet{make(map[uintptr]struct{}), 0} +} +func (s *HashSet) add(h uintptr) { + s.m[h] = struct{}{} + s.n++ +} +func (s *HashSet) addS(x string) { + s.add(StringHash(x, 0)) +} +func (s *HashSet) addB(x []byte) { + s.add(BytesHash(x, 0)) +} +func (s *HashSet) addS_seed(x string, seed uintptr) { + s.add(StringHash(x, seed)) +} +func (s *HashSet) check(t *testing.T) { + const SLOP = 10.0 + collisions := s.n - len(s.m) + //fmt.Printf("%d/%d\n", len(s.m), s.n) + pairs := int64(s.n) * int64(s.n-1) / 2 + expected := float64(pairs) / math.Pow(2.0, float64(hashSize)) + stddev := math.Sqrt(expected) + if float64(collisions) > expected+SLOP*3*stddev { + t.Errorf("unexpected number of collisions: got=%d mean=%f stddev=%f", collisions, expected, stddev) + } +} + +// a string plus adding zeros must make distinct hashes +func TestSmhasherAppendedZeros(t *testing.T) { + s := "hello" + strings.Repeat("\x00", 256) + h := newHashSet() + for i := 0; i <= len(s); i++ { + h.addS(s[:i]) + } + h.check(t) +} + +// All 0-3 byte strings have distinct hashes. +func TestSmhasherSmallKeys(t *testing.T) { + h := newHashSet() + var b [3]byte + for i := 0; i < 256; i++ { + b[0] = byte(i) + h.addB(b[:1]) + for j := 0; j < 256; j++ { + b[1] = byte(j) + h.addB(b[:2]) + if !testing.Short() { + for k := 0; k < 256; k++ { + b[2] = byte(k) + h.addB(b[:3]) + } + } + } + } + h.check(t) +} + +// Different length strings of all zeros have distinct hashes. +func TestSmhasherZeros(t *testing.T) { + N := 256 * 1024 + if testing.Short() { + N = 1024 + } + h := newHashSet() + b := make([]byte, N) + for i := 0; i <= N; i++ { + h.addB(b[:i]) + } + h.check(t) +} + +// Strings with up to two nonzero bytes all have distinct hashes. +func TestSmhasherTwoNonzero(t *testing.T) { + if testing.Short() { + t.Skip("Skipping in short mode") + } + h := newHashSet() + for n := 2; n <= 16; n++ { + twoNonZero(h, n) + } + h.check(t) +} +func twoNonZero(h *HashSet, n int) { + b := make([]byte, n) + + // all zero + h.addB(b[:]) + + // one non-zero byte + for i := 0; i < n; i++ { + for x := 1; x < 256; x++ { + b[i] = byte(x) + h.addB(b[:]) + b[i] = 0 + } + } + + // two non-zero bytes + for i := 0; i < n; i++ { + for x := 1; x < 256; x++ { + b[i] = byte(x) + for j := i + 1; j < n; j++ { + for y := 1; y < 256; y++ { + b[j] = byte(y) + h.addB(b[:]) + b[j] = 0 + } + } + b[i] = 0 + } + } +} + +// Test strings with repeats, like "abcdabcdabcdabcd..." 
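The threshold in HashSet.check above is just the birthday bound: n hashes drawn uniformly from 2^hashSize values collide in about n(n-1)/2 / 2^hashSize pairs, and that count has a standard deviation of roughly its own square root. A standalone sketch of the numbers (my own illustration, not part of hash_test.go), using the 100000 hashes that the seed test further down adds on a 32-bit platform:

package main

import (
	"fmt"
	"math"
)

func main() {
	const n = 100000    // hashes added, as in TestSmhasherSeed
	const hashSize = 32 // hash output bits on a 32-bit platform
	const slop = 10.0   // same slack factor as HashSet.check

	pairs := float64(n) * float64(n-1) / 2
	expected := pairs / math.Pow(2, hashSize) // about 1.16 collisions expected
	stddev := math.Sqrt(expected)             // about 1.08
	limit := expected + slop*3*stddev         // about 33.5: the failure threshold

	fmt.Printf("expected=%.2f stddev=%.2f fail above %.1f collisions\n", expected, stddev, limit)
}

Even a hundred thousand 32-bit hashes should produce only about one collision, so a count above roughly 33 signals a biased hash rather than bad luck.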
+func TestSmhasherCyclic(t *testing.T) { + if testing.Short() { + t.Skip("Skipping in short mode") + } + if !HaveGoodHash() { + t.Skip("fallback hash not good enough for this test") + } + r := rand.New(rand.NewSource(1234)) + const REPEAT = 8 + const N = 1000000 + for n := 4; n <= 12; n++ { + h := newHashSet() + b := make([]byte, REPEAT*n) + for i := 0; i < N; i++ { + b[0] = byte(i * 79 % 97) + b[1] = byte(i * 43 % 137) + b[2] = byte(i * 151 % 197) + b[3] = byte(i * 199 % 251) + randBytes(r, b[4:n]) + for j := n; j < n*REPEAT; j++ { + b[j] = b[j-n] + } + h.addB(b) + } + h.check(t) + } +} + +// Test strings with only a few bits set +func TestSmhasherSparse(t *testing.T) { + if testing.Short() { + t.Skip("Skipping in short mode") + } + sparse(t, 32, 6) + sparse(t, 40, 6) + sparse(t, 48, 5) + sparse(t, 56, 5) + sparse(t, 64, 5) + sparse(t, 96, 4) + sparse(t, 256, 3) + sparse(t, 2048, 2) +} +func sparse(t *testing.T, n int, k int) { + b := make([]byte, n/8) + h := newHashSet() + setbits(h, b, 0, k) + h.check(t) +} + +// set up to k bits at index i and greater +func setbits(h *HashSet, b []byte, i int, k int) { + h.addB(b) + if k == 0 { + return + } + for j := i; j < len(b)*8; j++ { + b[j/8] |= byte(1 << uint(j&7)) + setbits(h, b, j+1, k-1) + b[j/8] &= byte(^(1 << uint(j&7))) + } +} + +// Test all possible combinations of n blocks from the set s. +// "permutation" is a bad name here, but it is what Smhasher uses. +func TestSmhasherPermutation(t *testing.T) { + if testing.Short() { + t.Skip("Skipping in short mode") + } + if !HaveGoodHash() { + t.Skip("fallback hash not good enough for this test") + } + permutation(t, []uint32{0, 1, 2, 3, 4, 5, 6, 7}, 8) + permutation(t, []uint32{0, 1 << 29, 2 << 29, 3 << 29, 4 << 29, 5 << 29, 6 << 29, 7 << 29}, 8) + permutation(t, []uint32{0, 1}, 20) + permutation(t, []uint32{0, 1 << 31}, 20) + permutation(t, []uint32{0, 1, 2, 3, 4, 5, 6, 7, 1 << 29, 2 << 29, 3 << 29, 4 << 29, 5 << 29, 6 << 29, 7 << 29}, 6) +} +func permutation(t *testing.T, s []uint32, n int) { + b := make([]byte, n*4) + h := newHashSet() + genPerm(h, b, s, 0) + h.check(t) +} +func genPerm(h *HashSet, b []byte, s []uint32, n int) { + h.addB(b[:n]) + if n == len(b) { + return + } + for _, v := range s { + b[n] = byte(v) + b[n+1] = byte(v >> 8) + b[n+2] = byte(v >> 16) + b[n+3] = byte(v >> 24) + genPerm(h, b, s, n+4) + } +} + +type Key interface { + clear() // set bits all to 0 + random(r *rand.Rand) // set key to something random + bits() int // how many bits key has + flipBit(i int) // flip bit i of the key + hash() uintptr // hash the key + name() string // for error reporting +} + +type BytesKey struct { + b []byte +} + +func (k *BytesKey) clear() { + for i := range k.b { + k.b[i] = 0 + } +} +func (k *BytesKey) random(r *rand.Rand) { + randBytes(r, k.b) +} +func (k *BytesKey) bits() int { + return len(k.b) * 8 +} +func (k *BytesKey) flipBit(i int) { + k.b[i>>3] ^= byte(1 << uint(i&7)) +} +func (k *BytesKey) hash() uintptr { + return BytesHash(k.b, 0) +} +func (k *BytesKey) name() string { + return fmt.Sprintf("bytes%d", len(k.b)) +} + +type Int32Key struct { + i uint32 +} + +func (k *Int32Key) clear() { + k.i = 0 +} +func (k *Int32Key) random(r *rand.Rand) { + k.i = r.Uint32() +} +func (k *Int32Key) bits() int { + return 32 +} +func (k *Int32Key) flipBit(i int) { + k.i ^= 1 << uint(i) +} +func (k *Int32Key) hash() uintptr { + return Int32Hash(k.i, 0) +} +func (k *Int32Key) name() string { + return "int32" +} + +type Int64Key struct { + i uint64 +} + +func (k *Int64Key) clear() { + k.i = 0 +} 
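Any fixed-width value can be fed through the avalanche and windowed tests below by implementing the Key interface above. As a sketch only (this Uint16Key is hypothetical, not part of hash_test.go, and assumes that file's context: math/rand plus BytesHash from the dot-imported runtime package):

// Hypothetical 16-bit key for the Smhasher harness.
type Uint16Key struct {
	i uint16
}

func (k *Uint16Key) clear()              { k.i = 0 }
func (k *Uint16Key) random(r *rand.Rand) { k.i = uint16(r.Uint32()) }
func (k *Uint16Key) bits() int           { return 16 }
func (k *Uint16Key) flipBit(i int)       { k.i ^= 1 << uint(i) }
func (k *Uint16Key) hash() uintptr {
	return BytesHash([]byte{byte(k.i), byte(k.i >> 8)}, 0)
}
func (k *Uint16Key) name() string { return "uint16" }

Calling avalancheTest1(t, &Uint16Key{}) would then exercise it exactly like the built-in key types.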
+func (k *Int64Key) random(r *rand.Rand) { + k.i = uint64(r.Uint32()) + uint64(r.Uint32())<<32 +} +func (k *Int64Key) bits() int { + return 64 +} +func (k *Int64Key) flipBit(i int) { + k.i ^= 1 << uint(i) +} +func (k *Int64Key) hash() uintptr { + return Int64Hash(k.i, 0) +} +func (k *Int64Key) name() string { + return "int64" +} + +type EfaceKey struct { + i interface{} +} + +func (k *EfaceKey) clear() { + k.i = nil +} +func (k *EfaceKey) random(r *rand.Rand) { + k.i = uint64(r.Int63()) +} +func (k *EfaceKey) bits() int { + // use 64 bits. This tests inlined interfaces + // on 64-bit targets and indirect interfaces on + // 32-bit targets. + return 64 +} +func (k *EfaceKey) flipBit(i int) { + k.i = k.i.(uint64) ^ uint64(1)<<uint(i) +} +func (k *EfaceKey) hash() uintptr { + return EfaceHash(k.i, 0) +} +func (k *EfaceKey) name() string { + return "Eface" +} + +type IfaceKey struct { + i interface { + F() + } +} +type fInter uint64 + +func (x fInter) F() { +} + +func (k *IfaceKey) clear() { + k.i = nil +} +func (k *IfaceKey) random(r *rand.Rand) { + k.i = fInter(r.Int63()) +} +func (k *IfaceKey) bits() int { + // use 64 bits. This tests inlined interfaces + // on 64-bit targets and indirect interfaces on + // 32-bit targets. + return 64 +} +func (k *IfaceKey) flipBit(i int) { + k.i = k.i.(fInter) ^ fInter(1)<<uint(i) +} +func (k *IfaceKey) hash() uintptr { + return IfaceHash(k.i, 0) +} +func (k *IfaceKey) name() string { + return "Iface" +} + +// Flipping a single bit of a key should flip each output bit with 50% probability. +func TestSmhasherAvalanche(t *testing.T) { + if !HaveGoodHash() { + t.Skip("fallback hash not good enough for this test") + } + if testing.Short() { + t.Skip("Skipping in short mode") + } + avalancheTest1(t, &BytesKey{make([]byte, 2)}) + avalancheTest1(t, &BytesKey{make([]byte, 4)}) + avalancheTest1(t, &BytesKey{make([]byte, 8)}) + avalancheTest1(t, &BytesKey{make([]byte, 16)}) + avalancheTest1(t, &BytesKey{make([]byte, 32)}) + avalancheTest1(t, &BytesKey{make([]byte, 200)}) + avalancheTest1(t, &Int32Key{}) + avalancheTest1(t, &Int64Key{}) + avalancheTest1(t, &EfaceKey{}) + avalancheTest1(t, &IfaceKey{}) +} +func avalancheTest1(t *testing.T, k Key) { + const REP = 100000 + r := rand.New(rand.NewSource(1234)) + n := k.bits() + + // grid[i][j] is a count of whether flipping + // input bit i affects output bit j. + grid := make([][hashSize]int, n) + + for z := 0; z < REP; z++ { + // pick a random key, hash it + k.random(r) + h := k.hash() + + // flip each bit, hash & compare the results + for i := 0; i < n; i++ { + k.flipBit(i) + d := h ^ k.hash() + k.flipBit(i) + + // record the effects of that bit flip + g := &grid[i] + for j := 0; j < hashSize; j++ { + g[j] += int(d & 1) + d >>= 1 + } + } + } + + // Each entry in the grid should be about REP/2. + // More precisely, we did N = k.bits() * hashSize experiments where + // each is the sum of REP coin flips. We want to find bounds on the + // sum of coin flips such that a truly random experiment would have + // all sums inside those bounds with 99% probability. 
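The bound described in the comment above is easier to see with concrete numbers. The following standalone sketch (my own illustration, mirroring the computation in avalancheTest1 for one assumed configuration: a 32-byte key and a 64-bit hash) prints the acceptance window for each grid cell:

package main

import (
	"fmt"
	"math"
)

func main() {
	const REP = 100000 // coin flips per grid cell, as in avalancheTest1
	n := 32 * 8        // key bits for a 32-byte BytesKey
	hashSize := 64     // hash output bits on a 64-bit platform
	N := n * hashSize  // independent experiments (grid cells)

	// Smallest c (stepping by 0.1) such that all N sums land in
	// mean +/- c*stddev with probability at least 99.99%.
	var c float64
	for c = 0.0; math.Pow(math.Erf(c/math.Sqrt(2)), float64(N)) < .9999; c += .1 {
	}
	c *= 4.0 // the test then allows 4x slack on top of the statistical bound
	mean := 0.5 * REP
	stddev := 0.5 * math.Sqrt(REP)
	fmt.Printf("c=%.1f: accept counts in [%d, %d] out of %d\n",
		c, int(mean-c*stddev), int(mean+c*stddev), REP)
}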
+ N := n * hashSize + var c float64 + // find c such that Prob(mean-c*stddev < x < mean+c*stddev)^N > .9999 + for c = 0.0; math.Pow(math.Erf(c/math.Sqrt(2)), float64(N)) < .9999; c += .1 { + } + c *= 4.0 // allowed slack - we don't need to be perfectly random + mean := .5 * REP + stddev := .5 * math.Sqrt(REP) + low := int(mean - c*stddev) + high := int(mean + c*stddev) + for i := 0; i < n; i++ { + for j := 0; j < hashSize; j++ { + x := grid[i][j] + if x < low || x > high { + t.Errorf("bad bias for %s bit %d -> bit %d: %d/%d\n", k.name(), i, j, x, REP) + } + } + } +} + +// All bit rotations of a set of distinct keys +func TestSmhasherWindowed(t *testing.T) { + windowed(t, &Int32Key{}) + windowed(t, &Int64Key{}) + windowed(t, &BytesKey{make([]byte, 128)}) +} +func windowed(t *testing.T, k Key) { + if testing.Short() { + t.Skip("Skipping in short mode") + } + const BITS = 16 + + for r := 0; r < k.bits(); r++ { + h := newHashSet() + for i := 0; i < 1<<BITS; i++ { + k.clear() + for j := 0; j < BITS; j++ { + if i>>uint(j)&1 != 0 { + k.flipBit((j + r) % k.bits()) + } + } + h.add(k.hash()) + } + h.check(t) + } +} + +// All keys of the form prefix + [A-Za-z0-9]*N + suffix. +func TestSmhasherText(t *testing.T) { + if testing.Short() { + t.Skip("Skipping in short mode") + } + text(t, "Foo", "Bar") + text(t, "FooBar", "") + text(t, "", "FooBar") +} +func text(t *testing.T, prefix, suffix string) { + const N = 4 + const S = "ABCDEFGHIJKLMNOPQRSTabcdefghijklmnopqrst0123456789" + const L = len(S) + b := make([]byte, len(prefix)+N+len(suffix)) + copy(b, prefix) + copy(b[len(prefix)+N:], suffix) + h := newHashSet() + c := b[len(prefix):] + for i := 0; i < L; i++ { + c[0] = S[i] + for j := 0; j < L; j++ { + c[1] = S[j] + for k := 0; k < L; k++ { + c[2] = S[k] + for x := 0; x < L; x++ { + c[3] = S[x] + h.addB(b) + } + } + } + } + h.check(t) +} + +// Make sure different seed values generate different hashes. +func TestSmhasherSeed(t *testing.T) { + h := newHashSet() + const N = 100000 + s := "hello" + for i := 0; i < N; i++ { + h.addS_seed(s, uintptr(i)) + } + h.check(t) +} + +// size of the hash output (32 or 64 bits) +const hashSize = 32 + int(^uintptr(0)>>63<<5) + +func randBytes(r *rand.Rand, b []byte) { + for i := range b { + b[i] = byte(r.Uint32()) + } +} + +func benchmarkHash(b *testing.B, n int) { + s := strings.Repeat("A", n) + + for i := 0; i < b.N; i++ { + StringHash(s, 0) + } + b.SetBytes(int64(n)) +} + +func BenchmarkHash5(b *testing.B) { benchmarkHash(b, 5) } +func BenchmarkHash16(b *testing.B) { benchmarkHash(b, 16) } +func BenchmarkHash64(b *testing.B) { benchmarkHash(b, 64) } +func BenchmarkHash1024(b *testing.B) { benchmarkHash(b, 1024) } +func BenchmarkHash65536(b *testing.B) { benchmarkHash(b, 65536) } diff --git a/src/runtime/hashmap.go b/src/runtime/hashmap.go new file mode 100644 index 000000000..b4e624423 --- /dev/null +++ b/src/runtime/hashmap.go @@ -0,0 +1,953 @@ +// Copyright 2014 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package runtime + +// This file contains the implementation of Go's map type. +// +// A map is just a hash table. The data is arranged +// into an array of buckets. Each bucket contains up to +// 8 key/value pairs. The low-order bits of the hash are +// used to select a bucket. Each bucket contains a few +// high-order bits of each hash to distinguish the entries +// within a single bucket. 
+// +// If more than 8 keys hash to a bucket, we chain on +// extra buckets. +// +// When the hashtable grows, we allocate a new array +// of buckets twice as big. Buckets are incrementally +// copied from the old bucket array to the new bucket array. +// +// Map iterators walk through the array of buckets and +// return the keys in walk order (bucket #, then overflow +// chain order, then bucket index). To maintain iteration +// semantics, we never move keys within their bucket (if +// we did, keys might be returned 0 or 2 times). When +// growing the table, iterators remain iterating through the +// old table and must check the new table if the bucket +// they are iterating through has been moved ("evacuated") +// to the new table. + +// Picking loadFactor: too large and we have lots of overflow +// buckets, too small and we waste a lot of space. I wrote +// a simple program to check some stats for different loads: +// (64-bit, 8 byte keys and values) +// loadFactor %overflow bytes/entry hitprobe missprobe +// 4.00 2.13 20.77 3.00 4.00 +// 4.50 4.05 17.30 3.25 4.50 +// 5.00 6.85 14.77 3.50 5.00 +// 5.50 10.55 12.94 3.75 5.50 +// 6.00 15.27 11.67 4.00 6.00 +// 6.50 20.90 10.79 4.25 6.50 +// 7.00 27.14 10.15 4.50 7.00 +// 7.50 34.03 9.73 4.75 7.50 +// 8.00 41.10 9.40 5.00 8.00 +// +// %overflow = percentage of buckets which have an overflow bucket +// bytes/entry = overhead bytes used per key/value pair +// hitprobe = # of entries to check when looking up a present key +// missprobe = # of entries to check when looking up an absent key +// +// Keep in mind this data is for maximally loaded tables, i.e. just +// before the table grows. Typical tables will be somewhat less loaded. + +import ( + "unsafe" +) + +const ( + // Maximum number of key/value pairs a bucket can hold. + bucketCntBits = 3 + bucketCnt = 1 << bucketCntBits + + // Maximum average load of a bucket that triggers growth. + loadFactor = 6.5 + + // Maximum key or value size to keep inline (instead of mallocing per element). + // Must fit in a uint8. + // Fast versions cannot handle big values - the cutoff size for + // fast versions in ../../cmd/gc/walk.c must be at most this value. + maxKeySize = 128 + maxValueSize = 128 + + // data offset should be the size of the bmap struct, but needs to be + // aligned correctly. For amd64p32 this means 64-bit alignment + // even though pointers are 32 bit. + dataOffset = unsafe.Offsetof(struct { + b bmap + v int64 + }{}.v) + + // Possible tophash values. We reserve a few possibilities for special marks. + // Each bucket (including its overflow buckets, if any) will have either all or none of its + // entries in the evacuated* states (except during the evacuate() method, which only happens + // during map writes and thus no one else can observe the map during that time). + empty = 0 // cell is empty + evacuatedEmpty = 1 // cell is empty, bucket is evacuated. + evacuatedX = 2 // key/value is valid. Entry has been evacuated to first half of larger table. + evacuatedY = 3 // same as above, but evacuated to second half of larger table. + minTopHash = 4 // minimum tophash for a normal filled cell. + + // flags + iterator = 1 // there may be an iterator using buckets + oldIterator = 2 // there may be an iterator using oldbuckets + + // sentinel bucket ID for iterator checks + noCheck = 1<<(8*ptrSize) - 1 + + // trigger a garbage collection at every alloc called from this code + checkgc = false +) + +// A header for a Go map. 
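A quick standalone sketch of that scheme (my own illustration with made-up values; the numbers mirror ptrSize, minTopHash and loadFactor as defined above, assuming a 64-bit platform):

package main

import "fmt"

func main() {
	const ptrSize = 8      // bytes per pointer on a 64-bit platform
	const loadFactor = 6.5 // growth threshold from the table above

	var B uint8 = 5 // 2^5 = 32 buckets
	hash := uintptr(0xfeedbeefcafef00d)

	bucket := hash & (uintptr(1)<<B - 1)  // low-order bits pick the bucket: 13
	top := uint8(hash >> (ptrSize*8 - 8)) // high byte becomes the tophash: 0xfe
	if top < 4 {                          // minTopHash: values 0-3 are reserved marks
		top += 4
	}
	fmt.Printf("bucket %d of %d, tophash %#02x\n", bucket, 1<<B, top)

	// With 32 buckets of 8 slots each, growth triggers once the map holds
	// loadFactor * 2^B = 208 entries, i.e. at roughly 81 percent occupancy.
	fmt.Printf("grow at %d entries\n", int(loadFactor*float64(uintptr(1)<<B)))
}

So for a 32-bucket table the low five bits of the hash pick the bucket, the top byte is compared against tophash before the full key is compared, and the table doubles once it holds about 208 entries.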
+type hmap struct { + // Note: the format of the Hmap is encoded in ../../cmd/gc/reflect.c and + // ../reflect/type.go. Don't change this structure without also changing that code! + count int // # live cells == size of map. Must be first (used by len() builtin) + flags uint32 + hash0 uint32 // hash seed + B uint8 // log_2 of # of buckets (can hold up to loadFactor * 2^B items) + + buckets unsafe.Pointer // array of 2^B Buckets. may be nil if count==0. + oldbuckets unsafe.Pointer // previous bucket array of half the size, non-nil only when growing + nevacuate uintptr // progress counter for evacuation (buckets less than this have been evacuated) +} + +// A bucket for a Go map. +type bmap struct { + tophash [bucketCnt]uint8 + overflow *bmap + // Followed by bucketCnt keys and then bucketCnt values. + // NOTE: packing all the keys together and then all the values together makes the + // code a bit more complicated than alternating key/value/key/value/... but it allows + // us to eliminate padding which would be needed for, e.g., map[int64]int8. +} + +// A hash iteration structure. +// If you modify hiter, also change cmd/gc/reflect.c to indicate +// the layout of this structure. +type hiter struct { + key unsafe.Pointer // Must be in first position. Write nil to indicate iteration end (see cmd/gc/range.c). + value unsafe.Pointer // Must be in second position (see cmd/gc/range.c). + t *maptype + h *hmap + buckets unsafe.Pointer // bucket ptr at hash_iter initialization time + bptr *bmap // current bucket + startBucket uintptr // bucket iteration started at + offset uint8 // intra-bucket offset to start from during iteration (should be big enough to hold bucketCnt-1) + wrapped bool // already wrapped around from end of bucket array to beginning + B uint8 + i uint8 + bucket uintptr + checkBucket uintptr +} + +func evacuated(b *bmap) bool { + h := b.tophash[0] + return h > empty && h < minTopHash +} + +func makemap(t *maptype, hint int64) *hmap { + if sz := unsafe.Sizeof(hmap{}); sz > 48 || sz != uintptr(t.hmap.size) { + gothrow("bad hmap size") + } + + if hint < 0 || int64(int32(hint)) != hint { + panic("makemap: size out of range") + // TODO: make hint an int, then none of this nonsense + } + + if !ismapkey(t.key) { + gothrow("runtime.makemap: unsupported map key type") + } + + // check compiler's and reflect's math + if t.key.size > maxKeySize && (!t.indirectkey || t.keysize != uint8(ptrSize)) || + t.key.size <= maxKeySize && (t.indirectkey || t.keysize != uint8(t.key.size)) { + gothrow("key size wrong") + } + if t.elem.size > maxValueSize && (!t.indirectvalue || t.valuesize != uint8(ptrSize)) || + t.elem.size <= maxValueSize && (t.indirectvalue || t.valuesize != uint8(t.elem.size)) { + gothrow("value size wrong") + } + + // invariants we depend on. We should probably check these at compile time + // somewhere, but for now we'll do it here. 
+ if t.key.align > bucketCnt { + gothrow("key align too big") + } + if t.elem.align > bucketCnt { + gothrow("value align too big") + } + if uintptr(t.key.size)%uintptr(t.key.align) != 0 { + gothrow("key size not a multiple of key align") + } + if uintptr(t.elem.size)%uintptr(t.elem.align) != 0 { + gothrow("value size not a multiple of value align") + } + if bucketCnt < 8 { + gothrow("bucketsize too small for proper alignment") + } + if dataOffset%uintptr(t.key.align) != 0 { + gothrow("need padding in bucket (key)") + } + if dataOffset%uintptr(t.elem.align) != 0 { + gothrow("need padding in bucket (value)") + } + + // find size parameter which will hold the requested # of elements + B := uint8(0) + for ; hint > bucketCnt && float32(hint) > loadFactor*float32(uintptr(1)<<B); B++ { + } + + // allocate initial hash table + // if B == 0, the buckets field is allocated lazily later (in mapassign) + // If hint is large zeroing this memory could take a while. + var buckets unsafe.Pointer + if B != 0 { + if checkgc { + memstats.next_gc = memstats.heap_alloc + } + buckets = newarray(t.bucket, uintptr(1)<<B) + } + + // initialize Hmap + if checkgc { + memstats.next_gc = memstats.heap_alloc + } + h := (*hmap)(newobject(t.hmap)) + h.count = 0 + h.B = B + h.flags = 0 + h.hash0 = fastrand1() + h.buckets = buckets + h.oldbuckets = nil + h.nevacuate = 0 + + return h +} + +// mapaccess1 returns a pointer to h[key]. Never returns nil, instead +// it will return a reference to the zero object for the value type if +// the key is not in the map. +// NOTE: The returned pointer may keep the whole map live, so don't +// hold onto it for very long. +func mapaccess1(t *maptype, h *hmap, key unsafe.Pointer) unsafe.Pointer { + if raceenabled && h != nil { + callerpc := getcallerpc(unsafe.Pointer(&t)) + pc := funcPC(mapaccess1) + racereadpc(unsafe.Pointer(h), callerpc, pc) + raceReadObjectPC(t.key, key, callerpc, pc) + } + if h == nil || h.count == 0 { + return unsafe.Pointer(t.elem.zero) + } + alg := goalg(t.key.alg) + hash := alg.hash(key, uintptr(t.key.size), uintptr(h.hash0)) + m := uintptr(1)<<h.B - 1 + b := (*bmap)(add(h.buckets, (hash&m)*uintptr(t.bucketsize))) + if c := h.oldbuckets; c != nil { + oldb := (*bmap)(add(c, (hash&(m>>1))*uintptr(t.bucketsize))) + if !evacuated(oldb) { + b = oldb + } + } + top := uint8(hash >> (ptrSize*8 - 8)) + if top < minTopHash { + top += minTopHash + } + for { + for i := uintptr(0); i < bucketCnt; i++ { + if b.tophash[i] != top { + continue + } + k := add(unsafe.Pointer(b), dataOffset+i*uintptr(t.keysize)) + if t.indirectkey { + k = *((*unsafe.Pointer)(k)) + } + if alg.equal(key, k, uintptr(t.key.size)) { + v := add(unsafe.Pointer(b), dataOffset+bucketCnt*uintptr(t.keysize)+i*uintptr(t.valuesize)) + if t.indirectvalue { + v = *((*unsafe.Pointer)(v)) + } + return v + } + } + b = b.overflow + if b == nil { + return unsafe.Pointer(t.elem.zero) + } + } +} + +func mapaccess2(t *maptype, h *hmap, key unsafe.Pointer) (unsafe.Pointer, bool) { + if raceenabled && h != nil { + callerpc := getcallerpc(unsafe.Pointer(&t)) + pc := funcPC(mapaccess2) + racereadpc(unsafe.Pointer(h), callerpc, pc) + raceReadObjectPC(t.key, key, callerpc, pc) + } + if h == nil || h.count == 0 { + return unsafe.Pointer(t.elem.zero), false + } + alg := goalg(t.key.alg) + hash := alg.hash(key, uintptr(t.key.size), uintptr(h.hash0)) + m := uintptr(1)<<h.B - 1 + b := (*bmap)(unsafe.Pointer(uintptr(h.buckets) + (hash&m)*uintptr(t.bucketsize))) + if c := h.oldbuckets; c != nil { + oldb := 
(*bmap)(unsafe.Pointer(uintptr(c) + (hash&(m>>1))*uintptr(t.bucketsize))) + if !evacuated(oldb) { + b = oldb + } + } + top := uint8(hash >> (ptrSize*8 - 8)) + if top < minTopHash { + top += minTopHash + } + for { + for i := uintptr(0); i < bucketCnt; i++ { + if b.tophash[i] != top { + continue + } + k := add(unsafe.Pointer(b), dataOffset+i*uintptr(t.keysize)) + if t.indirectkey { + k = *((*unsafe.Pointer)(k)) + } + if alg.equal(key, k, uintptr(t.key.size)) { + v := add(unsafe.Pointer(b), dataOffset+bucketCnt*uintptr(t.keysize)+i*uintptr(t.valuesize)) + if t.indirectvalue { + v = *((*unsafe.Pointer)(v)) + } + return v, true + } + } + b = b.overflow + if b == nil { + return unsafe.Pointer(t.elem.zero), false + } + } +} + +// returns both key and value. Used by map iterator +func mapaccessK(t *maptype, h *hmap, key unsafe.Pointer) (unsafe.Pointer, unsafe.Pointer) { + if h == nil || h.count == 0 { + return nil, nil + } + alg := goalg(t.key.alg) + hash := alg.hash(key, uintptr(t.key.size), uintptr(h.hash0)) + m := uintptr(1)<<h.B - 1 + b := (*bmap)(unsafe.Pointer(uintptr(h.buckets) + (hash&m)*uintptr(t.bucketsize))) + if c := h.oldbuckets; c != nil { + oldb := (*bmap)(unsafe.Pointer(uintptr(c) + (hash&(m>>1))*uintptr(t.bucketsize))) + if !evacuated(oldb) { + b = oldb + } + } + top := uint8(hash >> (ptrSize*8 - 8)) + if top < minTopHash { + top += minTopHash + } + for { + for i := uintptr(0); i < bucketCnt; i++ { + if b.tophash[i] != top { + continue + } + k := add(unsafe.Pointer(b), dataOffset+i*uintptr(t.keysize)) + if t.indirectkey { + k = *((*unsafe.Pointer)(k)) + } + if alg.equal(key, k, uintptr(t.key.size)) { + v := add(unsafe.Pointer(b), dataOffset+bucketCnt*uintptr(t.keysize)+i*uintptr(t.valuesize)) + if t.indirectvalue { + v = *((*unsafe.Pointer)(v)) + } + return k, v + } + } + b = b.overflow + if b == nil { + return nil, nil + } + } +} + +func mapassign1(t *maptype, h *hmap, key unsafe.Pointer, val unsafe.Pointer) { + if h == nil { + panic("assignment to entry in nil map") + } + if raceenabled { + callerpc := getcallerpc(unsafe.Pointer(&t)) + pc := funcPC(mapassign1) + racewritepc(unsafe.Pointer(h), callerpc, pc) + raceReadObjectPC(t.key, key, callerpc, pc) + raceReadObjectPC(t.elem, val, callerpc, pc) + } + + alg := goalg(t.key.alg) + hash := alg.hash(key, uintptr(t.key.size), uintptr(h.hash0)) + + if h.buckets == nil { + if checkgc { + memstats.next_gc = memstats.heap_alloc + } + h.buckets = newarray(t.bucket, 1) + } + +again: + bucket := hash & (uintptr(1)<<h.B - 1) + if h.oldbuckets != nil { + growWork(t, h, bucket) + } + b := (*bmap)(unsafe.Pointer(uintptr(h.buckets) + bucket*uintptr(t.bucketsize))) + top := uint8(hash >> (ptrSize*8 - 8)) + if top < minTopHash { + top += minTopHash + } + + var inserti *uint8 + var insertk unsafe.Pointer + var insertv unsafe.Pointer + for { + for i := uintptr(0); i < bucketCnt; i++ { + if b.tophash[i] != top { + if b.tophash[i] == empty && inserti == nil { + inserti = &b.tophash[i] + insertk = add(unsafe.Pointer(b), dataOffset+i*uintptr(t.keysize)) + insertv = add(unsafe.Pointer(b), dataOffset+bucketCnt*uintptr(t.keysize)+i*uintptr(t.valuesize)) + } + continue + } + k := add(unsafe.Pointer(b), dataOffset+i*uintptr(t.keysize)) + k2 := k + if t.indirectkey { + k2 = *((*unsafe.Pointer)(k2)) + } + if !alg.equal(key, k2, uintptr(t.key.size)) { + continue + } + // already have a mapping for key. Update it. 
+ memmove(k2, key, uintptr(t.key.size)) + v := add(unsafe.Pointer(b), dataOffset+bucketCnt*uintptr(t.keysize)+i*uintptr(t.valuesize)) + v2 := v + if t.indirectvalue { + v2 = *((*unsafe.Pointer)(v2)) + } + memmove(v2, val, uintptr(t.elem.size)) + return + } + if b.overflow == nil { + break + } + b = b.overflow + } + + // did not find mapping for key. Allocate new cell & add entry. + if float32(h.count) >= loadFactor*float32((uintptr(1)<<h.B)) && h.count >= bucketCnt { + hashGrow(t, h) + goto again // Growing the table invalidates everything, so try again + } + + if inserti == nil { + // all current buckets are full, allocate a new one. + if checkgc { + memstats.next_gc = memstats.heap_alloc + } + newb := (*bmap)(newobject(t.bucket)) + b.overflow = newb + inserti = &newb.tophash[0] + insertk = add(unsafe.Pointer(newb), dataOffset) + insertv = add(insertk, bucketCnt*uintptr(t.keysize)) + } + + // store new key/value at insert position + if t.indirectkey { + if checkgc { + memstats.next_gc = memstats.heap_alloc + } + kmem := newobject(t.key) + *(*unsafe.Pointer)(insertk) = kmem + insertk = kmem + } + if t.indirectvalue { + if checkgc { + memstats.next_gc = memstats.heap_alloc + } + vmem := newobject(t.elem) + *(*unsafe.Pointer)(insertv) = vmem + insertv = vmem + } + memmove(insertk, key, uintptr(t.key.size)) + memmove(insertv, val, uintptr(t.elem.size)) + *inserti = top + h.count++ +} + +func mapdelete(t *maptype, h *hmap, key unsafe.Pointer) { + if raceenabled && h != nil { + callerpc := getcallerpc(unsafe.Pointer(&t)) + pc := funcPC(mapdelete) + racewritepc(unsafe.Pointer(h), callerpc, pc) + raceReadObjectPC(t.key, key, callerpc, pc) + } + if h == nil || h.count == 0 { + return + } + alg := goalg(t.key.alg) + hash := alg.hash(key, uintptr(t.key.size), uintptr(h.hash0)) + bucket := hash & (uintptr(1)<<h.B - 1) + if h.oldbuckets != nil { + growWork(t, h, bucket) + } + b := (*bmap)(unsafe.Pointer(uintptr(h.buckets) + bucket*uintptr(t.bucketsize))) + top := uint8(hash >> (ptrSize*8 - 8)) + if top < minTopHash { + top += minTopHash + } + for { + for i := uintptr(0); i < bucketCnt; i++ { + if b.tophash[i] != top { + continue + } + k := add(unsafe.Pointer(b), dataOffset+i*uintptr(t.keysize)) + k2 := k + if t.indirectkey { + k2 = *((*unsafe.Pointer)(k2)) + } + if !alg.equal(key, k2, uintptr(t.key.size)) { + continue + } + memclr(k, uintptr(t.keysize)) + v := unsafe.Pointer(uintptr(unsafe.Pointer(b)) + dataOffset + bucketCnt*uintptr(t.keysize) + i*uintptr(t.valuesize)) + memclr(v, uintptr(t.valuesize)) + b.tophash[i] = empty + h.count-- + return + } + b = b.overflow + if b == nil { + return + } + } +} + +func mapiterinit(t *maptype, h *hmap, it *hiter) { + // Clear pointer fields so garbage collector does not complain. 
+ it.key = nil + it.value = nil + it.t = nil + it.h = nil + it.buckets = nil + it.bptr = nil + + if raceenabled && h != nil { + callerpc := getcallerpc(unsafe.Pointer(&t)) + racereadpc(unsafe.Pointer(h), callerpc, funcPC(mapiterinit)) + } + + if h == nil || h.count == 0 { + it.key = nil + it.value = nil + return + } + + if unsafe.Sizeof(hiter{})/ptrSize != 10 { + gothrow("hash_iter size incorrect") // see ../../cmd/gc/reflect.c + } + it.t = t + it.h = h + + // grab snapshot of bucket state + it.B = h.B + it.buckets = h.buckets + + // decide where to start + r := uintptr(fastrand1()) + if h.B > 31-bucketCntBits { + r += uintptr(fastrand1()) << 31 + } + it.startBucket = r & (uintptr(1)<<h.B - 1) + it.offset = uint8(r >> h.B & (bucketCnt - 1)) + + // iterator state + it.bucket = it.startBucket + it.wrapped = false + it.bptr = nil + + // Remember we have an iterator. + // Can run concurrently with another hash_iter_init(). + for { + old := h.flags + if old == old|iterator|oldIterator { + break + } + if cas(&h.flags, old, old|iterator|oldIterator) { + break + } + } + + mapiternext(it) +} + +func mapiternext(it *hiter) { + h := it.h + if raceenabled { + callerpc := getcallerpc(unsafe.Pointer(&it)) + racereadpc(unsafe.Pointer(h), callerpc, funcPC(mapiternext)) + } + t := it.t + bucket := it.bucket + b := it.bptr + i := it.i + checkBucket := it.checkBucket + alg := goalg(t.key.alg) + +next: + if b == nil { + if bucket == it.startBucket && it.wrapped { + // end of iteration + it.key = nil + it.value = nil + return + } + if h.oldbuckets != nil && it.B == h.B { + // Iterator was started in the middle of a grow, and the grow isn't done yet. + // If the bucket we're looking at hasn't been filled in yet (i.e. the old + // bucket hasn't been evacuated) then we need to iterate through the old + // bucket and only return the ones that will be migrated to this bucket. + oldbucket := bucket & (uintptr(1)<<(it.B-1) - 1) + b = (*bmap)(add(h.oldbuckets, oldbucket*uintptr(t.bucketsize))) + if !evacuated(b) { + checkBucket = bucket + } else { + b = (*bmap)(add(it.buckets, bucket*uintptr(t.bucketsize))) + checkBucket = noCheck + } + } else { + b = (*bmap)(add(it.buckets, bucket*uintptr(t.bucketsize))) + checkBucket = noCheck + } + bucket++ + if bucket == uintptr(1)<<it.B { + bucket = 0 + it.wrapped = true + } + i = 0 + } + for ; i < bucketCnt; i++ { + offi := (i + it.offset) & (bucketCnt - 1) + k := add(unsafe.Pointer(b), dataOffset+uintptr(offi)*uintptr(t.keysize)) + v := add(unsafe.Pointer(b), dataOffset+bucketCnt*uintptr(t.keysize)+uintptr(offi)*uintptr(t.valuesize)) + if b.tophash[offi] != empty && b.tophash[offi] != evacuatedEmpty { + if checkBucket != noCheck { + // Special case: iterator was started during a grow and the + // grow is not done yet. We're working on a bucket whose + // oldbucket has not been evacuated yet. Or at least, it wasn't + // evacuated when we started the bucket. So we're iterating + // through the oldbucket, skipping any keys that will go + // to the other new bucket (each oldbucket expands to two + // buckets during a grow). + k2 := k + if t.indirectkey { + k2 = *((*unsafe.Pointer)(k2)) + } + if alg.equal(k2, k2, uintptr(t.key.size)) { + // If the item in the oldbucket is not destined for + // the current new bucket in the iteration, skip it. + hash := alg.hash(k2, uintptr(t.key.size), uintptr(h.hash0)) + if hash&(uintptr(1)<<it.B-1) != checkBucket { + continue + } + } else { + // Hash isn't repeatable if k != k (NaNs). 
We need a + // repeatable and randomish choice of which direction + // to send NaNs during evacuation. We'll use the low + // bit of tophash to decide which way NaNs go. + // NOTE: this case is why we need two evacuate tophash + // values, evacuatedX and evacuatedY, that differ in + // their low bit. + if checkBucket>>(it.B-1) != uintptr(b.tophash[offi]&1) { + continue + } + } + } + if b.tophash[offi] != evacuatedX && b.tophash[offi] != evacuatedY { + // this is the golden data, we can return it. + if t.indirectkey { + k = *((*unsafe.Pointer)(k)) + } + it.key = k + if t.indirectvalue { + v = *((*unsafe.Pointer)(v)) + } + it.value = v + } else { + // The hash table has grown since the iterator was started. + // The golden data for this key is now somewhere else. + k2 := k + if t.indirectkey { + k2 = *((*unsafe.Pointer)(k2)) + } + if alg.equal(k2, k2, uintptr(t.key.size)) { + // Check the current hash table for the data. + // This code handles the case where the key + // has been deleted, updated, or deleted and reinserted. + // NOTE: we need to regrab the key as it has potentially been + // updated to an equal() but not identical key (e.g. +0.0 vs -0.0). + rk, rv := mapaccessK(t, h, k2) + if rk == nil { + continue // key has been deleted + } + it.key = rk + it.value = rv + } else { + // if key!=key then the entry can't be deleted or + // updated, so we can just return it. That's lucky for + // us because when key!=key we can't look it up + // successfully in the current table. + it.key = k2 + if t.indirectvalue { + v = *((*unsafe.Pointer)(v)) + } + it.value = v + } + } + it.bucket = bucket + it.bptr = b + it.i = i + 1 + it.checkBucket = checkBucket + return + } + } + b = b.overflow + i = 0 + goto next +} + +func hashGrow(t *maptype, h *hmap) { + if h.oldbuckets != nil { + gothrow("evacuation not done in time") + } + oldbuckets := h.buckets + if checkgc { + memstats.next_gc = memstats.heap_alloc + } + newbuckets := newarray(t.bucket, uintptr(1)<<(h.B+1)) + flags := h.flags &^ (iterator | oldIterator) + if h.flags&iterator != 0 { + flags |= oldIterator + } + // commit the grow (atomic wrt gc) + h.B++ + h.flags = flags + h.oldbuckets = oldbuckets + h.buckets = newbuckets + h.nevacuate = 0 + + // the actual copying of the hash table data is done incrementally + // by growWork() and evacuate(). +} + +func growWork(t *maptype, h *hmap, bucket uintptr) { + noldbuckets := uintptr(1) << (h.B - 1) + + // make sure we evacuate the oldbucket corresponding + // to the bucket we're about to use + evacuate(t, h, bucket&(noldbuckets-1)) + + // evacuate one more oldbucket to make progress on growing + if h.oldbuckets != nil { + evacuate(t, h, h.nevacuate) + } +} + +func evacuate(t *maptype, h *hmap, oldbucket uintptr) { + b := (*bmap)(add(h.oldbuckets, oldbucket*uintptr(t.bucketsize))) + newbit := uintptr(1) << (h.B - 1) + alg := goalg(t.key.alg) + if !evacuated(b) { + // TODO: reuse overflow buckets instead of using new ones, if there + // is no iterator using the old buckets. (If !oldIterator.) 
+ + x := (*bmap)(add(h.buckets, oldbucket*uintptr(t.bucketsize))) + y := (*bmap)(add(h.buckets, (oldbucket+newbit)*uintptr(t.bucketsize))) + xi := 0 + yi := 0 + xk := add(unsafe.Pointer(x), dataOffset) + yk := add(unsafe.Pointer(y), dataOffset) + xv := add(xk, bucketCnt*uintptr(t.keysize)) + yv := add(yk, bucketCnt*uintptr(t.keysize)) + for ; b != nil; b = b.overflow { + k := add(unsafe.Pointer(b), dataOffset) + v := add(k, bucketCnt*uintptr(t.keysize)) + for i := 0; i < bucketCnt; i, k, v = i+1, add(k, uintptr(t.keysize)), add(v, uintptr(t.valuesize)) { + top := b.tophash[i] + if top == empty { + b.tophash[i] = evacuatedEmpty + continue + } + if top < minTopHash { + gothrow("bad map state") + } + k2 := k + if t.indirectkey { + k2 = *((*unsafe.Pointer)(k2)) + } + // Compute hash to make our evacuation decision (whether we need + // to send this key/value to bucket x or bucket y). + hash := alg.hash(k2, uintptr(t.key.size), uintptr(h.hash0)) + if h.flags&iterator != 0 { + if !alg.equal(k2, k2, uintptr(t.key.size)) { + // If key != key (NaNs), then the hash could be (and probably + // will be) entirely different from the old hash. Moreover, + // it isn't reproducible. Reproducibility is required in the + // presence of iterators, as our evacuation decision must + // match whatever decision the iterator made. + // Fortunately, we have the freedom to send these keys either + // way. Also, tophash is meaningless for these kinds of keys. + // We let the low bit of tophash drive the evacuation decision. + // We recompute a new random tophash for the next level so + // these keys will get evenly distributed across all buckets + // after multiple grows. + if (top & 1) != 0 { + hash |= newbit + } else { + hash &^= newbit + } + top = uint8(hash >> (ptrSize*8 - 8)) + if top < minTopHash { + top += minTopHash + } + } + } + if (hash & newbit) == 0 { + b.tophash[i] = evacuatedX + if xi == bucketCnt { + if checkgc { + memstats.next_gc = memstats.heap_alloc + } + newx := (*bmap)(newobject(t.bucket)) + x.overflow = newx + x = newx + xi = 0 + xk = add(unsafe.Pointer(x), dataOffset) + xv = add(xk, bucketCnt*uintptr(t.keysize)) + } + x.tophash[xi] = top + if t.indirectkey { + *(*unsafe.Pointer)(xk) = k2 // copy pointer + } else { + memmove(xk, k, uintptr(t.key.size)) // copy value + } + if t.indirectvalue { + *(*unsafe.Pointer)(xv) = *(*unsafe.Pointer)(v) + } else { + memmove(xv, v, uintptr(t.elem.size)) + } + xi++ + xk = add(xk, uintptr(t.keysize)) + xv = add(xv, uintptr(t.valuesize)) + } else { + b.tophash[i] = evacuatedY + if yi == bucketCnt { + if checkgc { + memstats.next_gc = memstats.heap_alloc + } + newy := (*bmap)(newobject(t.bucket)) + y.overflow = newy + y = newy + yi = 0 + yk = add(unsafe.Pointer(y), dataOffset) + yv = add(yk, bucketCnt*uintptr(t.keysize)) + } + y.tophash[yi] = top + if t.indirectkey { + *(*unsafe.Pointer)(yk) = k2 + } else { + memmove(yk, k, uintptr(t.key.size)) + } + if t.indirectvalue { + *(*unsafe.Pointer)(yv) = *(*unsafe.Pointer)(v) + } else { + memmove(yv, v, uintptr(t.elem.size)) + } + yi++ + yk = add(yk, uintptr(t.keysize)) + yv = add(yv, uintptr(t.valuesize)) + } + } + } + // Unlink the overflow buckets & clear key/value to help GC. 
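Concretely, the X/Y split works out as follows; this is a standalone sketch with made-up numbers, not runtime code:

package main

import "fmt"

func main() {
	oldB := uint(3)              // old table: 1<<3 = 8 buckets
	newbit := uintptr(1) << oldB // 8, the old bucket count
	hash := uintptr(13)          // binary 1101: low bits of some key's hash

	oldbucket := hash & (newbit - 1) // 5: its bucket before the grow
	newbucket := oldbucket           // "X": same index in the grown table
	if hash&newbit != 0 {
		newbucket = oldbucket + newbit // "Y": second half of the grown table
	}
	fmt.Printf("old bucket %d -> new bucket %d of %d\n", oldbucket, newbucket, 2*newbit)
}

Each old bucket therefore feeds exactly two new buckets, and the decision depends only on hash & newbit, which is the same decision the checkBucket logic in mapiternext has to reproduce for iterators that started before the grow.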
+ if h.flags&oldIterator == 0 { + b = (*bmap)(add(h.oldbuckets, oldbucket*uintptr(t.bucketsize))) + b.overflow = nil + memclr(add(unsafe.Pointer(b), dataOffset), uintptr(t.bucketsize)-dataOffset) + } + } + + // Advance evacuation mark + if oldbucket == h.nevacuate { + h.nevacuate = oldbucket + 1 + if oldbucket+1 == newbit { // newbit == # of oldbuckets + // Growing is all done. Free old main bucket array. + h.oldbuckets = nil + } + } +} + +func ismapkey(t *_type) bool { + return goalg(t.alg).hash != nil +} + +// Reflect stubs. Called from ../reflect/asm_*.s + +func reflect_makemap(t *maptype) *hmap { + return makemap(t, 0) +} + +func reflect_mapaccess(t *maptype, h *hmap, key unsafe.Pointer) unsafe.Pointer { + val, ok := mapaccess2(t, h, key) + if !ok { + // reflect wants nil for a missing element + val = nil + } + return val +} + +func reflect_mapassign(t *maptype, h *hmap, key unsafe.Pointer, val unsafe.Pointer) { + mapassign1(t, h, key, val) +} + +func reflect_mapdelete(t *maptype, h *hmap, key unsafe.Pointer) { + mapdelete(t, h, key) +} + +func reflect_mapiterinit(t *maptype, h *hmap) *hiter { + it := new(hiter) + mapiterinit(t, h, it) + return it +} + +func reflect_mapiternext(it *hiter) { + mapiternext(it) +} + +func reflect_mapiterkey(it *hiter) unsafe.Pointer { + return it.key +} + +func reflect_maplen(h *hmap) int { + if h == nil { + return 0 + } + if raceenabled { + callerpc := getcallerpc(unsafe.Pointer(&h)) + racereadpc(unsafe.Pointer(h), callerpc, funcPC(reflect_maplen)) + } + return h.count +} + +func reflect_ismapkey(t *_type) bool { + return ismapkey(t) +} diff --git a/src/runtime/hashmap_fast.go b/src/runtime/hashmap_fast.go new file mode 100644 index 000000000..8e21e02d6 --- /dev/null +++ b/src/runtime/hashmap_fast.go @@ -0,0 +1,379 @@ +// Copyright 2014 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package runtime + +import ( + "unsafe" +) + +func mapaccess1_fast32(t *maptype, h *hmap, key uint32) unsafe.Pointer { + if raceenabled && h != nil { + callerpc := getcallerpc(unsafe.Pointer(&t)) + racereadpc(unsafe.Pointer(h), callerpc, funcPC(mapaccess1_fast32)) + } + if h == nil || h.count == 0 { + return unsafe.Pointer(t.elem.zero) + } + var b *bmap + if h.B == 0 { + // One-bucket table. No need to hash. + b = (*bmap)(h.buckets) + } else { + hash := goalg(t.key.alg).hash(noescape(unsafe.Pointer(&key)), 4, uintptr(h.hash0)) + m := uintptr(1)<<h.B - 1 + b = (*bmap)(add(h.buckets, (hash&m)*uintptr(t.bucketsize))) + if c := h.oldbuckets; c != nil { + oldb := (*bmap)(add(c, (hash&(m>>1))*uintptr(t.bucketsize))) + if !evacuated(oldb) { + b = oldb + } + } + } + for { + for i := uintptr(0); i < bucketCnt; i++ { + k := *((*uint32)(add(unsafe.Pointer(b), dataOffset+i*4))) + if k != key { + continue + } + x := *((*uint8)(add(unsafe.Pointer(b), i))) // b.topbits[i] without the bounds check + if x == empty { + continue + } + return add(unsafe.Pointer(b), dataOffset+bucketCnt*4+i*uintptr(t.valuesize)) + } + b = b.overflow + if b == nil { + return unsafe.Pointer(t.elem.zero) + } + } +} + +func mapaccess2_fast32(t *maptype, h *hmap, key uint32) (unsafe.Pointer, bool) { + if raceenabled && h != nil { + callerpc := getcallerpc(unsafe.Pointer(&t)) + racereadpc(unsafe.Pointer(h), callerpc, funcPC(mapaccess2_fast32)) + } + if h == nil || h.count == 0 { + return unsafe.Pointer(t.elem.zero), false + } + var b *bmap + if h.B == 0 { + // One-bucket table. No need to hash. 
+ b = (*bmap)(h.buckets) + } else { + hash := goalg(t.key.alg).hash(noescape(unsafe.Pointer(&key)), 4, uintptr(h.hash0)) + m := uintptr(1)<<h.B - 1 + b = (*bmap)(add(h.buckets, (hash&m)*uintptr(t.bucketsize))) + if c := h.oldbuckets; c != nil { + oldb := (*bmap)(add(c, (hash&(m>>1))*uintptr(t.bucketsize))) + if !evacuated(oldb) { + b = oldb + } + } + } + for { + for i := uintptr(0); i < bucketCnt; i++ { + k := *((*uint32)(add(unsafe.Pointer(b), dataOffset+i*4))) + if k != key { + continue + } + x := *((*uint8)(add(unsafe.Pointer(b), i))) // b.topbits[i] without the bounds check + if x == empty { + continue + } + return add(unsafe.Pointer(b), dataOffset+bucketCnt*4+i*uintptr(t.valuesize)), true + } + b = b.overflow + if b == nil { + return unsafe.Pointer(t.elem.zero), false + } + } +} + +func mapaccess1_fast64(t *maptype, h *hmap, key uint64) unsafe.Pointer { + if raceenabled && h != nil { + callerpc := getcallerpc(unsafe.Pointer(&t)) + racereadpc(unsafe.Pointer(h), callerpc, funcPC(mapaccess1_fast64)) + } + if h == nil || h.count == 0 { + return unsafe.Pointer(t.elem.zero) + } + var b *bmap + if h.B == 0 { + // One-bucket table. No need to hash. + b = (*bmap)(h.buckets) + } else { + hash := goalg(t.key.alg).hash(noescape(unsafe.Pointer(&key)), 8, uintptr(h.hash0)) + m := uintptr(1)<<h.B - 1 + b = (*bmap)(add(h.buckets, (hash&m)*uintptr(t.bucketsize))) + if c := h.oldbuckets; c != nil { + oldb := (*bmap)(add(c, (hash&(m>>1))*uintptr(t.bucketsize))) + if !evacuated(oldb) { + b = oldb + } + } + } + for { + for i := uintptr(0); i < bucketCnt; i++ { + k := *((*uint64)(add(unsafe.Pointer(b), dataOffset+i*8))) + if k != key { + continue + } + x := *((*uint8)(add(unsafe.Pointer(b), i))) // b.topbits[i] without the bounds check + if x == empty { + continue + } + return add(unsafe.Pointer(b), dataOffset+bucketCnt*8+i*uintptr(t.valuesize)) + } + b = b.overflow + if b == nil { + return unsafe.Pointer(t.elem.zero) + } + } +} + +func mapaccess2_fast64(t *maptype, h *hmap, key uint64) (unsafe.Pointer, bool) { + if raceenabled && h != nil { + callerpc := getcallerpc(unsafe.Pointer(&t)) + racereadpc(unsafe.Pointer(h), callerpc, funcPC(mapaccess2_fast64)) + } + if h == nil || h.count == 0 { + return unsafe.Pointer(t.elem.zero), false + } + var b *bmap + if h.B == 0 { + // One-bucket table. No need to hash. + b = (*bmap)(h.buckets) + } else { + hash := goalg(t.key.alg).hash(noescape(unsafe.Pointer(&key)), 8, uintptr(h.hash0)) + m := uintptr(1)<<h.B - 1 + b = (*bmap)(add(h.buckets, (hash&m)*uintptr(t.bucketsize))) + if c := h.oldbuckets; c != nil { + oldb := (*bmap)(add(c, (hash&(m>>1))*uintptr(t.bucketsize))) + if !evacuated(oldb) { + b = oldb + } + } + } + for { + for i := uintptr(0); i < bucketCnt; i++ { + k := *((*uint64)(add(unsafe.Pointer(b), dataOffset+i*8))) + if k != key { + continue + } + x := *((*uint8)(add(unsafe.Pointer(b), i))) // b.topbits[i] without the bounds check + if x == empty { + continue + } + return add(unsafe.Pointer(b), dataOffset+bucketCnt*8+i*uintptr(t.valuesize)), true + } + b = b.overflow + if b == nil { + return unsafe.Pointer(t.elem.zero), false + } + } +} + +func mapaccess1_faststr(t *maptype, h *hmap, ky string) unsafe.Pointer { + if raceenabled && h != nil { + callerpc := getcallerpc(unsafe.Pointer(&t)) + racereadpc(unsafe.Pointer(h), callerpc, funcPC(mapaccess1_faststr)) + } + if h == nil || h.count == 0 { + return unsafe.Pointer(t.elem.zero) + } + key := (*stringStruct)(unsafe.Pointer(&ky)) + if h.B == 0 { + // One-bucket table. 
+ b := (*bmap)(h.buckets) + if key.len < 32 { + // short key, doing lots of comparisons is ok + for i := uintptr(0); i < bucketCnt; i++ { + x := *((*uint8)(add(unsafe.Pointer(b), i))) // b.topbits[i] without the bounds check + if x == empty { + continue + } + k := (*stringStruct)(add(unsafe.Pointer(b), dataOffset+i*2*ptrSize)) + if k.len != key.len { + continue + } + if k.str == key.str || memeq(k.str, key.str, uintptr(key.len)) { + return add(unsafe.Pointer(b), dataOffset+bucketCnt*2*ptrSize+i*uintptr(t.valuesize)) + } + } + return unsafe.Pointer(t.elem.zero) + } + // long key, try not to do more comparisons than necessary + keymaybe := uintptr(bucketCnt) + for i := uintptr(0); i < bucketCnt; i++ { + x := *((*uint8)(add(unsafe.Pointer(b), i))) // b.topbits[i] without the bounds check + if x == empty { + continue + } + k := (*stringStruct)(add(unsafe.Pointer(b), dataOffset+i*2*ptrSize)) + if k.len != key.len { + continue + } + if k.str == key.str { + return add(unsafe.Pointer(b), dataOffset+bucketCnt*2*ptrSize+i*uintptr(t.valuesize)) + } + // check first 4 bytes + // TODO: on amd64/386 at least, make this compile to one 4-byte comparison instead of + // four 1-byte comparisons. + if *((*[4]byte)(key.str)) != *((*[4]byte)(k.str)) { + continue + } + // check last 4 bytes + if *((*[4]byte)(add(key.str, uintptr(key.len)-4))) != *((*[4]byte)(add(k.str, uintptr(key.len)-4))) { + continue + } + if keymaybe != bucketCnt { + // Two keys are potential matches. Use hash to distinguish them. + goto dohash + } + keymaybe = i + } + if keymaybe != bucketCnt { + k := (*stringStruct)(add(unsafe.Pointer(b), dataOffset+keymaybe*2*ptrSize)) + if memeq(k.str, key.str, uintptr(key.len)) { + return add(unsafe.Pointer(b), dataOffset+bucketCnt*2*ptrSize+keymaybe*uintptr(t.valuesize)) + } + } + return unsafe.Pointer(t.elem.zero) + } +dohash: + hash := goalg(t.key.alg).hash(noescape(unsafe.Pointer(&ky)), 2*ptrSize, uintptr(h.hash0)) + m := uintptr(1)<<h.B - 1 + b := (*bmap)(add(h.buckets, (hash&m)*uintptr(t.bucketsize))) + if c := h.oldbuckets; c != nil { + oldb := (*bmap)(add(c, (hash&(m>>1))*uintptr(t.bucketsize))) + if !evacuated(oldb) { + b = oldb + } + } + top := uint8(hash >> (ptrSize*8 - 8)) + if top < minTopHash { + top += minTopHash + } + for { + for i := uintptr(0); i < bucketCnt; i++ { + x := *((*uint8)(add(unsafe.Pointer(b), i))) // b.topbits[i] without the bounds check + if x != top { + continue + } + k := (*stringStruct)(add(unsafe.Pointer(b), dataOffset+i*2*ptrSize)) + if k.len != key.len { + continue + } + if k.str == key.str || memeq(k.str, key.str, uintptr(key.len)) { + return add(unsafe.Pointer(b), dataOffset+bucketCnt*2*ptrSize+i*uintptr(t.valuesize)) + } + } + b = b.overflow + if b == nil { + return unsafe.Pointer(t.elem.zero) + } + } +} + +func mapaccess2_faststr(t *maptype, h *hmap, ky string) (unsafe.Pointer, bool) { + if raceenabled && h != nil { + callerpc := getcallerpc(unsafe.Pointer(&t)) + racereadpc(unsafe.Pointer(h), callerpc, funcPC(mapaccess2_faststr)) + } + if h == nil || h.count == 0 { + return unsafe.Pointer(t.elem.zero), false + } + key := (*stringStruct)(unsafe.Pointer(&ky)) + if h.B == 0 { + // One-bucket table. 
+ b := (*bmap)(h.buckets) + if key.len < 32 { + // short key, doing lots of comparisons is ok + for i := uintptr(0); i < bucketCnt; i++ { + x := *((*uint8)(add(unsafe.Pointer(b), i))) // b.topbits[i] without the bounds check + if x == empty { + continue + } + k := (*stringStruct)(add(unsafe.Pointer(b), dataOffset+i*2*ptrSize)) + if k.len != key.len { + continue + } + if k.str == key.str || memeq(k.str, key.str, uintptr(key.len)) { + return add(unsafe.Pointer(b), dataOffset+bucketCnt*2*ptrSize+i*uintptr(t.valuesize)), true + } + } + return unsafe.Pointer(t.elem.zero), false + } + // long key, try not to do more comparisons than necessary + keymaybe := uintptr(bucketCnt) + for i := uintptr(0); i < bucketCnt; i++ { + x := *((*uint8)(add(unsafe.Pointer(b), i))) // b.topbits[i] without the bounds check + if x == empty { + continue + } + k := (*stringStruct)(add(unsafe.Pointer(b), dataOffset+i*2*ptrSize)) + if k.len != key.len { + continue + } + if k.str == key.str { + return add(unsafe.Pointer(b), dataOffset+bucketCnt*2*ptrSize+i*uintptr(t.valuesize)), true + } + // check first 4 bytes + if *((*[4]byte)(key.str)) != *((*[4]byte)(k.str)) { + continue + } + // check last 4 bytes + if *((*[4]byte)(add(key.str, uintptr(key.len)-4))) != *((*[4]byte)(add(k.str, uintptr(key.len)-4))) { + continue + } + if keymaybe != bucketCnt { + // Two keys are potential matches. Use hash to distinguish them. + goto dohash + } + keymaybe = i + } + if keymaybe != bucketCnt { + k := (*stringStruct)(add(unsafe.Pointer(b), dataOffset+keymaybe*2*ptrSize)) + if memeq(k.str, key.str, uintptr(key.len)) { + return add(unsafe.Pointer(b), dataOffset+bucketCnt*2*ptrSize+keymaybe*uintptr(t.valuesize)), true + } + } + return unsafe.Pointer(t.elem.zero), false + } +dohash: + hash := goalg(t.key.alg).hash(noescape(unsafe.Pointer(&ky)), 2*ptrSize, uintptr(h.hash0)) + m := uintptr(1)<<h.B - 1 + b := (*bmap)(add(h.buckets, (hash&m)*uintptr(t.bucketsize))) + if c := h.oldbuckets; c != nil { + oldb := (*bmap)(add(c, (hash&(m>>1))*uintptr(t.bucketsize))) + if !evacuated(oldb) { + b = oldb + } + } + top := uint8(hash >> (ptrSize*8 - 8)) + if top < minTopHash { + top += minTopHash + } + for { + for i := uintptr(0); i < bucketCnt; i++ { + x := *((*uint8)(add(unsafe.Pointer(b), i))) // b.topbits[i] without the bounds check + if x != top { + continue + } + k := (*stringStruct)(add(unsafe.Pointer(b), dataOffset+i*2*ptrSize)) + if k.len != key.len { + continue + } + if k.str == key.str || memeq(k.str, key.str, uintptr(key.len)) { + return add(unsafe.Pointer(b), dataOffset+bucketCnt*2*ptrSize+i*uintptr(t.valuesize)), true + } + } + b = b.overflow + if b == nil { + return unsafe.Pointer(t.elem.zero), false + } + } +} diff --git a/src/runtime/heapdump.c b/src/runtime/heapdump.c new file mode 100644 index 000000000..7eba8c005 --- /dev/null +++ b/src/runtime/heapdump.c @@ -0,0 +1,864 @@ +// Copyright 2014 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// Implementation of runtime/debug.WriteHeapDump. Writes all +// objects in the heap plus additional info (roots, threads, +// finalizers, etc.) to a file. + +// The format of the dumped file is described at +// http://golang.org/s/go14heapdump. 
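That format is a fixed text header followed by varint-tagged records (the Tag* constants and dump* helpers below). As a minimal sketch only — the file name heap.dump is illustrative, and just the leading TagParams record written first by dumpparams is decoded — a reader for the start of such a dump could look like this:

package main

import (
	"bufio"
	"encoding/binary"
	"fmt"
	"io"
	"log"
	"os"
)

func main() {
	f, err := os.Open("heap.dump") // written by runtime/debug.WriteHeapDump; name is illustrative
	if err != nil {
		log.Fatal(err)
	}
	defer f.Close()
	r := bufio.NewReader(f)

	// mdump writes this fixed header before any records.
	hdr := make([]byte, len("go1.4 heap dump\n"))
	if _, err := io.ReadFull(r, hdr); err != nil || string(hdr) != "go1.4 heap dump\n" {
		log.Fatal("not a go1.4 heap dump")
	}

	// Every record begins with a varint tag; dumpparams emits TagParams (6) first.
	tag, err := binary.ReadUvarint(r)
	if err != nil || tag != 6 {
		log.Fatalf("unexpected first record tag %d (err %v)", tag, err)
	}
	bigEndian, _ := binary.ReadUvarint(r) // dumpbool: 1 means big-endian pointers
	ptrSize, _ := binary.ReadUvarint(r)
	arenaStart, _ := binary.ReadUvarint(r)
	arenaUsed, _ := binary.ReadUvarint(r)
	fmt.Printf("big-endian=%v ptrsize=%d arena=[%#x,%#x)\n",
		bigEndian == 1, ptrSize, arenaStart, arenaUsed)
}

Later records carry tag-specific fields, so a complete reader needs the per-tag layout from the format document referenced above.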
+ +#include "runtime.h" +#include "arch_GOARCH.h" +#include "malloc.h" +#include "mgc0.h" +#include "type.h" +#include "typekind.h" +#include "funcdata.h" +#include "zaexperiment.h" +#include "textflag.h" + +extern byte runtime·data[]; +extern byte runtime·edata[]; +extern byte runtime·bss[]; +extern byte runtime·ebss[]; + +enum { + FieldKindEol = 0, + FieldKindPtr = 1, + FieldKindIface = 2, + FieldKindEface = 3, + + TagEOF = 0, + TagObject = 1, + TagOtherRoot = 2, + TagType = 3, + TagGoRoutine = 4, + TagStackFrame = 5, + TagParams = 6, + TagFinalizer = 7, + TagItab = 8, + TagOSThread = 9, + TagMemStats = 10, + TagQueuedFinalizer = 11, + TagData = 12, + TagBss = 13, + TagDefer = 14, + TagPanic = 15, + TagMemProf = 16, + TagAllocSample = 17, +}; + +static uintptr* playgcprog(uintptr offset, uintptr *prog, void (*callback)(void*,uintptr,uintptr), void *arg); +static void dumpfields(BitVector bv); +static void dumpbvtypes(BitVector *bv, byte *base); +static BitVector makeheapobjbv(byte *p, uintptr size); + +// fd to write the dump to. +static uintptr dumpfd; + +#pragma dataflag NOPTR /* tmpbuf not a heap pointer at least */ +static byte *tmpbuf; +static uintptr tmpbufsize; + +// buffer of pending write data +enum { + BufSize = 4096, +}; +#pragma dataflag NOPTR +static byte buf[BufSize]; +static uintptr nbuf; + +static void +write(byte *data, uintptr len) +{ + if(len + nbuf <= BufSize) { + runtime·memmove(buf + nbuf, data, len); + nbuf += len; + return; + } + runtime·write(dumpfd, buf, nbuf); + if(len >= BufSize) { + runtime·write(dumpfd, data, len); + nbuf = 0; + } else { + runtime·memmove(buf, data, len); + nbuf = len; + } +} + +static void +flush(void) +{ + runtime·write(dumpfd, buf, nbuf); + nbuf = 0; +} + +// Cache of types that have been serialized already. +// We use a type's hash field to pick a bucket. +// Inside a bucket, we keep a list of types that +// have been serialized so far, most recently used first. +// Note: when a bucket overflows we may end up +// serializing a type more than once. That's ok. +enum { + TypeCacheBuckets = 256, // must be a power of 2 + TypeCacheAssoc = 4, +}; +typedef struct TypeCacheBucket TypeCacheBucket; +struct TypeCacheBucket { + Type *t[TypeCacheAssoc]; +}; +#pragma dataflag NOPTR /* only initialized and used while world is stopped */ +static TypeCacheBucket typecache[TypeCacheBuckets]; + +// dump a uint64 in a varint format parseable by encoding/binary +static void +dumpint(uint64 v) +{ + byte buf[10]; + int32 n; + n = 0; + while(v >= 0x80) { + buf[n++] = v | 0x80; + v >>= 7; + } + buf[n++] = v; + write(buf, n); +} + +static void +dumpbool(bool b) +{ + dumpint(b ? 1 : 0); +} + +// dump varint uint64 length followed by memory contents +static void +dumpmemrange(byte *data, uintptr len) +{ + dumpint(len); + write(data, len); +} + +static void +dumpstr(String s) +{ + dumpmemrange(s.str, s.len); +} + +static void +dumpcstr(int8 *c) +{ + dumpmemrange((byte*)c, runtime·findnull((byte*)c)); +} + +// dump information for a type +static void +dumptype(Type *t) +{ + TypeCacheBucket *b; + int32 i, j; + + if(t == nil) { + return; + } + + // If we've definitely serialized the type before, + // no need to do it again. + b = &typecache[t->hash & (TypeCacheBuckets-1)]; + if(t == b->t[0]) return; + for(i = 1; i < TypeCacheAssoc; i++) { + if(t == b->t[i]) { + // Move-to-front + for(j = i; j > 0; j--) { + b->t[j] = b->t[j-1]; + } + b->t[0] = t; + return; + } + } + // Might not have been dumped yet. Dump it and + // remember we did so. 
+ for(j = TypeCacheAssoc-1; j > 0; j--) { + b->t[j] = b->t[j-1]; + } + b->t[0] = t; + + // dump the type + dumpint(TagType); + dumpint((uintptr)t); + dumpint(t->size); + if(t->x == nil || t->x->pkgPath == nil || t->x->name == nil) { + dumpstr(*t->string); + } else { + dumpint(t->x->pkgPath->len + 1 + t->x->name->len); + write(t->x->pkgPath->str, t->x->pkgPath->len); + write((byte*)".", 1); + write(t->x->name->str, t->x->name->len); + } + dumpbool((t->kind & KindDirectIface) == 0 || (t->kind & KindNoPointers) == 0); +} + +// dump an object +static void +dumpobj(byte *obj, uintptr size, BitVector bv) +{ + dumpbvtypes(&bv, obj); + dumpint(TagObject); + dumpint((uintptr)obj); + dumpmemrange(obj, size); + dumpfields(bv); +} + +static void +dumpotherroot(int8 *description, byte *to) +{ + dumpint(TagOtherRoot); + dumpcstr(description); + dumpint((uintptr)to); +} + +static void +dumpfinalizer(byte *obj, FuncVal *fn, Type* fint, PtrType *ot) +{ + dumpint(TagFinalizer); + dumpint((uintptr)obj); + dumpint((uintptr)fn); + dumpint((uintptr)fn->fn); + dumpint((uintptr)fint); + dumpint((uintptr)ot); +} + +typedef struct ChildInfo ChildInfo; +struct ChildInfo { + // Information passed up from the callee frame about + // the layout of the outargs region. + uintptr argoff; // where the arguments start in the frame + uintptr arglen; // size of args region + BitVector args; // if args.n >= 0, pointer map of args region + + byte *sp; // callee sp + uintptr depth; // depth in call stack (0 == most recent) +}; + +// dump kinds & offsets of interesting fields in bv +static void +dumpbv(BitVector *bv, uintptr offset) +{ + uintptr i; + + for(i = 0; i < bv->n; i += BitsPerPointer) { + switch(bv->bytedata[i/8] >> i%8 & 3) { + case BitsDead: + // BitsDead has already been processed in makeheapobjbv. + // We should only see it in stack maps, in which case we should continue processing. + break; + case BitsScalar: + break; + case BitsPointer: + dumpint(FieldKindPtr); + dumpint(offset + i / BitsPerPointer * PtrSize); + break; + case BitsMultiWord: + switch(bv->bytedata[(i+BitsPerPointer)/8] >> (i+BitsPerPointer)%8 & 3) { + default: + runtime·throw("unexpected garbage collection bits"); + case BitsIface: + dumpint(FieldKindIface); + dumpint(offset + i / BitsPerPointer * PtrSize); + i += BitsPerPointer; + break; + case BitsEface: + dumpint(FieldKindEface); + dumpint(offset + i / BitsPerPointer * PtrSize); + i += BitsPerPointer; + break; + } + } + } +} + +static bool +dumpframe(Stkframe *s, void *arg) +{ + Func *f; + ChildInfo *child; + uintptr pc, off, size; + int32 pcdata; + StackMap *stackmap; + int8 *name; + BitVector bv; + + child = (ChildInfo*)arg; + f = s->fn; + + // Figure out what we can about our stack map + pc = s->pc; + if(pc != f->entry) + pc--; + pcdata = runtime·pcdatavalue(f, PCDATA_StackMapIndex, pc); + if(pcdata == -1) { + // We do not have a valid pcdata value but there might be a + // stackmap for this function. It is likely that we are looking + // at the function prologue, assume so and hope for the best. + pcdata = 0; + } + stackmap = runtime·funcdata(f, FUNCDATA_LocalsPointerMaps); + + // Dump any types we will need to resolve Efaces. + if(child->args.n >= 0) + dumpbvtypes(&child->args, (byte*)s->sp + child->argoff); + if(stackmap != nil && stackmap->n > 0) { + bv = runtime·stackmapdata(stackmap, pcdata); + dumpbvtypes(&bv, (byte*)(s->varp - bv.n / BitsPerPointer * PtrSize)); + } else { + bv.n = -1; + } + + // Dump main body of stack frame. 
+ dumpint(TagStackFrame); + dumpint(s->sp); // lowest address in frame + dumpint(child->depth); // # of frames deep on the stack + dumpint((uintptr)child->sp); // sp of child, or 0 if bottom of stack + dumpmemrange((byte*)s->sp, s->fp - s->sp); // frame contents + dumpint(f->entry); + dumpint(s->pc); + dumpint(s->continpc); + name = runtime·funcname(f); + if(name == nil) + name = "unknown function"; + dumpcstr(name); + + // Dump fields in the outargs section + if(child->args.n >= 0) { + dumpbv(&child->args, child->argoff); + } else { + // conservative - everything might be a pointer + for(off = child->argoff; off < child->argoff + child->arglen; off += PtrSize) { + dumpint(FieldKindPtr); + dumpint(off); + } + } + + // Dump fields in the local vars section + if(stackmap == nil) { + // No locals information, dump everything. + for(off = child->arglen; off < s->varp - s->sp; off += PtrSize) { + dumpint(FieldKindPtr); + dumpint(off); + } + } else if(stackmap->n < 0) { + // Locals size information, dump just the locals. + size = -stackmap->n; + for(off = s->varp - size - s->sp; off < s->varp - s->sp; off += PtrSize) { + dumpint(FieldKindPtr); + dumpint(off); + } + } else if(stackmap->n > 0) { + // Locals bitmap information, scan just the pointers in + // locals. + dumpbv(&bv, s->varp - bv.n / BitsPerPointer * PtrSize - s->sp); + } + dumpint(FieldKindEol); + + // Record arg info for parent. + child->argoff = s->argp - s->fp; + child->arglen = s->arglen; + child->sp = (byte*)s->sp; + child->depth++; + stackmap = runtime·funcdata(f, FUNCDATA_ArgsPointerMaps); + if(stackmap != nil) + child->args = runtime·stackmapdata(stackmap, pcdata); + else + child->args.n = -1; + return true; +} + +static void +dumpgoroutine(G *gp) +{ + uintptr sp, pc, lr; + ChildInfo child; + Defer *d; + Panic *p; + bool (*fn)(Stkframe*, void*); + + if(gp->syscallsp != (uintptr)nil) { + sp = gp->syscallsp; + pc = gp->syscallpc; + lr = 0; + } else { + sp = gp->sched.sp; + pc = gp->sched.pc; + lr = gp->sched.lr; + } + + dumpint(TagGoRoutine); + dumpint((uintptr)gp); + dumpint((uintptr)sp); + dumpint(gp->goid); + dumpint(gp->gopc); + dumpint(runtime·readgstatus(gp)); + dumpbool(gp->issystem); + dumpbool(false); // isbackground + dumpint(gp->waitsince); + dumpstr(gp->waitreason); + dumpint((uintptr)gp->sched.ctxt); + dumpint((uintptr)gp->m); + dumpint((uintptr)gp->defer); + dumpint((uintptr)gp->panic); + + // dump stack + child.args.n = -1; + child.arglen = 0; + child.sp = nil; + child.depth = 0; + fn = dumpframe; + runtime·gentraceback(pc, sp, lr, gp, 0, nil, 0x7fffffff, &fn, &child, 0); + + // dump defer & panic records + for(d = gp->defer; d != nil; d = d->link) { + dumpint(TagDefer); + dumpint((uintptr)d); + dumpint((uintptr)gp); + dumpint((uintptr)d->argp); + dumpint((uintptr)d->pc); + dumpint((uintptr)d->fn); + dumpint((uintptr)d->fn->fn); + dumpint((uintptr)d->link); + } + for (p = gp->panic; p != nil; p = p->link) { + dumpint(TagPanic); + dumpint((uintptr)p); + dumpint((uintptr)gp); + dumpint((uintptr)p->arg.type); + dumpint((uintptr)p->arg.data); + dumpint(0); // was p->defer, no longer recorded + dumpint((uintptr)p->link); + } +} + +static void +dumpgs(void) +{ + G *gp; + uint32 i; + uint32 status; + + // goroutines & stacks + for(i = 0; i < runtime·allglen; i++) { + gp = runtime·allg[i]; + status = runtime·readgstatus(gp); // The world is stopped so gp will not be in a scan state. 
+ switch(status){ + default: + runtime·printf("runtime: unexpected G.status %d\n", status); + runtime·throw("dumpgs in STW - bad status"); + case Gdead: + break; + case Grunnable: + case Gsyscall: + case Gwaiting: + dumpgoroutine(gp); + break; + } + } +} + +static void +finq_callback(FuncVal *fn, byte *obj, uintptr nret, Type *fint, PtrType *ot) +{ + dumpint(TagQueuedFinalizer); + dumpint((uintptr)obj); + dumpint((uintptr)fn); + dumpint((uintptr)fn->fn); + dumpint((uintptr)fint); + dumpint((uintptr)ot); + USED(&nret); +} + + +static void +dumproots(void) +{ + MSpan *s, **allspans; + uint32 spanidx; + Special *sp; + SpecialFinalizer *spf; + byte *p; + + // data segment + dumpbvtypes(&runtime·gcdatamask, runtime·data); + dumpint(TagData); + dumpint((uintptr)runtime·data); + dumpmemrange(runtime·data, runtime·edata - runtime·data); + dumpfields(runtime·gcdatamask); + + // bss segment + dumpbvtypes(&runtime·gcbssmask, runtime·bss); + dumpint(TagBss); + dumpint((uintptr)runtime·bss); + dumpmemrange(runtime·bss, runtime·ebss - runtime·bss); + dumpfields(runtime·gcbssmask); + + // MSpan.types + allspans = runtime·mheap.allspans; + for(spanidx=0; spanidx<runtime·mheap.nspan; spanidx++) { + s = allspans[spanidx]; + if(s->state == MSpanInUse) { + // Finalizers + for(sp = s->specials; sp != nil; sp = sp->next) { + if(sp->kind != KindSpecialFinalizer) + continue; + spf = (SpecialFinalizer*)sp; + p = (byte*)((s->start << PageShift) + spf->special.offset); + dumpfinalizer(p, spf->fn, spf->fint, spf->ot); + } + } + } + + // Finalizer queue + runtime·iterate_finq(finq_callback); +} + +// Bit vector of free marks. +// Needs to be as big as the largest number of objects per span. +#pragma dataflag NOPTR +static byte free[PageSize/8]; + +static void +dumpobjs(void) +{ + uintptr i, j, size, n; + MSpan *s; + MLink *l; + byte *p; + + for(i = 0; i < runtime·mheap.nspan; i++) { + s = runtime·mheap.allspans[i]; + if(s->state != MSpanInUse) + continue; + p = (byte*)(s->start << PageShift); + size = s->elemsize; + n = (s->npages << PageShift) / size; + if(n > nelem(free)) + runtime·throw("free array doesn't have enough entries"); + for(l = s->freelist; l != nil; l = l->next) + free[((byte*)l - p) / size] = true; + for(j = 0; j < n; j++, p += size) { + if(free[j]) { + free[j] = false; + continue; + } + dumpobj(p, size, makeheapobjbv(p, size)); + } + } +} + +static void +dumpparams(void) +{ + byte *x; + + dumpint(TagParams); + x = (byte*)1; + if(*(byte*)&x == 1) + dumpbool(false); // little-endian ptrs + else + dumpbool(true); // big-endian ptrs + dumpint(PtrSize); + dumpint((uintptr)runtime·mheap.arena_start); + dumpint((uintptr)runtime·mheap.arena_used); + dumpint(thechar); + dumpcstr(GOEXPERIMENT); + dumpint(runtime·ncpu); +} + +static void +itab_callback(Itab *tab) +{ + Type *t; + + t = tab->type; + // Dump a map from itab* to the type of its data field. + // We want this map so we can deduce types of interface referents. + if((t->kind & KindDirectIface) == 0) { + // indirect - data slot is a pointer to t. + dumptype(t->ptrto); + dumpint(TagItab); + dumpint((uintptr)tab); + dumpint((uintptr)t->ptrto); + } else if((t->kind & KindNoPointers) == 0) { + // t is pointer-like - data slot is a t. + dumptype(t); + dumpint(TagItab); + dumpint((uintptr)tab); + dumpint((uintptr)t); + } else { + // Data slot is a scalar. Dump type just for fun. + // With pointer-only interfaces, this shouldn't happen. 
+ dumptype(t); + dumpint(TagItab); + dumpint((uintptr)tab); + dumpint((uintptr)t); + } +} + +static void +dumpitabs(void) +{ + void (*fn)(Itab*); + + fn = itab_callback; + runtime·iterate_itabs(&fn); +} + +static void +dumpms(void) +{ + M *mp; + + for(mp = runtime·allm; mp != nil; mp = mp->alllink) { + dumpint(TagOSThread); + dumpint((uintptr)mp); + dumpint(mp->id); + dumpint(mp->procid); + } +} + +static void +dumpmemstats(void) +{ + int32 i; + + dumpint(TagMemStats); + dumpint(mstats.alloc); + dumpint(mstats.total_alloc); + dumpint(mstats.sys); + dumpint(mstats.nlookup); + dumpint(mstats.nmalloc); + dumpint(mstats.nfree); + dumpint(mstats.heap_alloc); + dumpint(mstats.heap_sys); + dumpint(mstats.heap_idle); + dumpint(mstats.heap_inuse); + dumpint(mstats.heap_released); + dumpint(mstats.heap_objects); + dumpint(mstats.stacks_inuse); + dumpint(mstats.stacks_sys); + dumpint(mstats.mspan_inuse); + dumpint(mstats.mspan_sys); + dumpint(mstats.mcache_inuse); + dumpint(mstats.mcache_sys); + dumpint(mstats.buckhash_sys); + dumpint(mstats.gc_sys); + dumpint(mstats.other_sys); + dumpint(mstats.next_gc); + dumpint(mstats.last_gc); + dumpint(mstats.pause_total_ns); + for(i = 0; i < 256; i++) + dumpint(mstats.pause_ns[i]); + dumpint(mstats.numgc); +} + +static void +dumpmemprof_callback(Bucket *b, uintptr nstk, uintptr *stk, uintptr size, uintptr allocs, uintptr frees) +{ + uintptr i, pc; + Func *f; + byte buf[20]; + String file; + int32 line; + + dumpint(TagMemProf); + dumpint((uintptr)b); + dumpint(size); + dumpint(nstk); + for(i = 0; i < nstk; i++) { + pc = stk[i]; + f = runtime·findfunc(pc); + if(f == nil) { + runtime·snprintf(buf, sizeof(buf), "%X", (uint64)pc); + dumpcstr((int8*)buf); + dumpcstr("?"); + dumpint(0); + } else { + dumpcstr(runtime·funcname(f)); + // TODO: Why do we need to back up to a call instruction here? + // Maybe profiler should do this. 
+ if(i > 0 && pc > f->entry) { + if(thechar == '6' || thechar == '8') + pc--; + else + pc -= 4; // arm, etc + } + line = runtime·funcline(f, pc, &file); + dumpstr(file); + dumpint(line); + } + } + dumpint(allocs); + dumpint(frees); +} + +static void +dumpmemprof(void) +{ + MSpan *s, **allspans; + uint32 spanidx; + Special *sp; + SpecialProfile *spp; + byte *p; + void (*fn)(Bucket*, uintptr, uintptr*, uintptr, uintptr, uintptr); + + fn = dumpmemprof_callback; + runtime·iterate_memprof(&fn); + + allspans = runtime·mheap.allspans; + for(spanidx=0; spanidx<runtime·mheap.nspan; spanidx++) { + s = allspans[spanidx]; + if(s->state != MSpanInUse) + continue; + for(sp = s->specials; sp != nil; sp = sp->next) { + if(sp->kind != KindSpecialProfile) + continue; + spp = (SpecialProfile*)sp; + p = (byte*)((s->start << PageShift) + spp->special.offset); + dumpint(TagAllocSample); + dumpint((uintptr)p); + dumpint((uintptr)spp->b); + } + } +} + +static void +mdump(void) +{ + byte *hdr; + uintptr i; + MSpan *s; + + // make sure we're done sweeping + for(i = 0; i < runtime·mheap.nspan; i++) { + s = runtime·mheap.allspans[i]; + if(s->state == MSpanInUse) + runtime·MSpan_EnsureSwept(s); + } + + runtime·memclr((byte*)&typecache[0], sizeof(typecache)); + hdr = (byte*)"go1.4 heap dump\n"; + write(hdr, runtime·findnull(hdr)); + dumpparams(); + dumpitabs(); + dumpobjs(); + dumpgs(); + dumpms(); + dumproots(); + dumpmemstats(); + dumpmemprof(); + dumpint(TagEOF); + flush(); +} + +void +runtime·writeheapdump_m(void) +{ + uintptr fd; + + fd = g->m->scalararg[0]; + g->m->scalararg[0] = 0; + + runtime·casgstatus(g->m->curg, Grunning, Gwaiting); + g->waitreason = runtime·gostringnocopy((byte*)"dumping heap"); + + // Update stats so we can dump them. + // As a side effect, flushes all the MCaches so the MSpan.freelist + // lists contain all the free objects. + runtime·updatememstats(nil); + + // Set dump file. + dumpfd = fd; + + // Call dump routine. + mdump(); + + // Reset dump file. + dumpfd = 0; + if(tmpbuf != nil) { + runtime·SysFree(tmpbuf, tmpbufsize, &mstats.other_sys); + tmpbuf = nil; + tmpbufsize = 0; + } + + runtime·casgstatus(g->m->curg, Gwaiting, Grunning); +} + +// dumpint() the kind & offset of each field in an object. +static void +dumpfields(BitVector bv) +{ + dumpbv(&bv, 0); + dumpint(FieldKindEol); +} + +// The heap dump reader needs to be able to disambiguate +// Eface entries. So it needs to know every type that might +// appear in such an entry. The following routine accomplishes that. + +// Dump all the types that appear in the type field of +// any Eface described by this bit vector. +static void +dumpbvtypes(BitVector *bv, byte *base) +{ + uintptr i; + + for(i = 0; i < bv->n; i += BitsPerPointer) { + if((bv->bytedata[i/8] >> i%8 & 3) != BitsMultiWord) + continue; + switch(bv->bytedata[(i+BitsPerPointer)/8] >> (i+BitsPerPointer)%8 & 3) { + default: + runtime·throw("unexpected garbage collection bits"); + case BitsIface: + i += BitsPerPointer; + break; + case BitsEface: + dumptype(*(Type**)(base + i / BitsPerPointer * PtrSize)); + i += BitsPerPointer; + break; + } + } +} + +static BitVector +makeheapobjbv(byte *p, uintptr size) +{ + uintptr off, nptr, i; + byte shift, *bitp, bits; + bool mw; + + // Extend the temp buffer if necessary. 
+ nptr = size/PtrSize; + if(tmpbufsize < nptr*BitsPerPointer/8+1) { + if(tmpbuf != nil) + runtime·SysFree(tmpbuf, tmpbufsize, &mstats.other_sys); + tmpbufsize = nptr*BitsPerPointer/8+1; + tmpbuf = runtime·sysAlloc(tmpbufsize, &mstats.other_sys); + if(tmpbuf == nil) + runtime·throw("heapdump: out of memory"); + } + + // Copy and compact the bitmap. + mw = false; + for(i = 0; i < nptr; i++) { + off = (uintptr*)(p + i*PtrSize) - (uintptr*)runtime·mheap.arena_start; + bitp = runtime·mheap.arena_start - off/wordsPerBitmapByte - 1; + shift = (off % wordsPerBitmapByte) * gcBits; + bits = (*bitp >> (shift + 2)) & BitsMask; + if(!mw && bits == BitsDead) + break; // end of heap object + mw = !mw && bits == BitsMultiWord; + tmpbuf[i*BitsPerPointer/8] &= ~(BitsMask<<((i*BitsPerPointer)%8)); + tmpbuf[i*BitsPerPointer/8] |= bits<<((i*BitsPerPointer)%8); + } + return (BitVector){i*BitsPerPointer, tmpbuf}; +} diff --git a/src/runtime/iface.go b/src/runtime/iface.go new file mode 100644 index 000000000..f60b6a79c --- /dev/null +++ b/src/runtime/iface.go @@ -0,0 +1,439 @@ +// Copyright 2014 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package runtime + +import ( + "unsafe" +) + +const ( + hashSize = 1009 +) + +var ( + ifaceLock mutex // lock for accessing hash + hash [hashSize]*itab +) + +// fInterface is our standard non-empty interface. We use it instead +// of interface{f()} in function prototypes because gofmt insists on +// putting lots of newlines in the otherwise concise interface{f()}. +type fInterface interface { + f() +} + +func getitab(inter *interfacetype, typ *_type, canfail bool) *itab { + if len(inter.mhdr) == 0 { + gothrow("internal error - misuse of itab") + } + + // easy case + x := typ.x + if x == nil { + if canfail { + return nil + } + i := (*imethod)(add(unsafe.Pointer(inter), unsafe.Sizeof(interfacetype{}))) + panic(&TypeAssertionError{"", *typ._string, *inter.typ._string, *i.name}) + } + + // compiler has provided some good hash codes for us. + h := inter.typ.hash + h += 17 * typ.hash + // TODO(rsc): h += 23 * x.mhash ? + h %= hashSize + + // look twice - once without lock, once with. + // common case will be no lock contention. + var m *itab + var locked int + for locked = 0; locked < 2; locked++ { + if locked != 0 { + lock(&ifaceLock) + } + for m = (*itab)(atomicloadp(unsafe.Pointer(&hash[h]))); m != nil; m = m.link { + if m.inter == inter && m._type == typ { + if m.bad != 0 { + m = nil + if !canfail { + // this can only happen if the conversion + // was already done once using the , ok form + // and we have a cached negative result. + // the cached result doesn't record which + // interface function was missing, so jump + // down to the interface check, which will + // do more work but give a better error. + goto search + } + } + if locked != 0 { + unlock(&ifaceLock) + } + return m + } + } + } + + m = (*itab)(persistentalloc(unsafe.Sizeof(itab{})+uintptr(len(inter.mhdr))*ptrSize, 0, &memstats.other_sys)) + m.inter = inter + m._type = typ + +search: + // both inter and typ have method sorted by name, + // and interface names are unique, + // so can iterate over both in lock step; + // the loop is O(ni+nt) not O(ni*nt). 
+ ni := len(inter.mhdr) + nt := len(x.mhdr) + j := 0 + for k := 0; k < ni; k++ { + i := (*imethod)(add(unsafe.Pointer(inter), unsafe.Sizeof(interfacetype{})+uintptr(k)*unsafe.Sizeof(imethod{}))) + iname := i.name + ipkgpath := i.pkgpath + itype := i._type + for ; j < nt; j++ { + t := (*method)(add(unsafe.Pointer(x), unsafe.Sizeof(uncommontype{})+uintptr(j)*unsafe.Sizeof(method{}))) + if t.mtyp == itype && t.name == iname && t.pkgpath == ipkgpath { + if m != nil { + *(*unsafe.Pointer)(add(unsafe.Pointer(m), unsafe.Sizeof(itab{})+uintptr(k)*ptrSize)) = t.ifn + } + goto nextimethod + } + } + // didn't find method + if !canfail { + if locked != 0 { + unlock(&ifaceLock) + } + panic(&TypeAssertionError{"", *typ._string, *inter.typ._string, *iname}) + } + m.bad = 1 + break + nextimethod: + } + if locked == 0 { + gothrow("invalid itab locking") + } + m.link = hash[h] + atomicstorep(unsafe.Pointer(&hash[h]), unsafe.Pointer(m)) + unlock(&ifaceLock) + if m.bad != 0 { + return nil + } + return m +} + +func typ2Itab(t *_type, inter *interfacetype, cache **itab) *itab { + tab := getitab(inter, t, false) + atomicstorep(unsafe.Pointer(cache), unsafe.Pointer(tab)) + return tab +} + +func convT2E(t *_type, elem unsafe.Pointer) (e interface{}) { + size := uintptr(t.size) + ep := (*eface)(unsafe.Pointer(&e)) + if isDirectIface(t) { + ep._type = t + memmove(unsafe.Pointer(&ep.data), elem, size) + } else { + x := newobject(t) + // TODO: We allocate a zeroed object only to overwrite it with + // actual data. Figure out how to avoid zeroing. Also below in convT2I. + memmove(x, elem, size) + ep._type = t + ep.data = x + } + return +} + +func convT2I(t *_type, inter *interfacetype, cache **itab, elem unsafe.Pointer) (i fInterface) { + tab := (*itab)(atomicloadp(unsafe.Pointer(cache))) + if tab == nil { + tab = getitab(inter, t, false) + atomicstorep(unsafe.Pointer(cache), unsafe.Pointer(tab)) + } + size := uintptr(t.size) + pi := (*iface)(unsafe.Pointer(&i)) + if isDirectIface(t) { + pi.tab = tab + memmove(unsafe.Pointer(&pi.data), elem, size) + } else { + x := newobject(t) + memmove(x, elem, size) + pi.tab = tab + pi.data = x + } + return +} + +// TODO: give these routines a pointer to the result area instead of writing +// extra data in the outargs section. Then we can get rid of go:nosplit. 
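As a standalone sketch of the O(ni+nt) method-matching loop in getitab above — plain sorted name lists stand in for the imethod/method tables, and the real loop also compares package paths and method types — the lock-step scan looks like this:

package main

import "fmt"

// satisfies reports whether every method name in want appears in have.
// Both slices must be sorted by name; the index j only moves forward, so
// the whole check is O(len(want)+len(have)) rather than O(len(want)*len(have)).
func satisfies(want, have []string) bool {
	j := 0
	for _, name := range want {
		for j < len(have) && have[j] < name {
			j++
		}
		if j == len(have) || have[j] != name {
			return false // missing method; getitab marks the itab bad or panics
		}
	}
	return true
}

func main() {
	iface := []string{"Read", "Write"}
	typ := []string{"Close", "Read", "Seek", "Write"}
	fmt.Println(satisfies(iface, typ)) // true
}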
+//go:nosplit +func assertI2T(t *_type, i fInterface) (r struct{}) { + ip := (*iface)(unsafe.Pointer(&i)) + tab := ip.tab + if tab == nil { + panic(&TypeAssertionError{"", "", *t._string, ""}) + } + if tab._type != t { + panic(&TypeAssertionError{*tab.inter.typ._string, *tab._type._string, *t._string, ""}) + } + size := uintptr(t.size) + if isDirectIface(t) { + memmove(unsafe.Pointer(&r), unsafe.Pointer(&ip.data), size) + } else { + memmove(unsafe.Pointer(&r), ip.data, size) + } + return +} + +//go:nosplit +func assertI2T2(t *_type, i fInterface) (r byte) { + ip := (*iface)(unsafe.Pointer(&i)) + size := uintptr(t.size) + ok := (*bool)(add(unsafe.Pointer(&r), size)) + tab := ip.tab + if tab == nil || tab._type != t { + *ok = false + memclr(unsafe.Pointer(&r), size) + return + } + *ok = true + if isDirectIface(t) { + memmove(unsafe.Pointer(&r), unsafe.Pointer(&ip.data), size) + } else { + memmove(unsafe.Pointer(&r), ip.data, size) + } + return +} + +func assertI2TOK(t *_type, i fInterface) bool { + ip := (*iface)(unsafe.Pointer(&i)) + tab := ip.tab + return tab != nil && tab._type == t +} + +//go:nosplit +func assertE2T(t *_type, e interface{}) (r struct{}) { + ep := (*eface)(unsafe.Pointer(&e)) + if ep._type == nil { + panic(&TypeAssertionError{"", "", *t._string, ""}) + } + if ep._type != t { + panic(&TypeAssertionError{"", *ep._type._string, *t._string, ""}) + } + size := uintptr(t.size) + if isDirectIface(t) { + memmove(unsafe.Pointer(&r), unsafe.Pointer(&ep.data), size) + } else { + memmove(unsafe.Pointer(&r), ep.data, size) + } + return +} + +//go:nosplit +func assertE2T2(t *_type, e interface{}) (r byte) { + ep := (*eface)(unsafe.Pointer(&e)) + size := uintptr(t.size) + ok := (*bool)(add(unsafe.Pointer(&r), size)) + if ep._type != t { + *ok = false + memclr(unsafe.Pointer(&r), size) + return + } + *ok = true + if isDirectIface(t) { + memmove(unsafe.Pointer(&r), unsafe.Pointer(&ep.data), size) + } else { + memmove(unsafe.Pointer(&r), ep.data, size) + } + return +} + +func assertE2TOK(t *_type, e interface{}) bool { + ep := (*eface)(unsafe.Pointer(&e)) + return t == ep._type +} + +func convI2E(i fInterface) (r interface{}) { + ip := (*iface)(unsafe.Pointer(&i)) + tab := ip.tab + if tab == nil { + return + } + rp := (*eface)(unsafe.Pointer(&r)) + rp._type = tab._type + rp.data = ip.data + return +} + +func assertI2E(inter *interfacetype, i fInterface) (r interface{}) { + ip := (*iface)(unsafe.Pointer(&i)) + tab := ip.tab + if tab == nil { + // explicit conversions require non-nil interface value. + panic(&TypeAssertionError{"", "", *inter.typ._string, ""}) + } + rp := (*eface)(unsafe.Pointer(&r)) + rp._type = tab._type + rp.data = ip.data + return +} + +func assertI2E2(inter *interfacetype, i fInterface) (r interface{}, ok bool) { + ip := (*iface)(unsafe.Pointer(&i)) + tab := ip.tab + if tab == nil { + return + } + rp := (*eface)(unsafe.Pointer(&r)) + rp._type = tab._type + rp.data = ip.data + ok = true + return +} + +func convI2I(inter *interfacetype, i fInterface) (r fInterface) { + ip := (*iface)(unsafe.Pointer(&i)) + tab := ip.tab + if tab == nil { + return + } + rp := (*iface)(unsafe.Pointer(&r)) + if tab.inter == inter { + rp.tab = tab + rp.data = ip.data + return + } + rp.tab = getitab(inter, tab._type, false) + rp.data = ip.data + return +} + +func assertI2I(inter *interfacetype, i fInterface) (r fInterface) { + ip := (*iface)(unsafe.Pointer(&i)) + tab := ip.tab + if tab == nil { + // explicit conversions require non-nil interface value. 
+ panic(&TypeAssertionError{"", "", *inter.typ._string, ""}) + } + rp := (*iface)(unsafe.Pointer(&r)) + if tab.inter == inter { + rp.tab = tab + rp.data = ip.data + return + } + rp.tab = getitab(inter, tab._type, false) + rp.data = ip.data + return +} + +func assertI2I2(inter *interfacetype, i fInterface) (r fInterface, ok bool) { + ip := (*iface)(unsafe.Pointer(&i)) + tab := ip.tab + if tab == nil { + return + } + rp := (*iface)(unsafe.Pointer(&r)) + if tab.inter == inter { + rp.tab = tab + rp.data = ip.data + ok = true + return + } + tab = getitab(inter, tab._type, true) + if tab == nil { + rp.data = nil + rp.tab = nil + ok = false + return + } + rp.tab = tab + rp.data = ip.data + ok = true + return +} + +func assertE2I(inter *interfacetype, e interface{}) (r fInterface) { + ep := (*eface)(unsafe.Pointer(&e)) + t := ep._type + if t == nil { + // explicit conversions require non-nil interface value. + panic(&TypeAssertionError{"", "", *inter.typ._string, ""}) + } + rp := (*iface)(unsafe.Pointer(&r)) + rp.tab = getitab(inter, t, false) + rp.data = ep.data + return +} + +func assertE2I2(inter *interfacetype, e interface{}) (r fInterface, ok bool) { + ep := (*eface)(unsafe.Pointer(&e)) + t := ep._type + if t == nil { + return + } + tab := getitab(inter, t, true) + if tab == nil { + return + } + rp := (*iface)(unsafe.Pointer(&r)) + rp.tab = tab + rp.data = ep.data + ok = true + return +} + +func reflect_ifaceE2I(inter *interfacetype, e interface{}, dst *fInterface) { + *dst = assertE2I(inter, e) +} + +func assertE2E(inter *interfacetype, e interface{}) interface{} { + ep := (*eface)(unsafe.Pointer(&e)) + if ep._type == nil { + // explicit conversions require non-nil interface value. + panic(&TypeAssertionError{"", "", *inter.typ._string, ""}) + } + return e +} + +func assertE2E2(inter *interfacetype, e interface{}) (interface{}, bool) { + ep := (*eface)(unsafe.Pointer(&e)) + if ep._type == nil { + return nil, false + } + return e, true +} + +func ifacethash(i fInterface) uint32 { + ip := (*iface)(unsafe.Pointer(&i)) + tab := ip.tab + if tab == nil { + return 0 + } + return tab._type.hash +} + +func efacethash(e interface{}) uint32 { + ep := (*eface)(unsafe.Pointer(&e)) + t := ep._type + if t == nil { + return 0 + } + return t.hash +} + +func iterate_itabs(fn func(*itab)) { + for _, h := range &hash { + for ; h != nil; h = h.link { + fn(h) + } + } +} + +func ifaceE2I2(inter *interfacetype, e interface{}, r *fInterface) (ok bool) { + *r, ok = assertE2I2(inter, e) + return +} diff --git a/src/runtime/iface_test.go b/src/runtime/iface_test.go new file mode 100644 index 000000000..bca0ea0ee --- /dev/null +++ b/src/runtime/iface_test.go @@ -0,0 +1,138 @@ +// Copyright 2012 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. 
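The benchmarks in this file exercise the conversion and assertion paths defined above. A related measurement one can make outside the runtime — the package and identifiers here are hypothetical — uses testing.AllocsPerRun to observe the heap copy that convT2E performs for values that are not pointer-shaped:

package conv_test

import "testing"

type large [2]uintptr // same shape as TL below

var sink interface{} // package-level sink so the conversion is not optimized away

func TestConvT2EAllocs(t *testing.T) {
	var v large
	allocs := testing.AllocsPerRun(1000, func() {
		sink = v // non-pointer-shaped value: convT2E copies it into a fresh heap object
	})
	t.Logf("allocations per conversion: %v", allocs)
}

Run with, for example, go test -run TestConvT2EAllocs -v to see the reported figure.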
+ +package runtime_test + +import ( + "testing" +) + +type I1 interface { + Method1() +} + +type I2 interface { + Method1() + Method2() +} + +type TS uint16 +type TM uintptr +type TL [2]uintptr + +func (TS) Method1() {} +func (TS) Method2() {} +func (TM) Method1() {} +func (TM) Method2() {} +func (TL) Method1() {} +func (TL) Method2() {} + +var ( + e interface{} + e_ interface{} + i1 I1 + i2 I2 + ts TS + tm TM + tl TL +) + +func BenchmarkConvT2ESmall(b *testing.B) { + for i := 0; i < b.N; i++ { + e = ts + } +} + +func BenchmarkConvT2EUintptr(b *testing.B) { + for i := 0; i < b.N; i++ { + e = tm + } +} + +func BenchmarkConvT2ELarge(b *testing.B) { + for i := 0; i < b.N; i++ { + e = tl + } +} + +func BenchmarkConvT2ISmall(b *testing.B) { + for i := 0; i < b.N; i++ { + i1 = ts + } +} + +func BenchmarkConvT2IUintptr(b *testing.B) { + for i := 0; i < b.N; i++ { + i1 = tm + } +} + +func BenchmarkConvT2ILarge(b *testing.B) { + for i := 0; i < b.N; i++ { + i1 = tl + } +} + +func BenchmarkConvI2E(b *testing.B) { + i2 = tm + for i := 0; i < b.N; i++ { + e = i2 + } +} + +func BenchmarkConvI2I(b *testing.B) { + i2 = tm + for i := 0; i < b.N; i++ { + i1 = i2 + } +} + +func BenchmarkAssertE2T(b *testing.B) { + e = tm + for i := 0; i < b.N; i++ { + tm = e.(TM) + } +} + +func BenchmarkAssertE2TLarge(b *testing.B) { + e = tl + for i := 0; i < b.N; i++ { + tl = e.(TL) + } +} + +func BenchmarkAssertE2I(b *testing.B) { + e = tm + for i := 0; i < b.N; i++ { + i1 = e.(I1) + } +} + +func BenchmarkAssertI2T(b *testing.B) { + i1 = tm + for i := 0; i < b.N; i++ { + tm = i1.(TM) + } +} + +func BenchmarkAssertI2I(b *testing.B) { + i1 = tm + for i := 0; i < b.N; i++ { + i2 = i1.(I2) + } +} + +func BenchmarkAssertI2E(b *testing.B) { + i1 = tm + for i := 0; i < b.N; i++ { + e = i1.(interface{}) + } +} + +func BenchmarkAssertE2E(b *testing.B) { + e = tm + for i := 0; i < b.N; i++ { + e_ = e + } +} diff --git a/src/runtime/lfstack.c b/src/runtime/lfstack.c new file mode 100644 index 000000000..57e0af282 --- /dev/null +++ b/src/runtime/lfstack.c @@ -0,0 +1,87 @@ +// Copyright 2012 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// Lock-free stack. +// The following code runs only on g0 stack. + +#include "runtime.h" +#include "arch_GOARCH.h" + +#ifdef _64BIT +// Amd64 uses 48-bit virtual addresses, 47-th bit is used as kernel/user flag. +// So we use 17msb of pointers as ABA counter. +# define PTR_BITS 47 +#else +# define PTR_BITS 32 +#endif +#define PTR_MASK ((1ull<<PTR_BITS)-1) +#define CNT_MASK (0ull-1) + +#ifdef _64BIT +#ifdef GOOS_solaris +// SPARC64 and Solaris on AMD64 uses all 64 bits of virtual addresses. +// Use low-order three bits as ABA counter. 
+// http://docs.oracle.com/cd/E19120-01/open.solaris/816-5138/6mba6ua5p/index.html +#undef PTR_BITS +#undef CNT_MASK +#undef PTR_MASK +#define PTR_BITS 0 +#define CNT_MASK 7 +#define PTR_MASK ((0ull-1)<<3) +#endif +#endif + +void +runtime·lfstackpush(uint64 *head, LFNode *node) +{ + uint64 old, new; + + if((uintptr)node != ((uintptr)node&PTR_MASK)) { + runtime·printf("p=%p\n", node); + runtime·throw("runtime·lfstackpush: invalid pointer"); + } + + node->pushcnt++; + new = (uint64)(uintptr)node|(((uint64)node->pushcnt&CNT_MASK)<<PTR_BITS); + for(;;) { + old = runtime·atomicload64(head); + node->next = (LFNode*)(uintptr)(old&PTR_MASK); + if(runtime·cas64(head, old, new)) + break; + } +} + +LFNode* +runtime·lfstackpop(uint64 *head) +{ + LFNode *node, *node2; + uint64 old, new; + + for(;;) { + old = runtime·atomicload64(head); + if(old == 0) + return nil; + node = (LFNode*)(uintptr)(old&PTR_MASK); + node2 = runtime·atomicloadp(&node->next); + new = 0; + if(node2 != nil) + new = (uint64)(uintptr)node2|(((uint64)node2->pushcnt&CNT_MASK)<<PTR_BITS); + if(runtime·cas64(head, old, new)) + return node; + } +} + +void +runtime·lfstackpush_m(void) +{ + runtime·lfstackpush(g->m->ptrarg[0], g->m->ptrarg[1]); + g->m->ptrarg[0] = nil; + g->m->ptrarg[1] = nil; +} + +void +runtime·lfstackpop_m(void) +{ + g->m->ptrarg[0] = runtime·lfstackpop(g->m->ptrarg[0]); +} diff --git a/src/runtime/lfstack_test.go b/src/runtime/lfstack_test.go new file mode 100644 index 000000000..e51877704 --- /dev/null +++ b/src/runtime/lfstack_test.go @@ -0,0 +1,136 @@ +// Copyright 2012 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package runtime_test + +import ( + "math/rand" + . "runtime" + "testing" + "unsafe" +) + +type MyNode struct { + LFNode + data int +} + +func fromMyNode(node *MyNode) *LFNode { + return (*LFNode)(unsafe.Pointer(node)) +} + +func toMyNode(node *LFNode) *MyNode { + return (*MyNode)(unsafe.Pointer(node)) +} + +func TestLFStack(t *testing.T) { + stack := new(uint64) + // Need to keep additional referenfces to nodes, the stack is not all that type-safe. + var nodes []*MyNode + + // Check the stack is initially empty. + if LFStackPop(stack) != nil { + t.Fatalf("stack is not empty") + } + + // Push one element. + node := &MyNode{data: 42} + nodes = append(nodes, node) + LFStackPush(stack, fromMyNode(node)) + + // Push another. + node = &MyNode{data: 43} + nodes = append(nodes, node) + LFStackPush(stack, fromMyNode(node)) + + // Pop one element. + node = toMyNode(LFStackPop(stack)) + if node == nil { + t.Fatalf("stack is empty") + } + if node.data != 43 { + t.Fatalf("no lifo") + } + + // Pop another. + node = toMyNode(LFStackPop(stack)) + if node == nil { + t.Fatalf("stack is empty") + } + if node.data != 42 { + t.Fatalf("no lifo") + } + + // Check the stack is empty again. + if LFStackPop(stack) != nil { + t.Fatalf("stack is not empty") + } + if *stack != 0 { + t.Fatalf("stack is not empty") + } +} + +var stress []*MyNode + +func TestLFStackStress(t *testing.T) { + const K = 100 + P := 4 * GOMAXPROCS(-1) + N := 100000 + if testing.Short() { + N /= 10 + } + // Create 2 stacks. + stacks := [2]*uint64{new(uint64), new(uint64)} + // Need to keep additional references to nodes, + // the lock-free stack is not type-safe. + stress = nil + // Push K elements randomly onto the stacks. 
+ sum := 0 + for i := 0; i < K; i++ { + sum += i + node := &MyNode{data: i} + stress = append(stress, node) + LFStackPush(stacks[i%2], fromMyNode(node)) + } + c := make(chan bool, P) + for p := 0; p < P; p++ { + go func() { + r := rand.New(rand.NewSource(rand.Int63())) + // Pop a node from a random stack, then push it onto a random stack. + for i := 0; i < N; i++ { + node := toMyNode(LFStackPop(stacks[r.Intn(2)])) + if node != nil { + LFStackPush(stacks[r.Intn(2)], fromMyNode(node)) + } + } + c <- true + }() + } + for i := 0; i < P; i++ { + <-c + } + // Pop all elements from both stacks, and verify that nothing lost. + sum2 := 0 + cnt := 0 + for i := 0; i < 2; i++ { + for { + node := toMyNode(LFStackPop(stacks[i])) + if node == nil { + break + } + cnt++ + sum2 += node.data + node.Next = nil + } + } + if cnt != K { + t.Fatalf("Wrong number of nodes %d/%d", cnt, K) + } + if sum2 != sum { + t.Fatalf("Wrong sum %d/%d", sum2, sum) + } + + // Let nodes be collected now. + stress = nil +} diff --git a/src/runtime/lock_futex.go b/src/runtime/lock_futex.go new file mode 100644 index 000000000..725962341 --- /dev/null +++ b/src/runtime/lock_futex.go @@ -0,0 +1,205 @@ +// Copyright 2011 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// +build dragonfly freebsd linux + +package runtime + +import "unsafe" + +// This implementation depends on OS-specific implementations of +// +// runtime·futexsleep(uint32 *addr, uint32 val, int64 ns) +// Atomically, +// if(*addr == val) sleep +// Might be woken up spuriously; that's allowed. +// Don't sleep longer than ns; ns < 0 means forever. +// +// runtime·futexwakeup(uint32 *addr, uint32 cnt) +// If any procs are sleeping on addr, wake up at most cnt. + +const ( + mutex_unlocked = 0 + mutex_locked = 1 + mutex_sleeping = 2 + + active_spin = 4 + active_spin_cnt = 30 + passive_spin = 1 +) + +// Possible lock states are mutex_unlocked, mutex_locked and mutex_sleeping. +// mutex_sleeping means that there is presumably at least one sleeping thread. +// Note that there can be spinning threads during all states - they do not +// affect mutex's state. + +func futexsleep(addr *uint32, val uint32, ns int64) +func futexwakeup(addr *uint32, cnt uint32) + +// We use the uintptr mutex.key and note.key as a uint32. +func key32(p *uintptr) *uint32 { + return (*uint32)(unsafe.Pointer(p)) +} + +func lock(l *mutex) { + gp := getg() + + if gp.m.locks < 0 { + gothrow("runtime·lock: lock count") + } + gp.m.locks++ + + // Speculative grab for lock. + v := xchg(key32(&l.key), mutex_locked) + if v == mutex_unlocked { + return + } + + // wait is either MUTEX_LOCKED or MUTEX_SLEEPING + // depending on whether there is a thread sleeping + // on this mutex. If we ever change l->key from + // MUTEX_SLEEPING to some other value, we must be + // careful to change it back to MUTEX_SLEEPING before + // returning, to ensure that the sleeping thread gets + // its wakeup call. + wait := v + + // On uniprocessors, no point spinning. + // On multiprocessors, spin for ACTIVE_SPIN attempts. + spin := 0 + if ncpu > 1 { + spin = active_spin + } + for { + // Try for lock, spinning. + for i := 0; i < spin; i++ { + for l.key == mutex_unlocked { + if cas(key32(&l.key), mutex_unlocked, wait) { + return + } + } + procyield(active_spin_cnt) + } + + // Try for lock, rescheduling. 
+ for i := 0; i < passive_spin; i++ { + for l.key == mutex_unlocked { + if cas(key32(&l.key), mutex_unlocked, wait) { + return + } + } + osyield() + } + + // Sleep. + v = xchg(key32(&l.key), mutex_sleeping) + if v == mutex_unlocked { + return + } + wait = mutex_sleeping + futexsleep(key32(&l.key), mutex_sleeping, -1) + } +} + +func unlock(l *mutex) { + v := xchg(key32(&l.key), mutex_unlocked) + if v == mutex_unlocked { + gothrow("unlock of unlocked lock") + } + if v == mutex_sleeping { + futexwakeup(key32(&l.key), 1) + } + + gp := getg() + gp.m.locks-- + if gp.m.locks < 0 { + gothrow("runtime·unlock: lock count") + } + if gp.m.locks == 0 && gp.preempt { // restore the preemption request in case we've cleared it in newstack + gp.stackguard0 = stackPreempt + } +} + +// One-time notifications. +func noteclear(n *note) { + n.key = 0 +} + +func notewakeup(n *note) { + old := xchg(key32(&n.key), 1) + if old != 0 { + print("notewakeup - double wakeup (", old, ")\n") + gothrow("notewakeup - double wakeup") + } + futexwakeup(key32(&n.key), 1) +} + +func notesleep(n *note) { + gp := getg() + if gp != gp.m.g0 { + gothrow("notesleep not on g0") + } + for atomicload(key32(&n.key)) == 0 { + gp.m.blocked = true + futexsleep(key32(&n.key), 0, -1) + gp.m.blocked = false + } +} + +//go:nosplit +func notetsleep_internal(n *note, ns int64) bool { + gp := getg() + + if ns < 0 { + for atomicload(key32(&n.key)) == 0 { + gp.m.blocked = true + futexsleep(key32(&n.key), 0, -1) + gp.m.blocked = false + } + return true + } + + if atomicload(key32(&n.key)) != 0 { + return true + } + + deadline := nanotime() + ns + for { + gp.m.blocked = true + futexsleep(key32(&n.key), 0, ns) + gp.m.blocked = false + if atomicload(key32(&n.key)) != 0 { + break + } + now := nanotime() + if now >= deadline { + break + } + ns = deadline - now + } + return atomicload(key32(&n.key)) != 0 +} + +func notetsleep(n *note, ns int64) bool { + gp := getg() + if gp != gp.m.g0 && gp.m.gcing == 0 { + gothrow("notetsleep not on g0") + } + + return notetsleep_internal(n, ns) +} + +// same as runtime·notetsleep, but called on user g (not g0) +// calls only nosplit functions between entersyscallblock/exitsyscall +func notetsleepg(n *note, ns int64) bool { + gp := getg() + if gp == gp.m.g0 { + gothrow("notetsleepg on g0") + } + + entersyscallblock() + ok := notetsleep_internal(n, ns) + exitsyscall() + return ok +} diff --git a/src/runtime/lock_sema.go b/src/runtime/lock_sema.go new file mode 100644 index 000000000..d136b8280 --- /dev/null +++ b/src/runtime/lock_sema.go @@ -0,0 +1,270 @@ +// Copyright 2011 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// +build darwin nacl netbsd openbsd plan9 solaris windows + +package runtime + +import "unsafe" + +// This implementation depends on OS-specific implementations of +// +// uintptr runtime·semacreate(void) +// Create a semaphore, which will be assigned to m->waitsema. +// The zero value is treated as absence of any semaphore, +// so be sure to return a non-zero value. +// +// int32 runtime·semasleep(int64 ns) +// If ns < 0, acquire m->waitsema and return 0. +// If ns >= 0, try to acquire m->waitsema for at most ns nanoseconds. +// Return 0 if the semaphore was acquired, -1 if interrupted or timed out. +// +// int32 runtime·semawakeup(M *mp) +// Wake up mp, which is or will soon be sleeping on mp->waitsema. 
+// +const ( + locked uintptr = 1 + + active_spin = 4 + active_spin_cnt = 30 + passive_spin = 1 +) + +func semacreate() uintptr +func semasleep(int64) int32 +func semawakeup(mp *m) + +func lock(l *mutex) { + gp := getg() + if gp.m.locks < 0 { + gothrow("runtime·lock: lock count") + } + gp.m.locks++ + + // Speculative grab for lock. + if casuintptr(&l.key, 0, locked) { + return + } + if gp.m.waitsema == 0 { + gp.m.waitsema = semacreate() + } + + // On uniprocessor's, no point spinning. + // On multiprocessors, spin for ACTIVE_SPIN attempts. + spin := 0 + if ncpu > 1 { + spin = active_spin + } +Loop: + for i := 0; ; i++ { + v := atomicloaduintptr(&l.key) + if v&locked == 0 { + // Unlocked. Try to lock. + if casuintptr(&l.key, v, v|locked) { + return + } + i = 0 + } + if i < spin { + procyield(active_spin_cnt) + } else if i < spin+passive_spin { + osyield() + } else { + // Someone else has it. + // l->waitm points to a linked list of M's waiting + // for this lock, chained through m->nextwaitm. + // Queue this M. + for { + gp.m.nextwaitm = (*m)((unsafe.Pointer)(v &^ locked)) + if casuintptr(&l.key, v, uintptr(unsafe.Pointer(gp.m))|locked) { + break + } + v = atomicloaduintptr(&l.key) + if v&locked == 0 { + continue Loop + } + } + if v&locked != 0 { + // Queued. Wait. + semasleep(-1) + i = 0 + } + } + } +} + +func unlock(l *mutex) { + gp := getg() + var mp *m + for { + v := atomicloaduintptr(&l.key) + if v == locked { + if casuintptr(&l.key, locked, 0) { + break + } + } else { + // Other M's are waiting for the lock. + // Dequeue an M. + mp = (*m)((unsafe.Pointer)(v &^ locked)) + if casuintptr(&l.key, v, uintptr(unsafe.Pointer(mp.nextwaitm))) { + // Dequeued an M. Wake it. + semawakeup(mp) + break + } + } + } + gp.m.locks-- + if gp.m.locks < 0 { + gothrow("runtime·unlock: lock count") + } + if gp.m.locks == 0 && gp.preempt { // restore the preemption request in case we've cleared it in newstack + gp.stackguard0 = stackPreempt + } +} + +// One-time notifications. +func noteclear(n *note) { + n.key = 0 +} + +func notewakeup(n *note) { + var v uintptr + for { + v = atomicloaduintptr(&n.key) + if casuintptr(&n.key, v, locked) { + break + } + } + + // Successfully set waitm to locked. + // What was it before? + switch { + case v == 0: + // Nothing was waiting. Done. + case v == locked: + // Two notewakeups! Not allowed. + gothrow("notewakeup - double wakeup") + default: + // Must be the waiting m. Wake it up. + semawakeup((*m)(unsafe.Pointer(v))) + } +} + +func notesleep(n *note) { + gp := getg() + if gp != gp.m.g0 { + gothrow("notesleep not on g0") + } + if gp.m.waitsema == 0 { + gp.m.waitsema = semacreate() + } + if !casuintptr(&n.key, 0, uintptr(unsafe.Pointer(gp.m))) { + // Must be locked (got wakeup). + if n.key != locked { + gothrow("notesleep - waitm out of sync") + } + return + } + // Queued. Sleep. + gp.m.blocked = true + semasleep(-1) + gp.m.blocked = false +} + +//go:nosplit +func notetsleep_internal(n *note, ns int64, gp *g, deadline int64) bool { + // gp and deadline are logically local variables, but they are written + // as parameters so that the stack space they require is charged + // to the caller. + // This reduces the nosplit footprint of notetsleep_internal. + gp = getg() + + // Register for wakeup on n->waitm. + if !casuintptr(&n.key, 0, uintptr(unsafe.Pointer(gp.m))) { + // Must be locked (got wakeup). + if n.key != locked { + gothrow("notetsleep - waitm out of sync") + } + return true + } + if ns < 0 { + // Queued. Sleep. 
+ gp.m.blocked = true + semasleep(-1) + gp.m.blocked = false + return true + } + + deadline = nanotime() + ns + for { + // Registered. Sleep. + gp.m.blocked = true + if semasleep(ns) >= 0 { + gp.m.blocked = false + // Acquired semaphore, semawakeup unregistered us. + // Done. + return true + } + gp.m.blocked = false + // Interrupted or timed out. Still registered. Semaphore not acquired. + ns = deadline - nanotime() + if ns <= 0 { + break + } + // Deadline hasn't arrived. Keep sleeping. + } + + // Deadline arrived. Still registered. Semaphore not acquired. + // Want to give up and return, but have to unregister first, + // so that any notewakeup racing with the return does not + // try to grant us the semaphore when we don't expect it. + for { + v := atomicloaduintptr(&n.key) + switch v { + case uintptr(unsafe.Pointer(gp.m)): + // No wakeup yet; unregister if possible. + if casuintptr(&n.key, v, 0) { + return false + } + case locked: + // Wakeup happened so semaphore is available. + // Grab it to avoid getting out of sync. + gp.m.blocked = true + if semasleep(-1) < 0 { + gothrow("runtime: unable to acquire - semaphore out of sync") + } + gp.m.blocked = false + return true + default: + gothrow("runtime: unexpected waitm - semaphore out of sync") + } + } +} + +func notetsleep(n *note, ns int64) bool { + gp := getg() + if gp != gp.m.g0 && gp.m.gcing == 0 { + gothrow("notetsleep not on g0") + } + if gp.m.waitsema == 0 { + gp.m.waitsema = semacreate() + } + return notetsleep_internal(n, ns, nil, 0) +} + +// same as runtime·notetsleep, but called on user g (not g0) +// calls only nosplit functions between entersyscallblock/exitsyscall +func notetsleepg(n *note, ns int64) bool { + gp := getg() + if gp == gp.m.g0 { + gothrow("notetsleepg on g0") + } + if gp.m.waitsema == 0 { + gp.m.waitsema = semacreate() + } + entersyscallblock() + ok := notetsleep_internal(n, ns, nil, 0) + exitsyscall() + return ok +} diff --git a/src/runtime/malloc.c b/src/runtime/malloc.c new file mode 100644 index 000000000..b79c30b72 --- /dev/null +++ b/src/runtime/malloc.c @@ -0,0 +1,396 @@ +// Copyright 2009 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// See malloc.h for overview. +// +// TODO(rsc): double-check stats. + +#include "runtime.h" +#include "arch_GOARCH.h" +#include "malloc.h" +#include "type.h" +#include "typekind.h" +#include "race.h" +#include "stack.h" +#include "textflag.h" + +// Mark mheap as 'no pointers', it does not contain interesting pointers but occupies ~45K. +#pragma dataflag NOPTR +MHeap runtime·mheap; +#pragma dataflag NOPTR +MStats runtime·memstats; + +int32 +runtime·mlookup(void *v, byte **base, uintptr *size, MSpan **sp) +{ + uintptr n, i; + byte *p; + MSpan *s; + + g->m->mcache->local_nlookup++; + if (sizeof(void*) == 4 && g->m->mcache->local_nlookup >= (1<<30)) { + // purge cache stats to prevent overflow + runtime·lock(&runtime·mheap.lock); + runtime·purgecachedstats(g->m->mcache); + runtime·unlock(&runtime·mheap.lock); + } + + s = runtime·MHeap_LookupMaybe(&runtime·mheap, v); + if(sp) + *sp = s; + if(s == nil) { + if(base) + *base = nil; + if(size) + *size = 0; + return 0; + } + + p = (byte*)((uintptr)s->start<<PageShift); + if(s->sizeclass == 0) { + // Large object. 
+ if(base) + *base = p; + if(size) + *size = s->npages<<PageShift; + return 1; + } + + n = s->elemsize; + if(base) { + i = ((byte*)v - p)/n; + *base = p + i*n; + } + if(size) + *size = n; + + return 1; +} + +#pragma textflag NOSPLIT +void +runtime·purgecachedstats(MCache *c) +{ + MHeap *h; + int32 i; + + // Protected by either heap or GC lock. + h = &runtime·mheap; + mstats.heap_alloc += c->local_cachealloc; + c->local_cachealloc = 0; + mstats.tinyallocs += c->local_tinyallocs; + c->local_tinyallocs = 0; + mstats.nlookup += c->local_nlookup; + c->local_nlookup = 0; + h->largefree += c->local_largefree; + c->local_largefree = 0; + h->nlargefree += c->local_nlargefree; + c->local_nlargefree = 0; + for(i=0; i<nelem(c->local_nsmallfree); i++) { + h->nsmallfree[i] += c->local_nsmallfree[i]; + c->local_nsmallfree[i] = 0; + } +} + +// Size of the trailing by_size array differs between Go and C, +// and all data after by_size is local to C, not exported to Go. +// NumSizeClasses was changed, but we can not change Go struct because of backward compatibility. +// sizeof_C_MStats is what C thinks about size of Go struct. +uintptr runtime·sizeof_C_MStats = offsetof(MStats, by_size[61]); + +#define MaxArena32 (2U<<30) + +// For use by Go. If it were a C enum it would be made available automatically, +// but the value of MaxMem is too large for enum. +uintptr runtime·maxmem = MaxMem; + +void +runtime·mallocinit(void) +{ + byte *p, *p1; + uintptr arena_size, bitmap_size, spans_size, p_size; + extern byte runtime·end[]; + uintptr limit; + uint64 i; + bool reserved; + + p = nil; + p_size = 0; + arena_size = 0; + bitmap_size = 0; + spans_size = 0; + reserved = false; + + // for 64-bit build + USED(p); + USED(p_size); + USED(arena_size); + USED(bitmap_size); + USED(spans_size); + + runtime·InitSizes(); + + if(runtime·class_to_size[TinySizeClass] != TinySize) + runtime·throw("bad TinySizeClass"); + + // limit = runtime·memlimit(); + // See https://code.google.com/p/go/issues/detail?id=5049 + // TODO(rsc): Fix after 1.1. + limit = 0; + + // Set up the allocation arena, a contiguous area of memory where + // allocated data will be found. The arena begins with a bitmap large + // enough to hold 4 bits per allocated word. + if(sizeof(void*) == 8 && (limit == 0 || limit > (1<<30))) { + // On a 64-bit machine, allocate from a single contiguous reservation. + // 128 GB (MaxMem) should be big enough for now. + // + // The code will work with the reservation at any address, but ask + // SysReserve to use 0x0000XXc000000000 if possible (XX=00...7f). + // Allocating a 128 GB region takes away 37 bits, and the amd64 + // doesn't let us choose the top 17 bits, so that leaves the 11 bits + // in the middle of 0x00c0 for us to choose. Choosing 0x00c0 means + // that the valid memory addresses will begin 0x00c0, 0x00c1, ..., 0x00df. + // In little-endian, that's c0 00, c1 00, ..., df 00. None of those are valid + // UTF-8 sequences, and they are otherwise as far away from + // ff (likely a common byte) as possible. If that fails, we try other 0xXXc0 + // addresses. An earlier attempt to use 0x11f8 caused out of memory errors + // on OS X during thread allocations. 0x00c0 causes conflicts with + // AddressSanitizer which reserves all memory up to 0x0100. + // These choices are both for debuggability and to reduce the + // odds of the conservative garbage collector not collecting memory + // because some non-pointer block of memory had a bit pattern + // that matched a memory address. 
+ // + // Actually we reserve 136 GB (because the bitmap ends up being 8 GB) + // but it hardly matters: e0 00 is not valid UTF-8 either. + // + // If this fails we fall back to the 32 bit memory mechanism + arena_size = MaxMem; + bitmap_size = arena_size / (sizeof(void*)*8/4); + spans_size = arena_size / PageSize * sizeof(runtime·mheap.spans[0]); + spans_size = ROUND(spans_size, PageSize); + for(i = 0; i <= 0x7f; i++) { + p = (void*)(i<<40 | 0x00c0ULL<<32); + p_size = bitmap_size + spans_size + arena_size + PageSize; + p = runtime·SysReserve(p, p_size, &reserved); + if(p != nil) + break; + } + } + if (p == nil) { + // On a 32-bit machine, we can't typically get away + // with a giant virtual address space reservation. + // Instead we map the memory information bitmap + // immediately after the data segment, large enough + // to handle another 2GB of mappings (256 MB), + // along with a reservation for another 512 MB of memory. + // When that gets used up, we'll start asking the kernel + // for any memory anywhere and hope it's in the 2GB + // following the bitmap (presumably the executable begins + // near the bottom of memory, so we'll have to use up + // most of memory before the kernel resorts to giving out + // memory before the beginning of the text segment). + // + // Alternatively we could reserve 512 MB bitmap, enough + // for 4GB of mappings, and then accept any memory the + // kernel threw at us, but normally that's a waste of 512 MB + // of address space, which is probably too much in a 32-bit world. + bitmap_size = MaxArena32 / (sizeof(void*)*8/4); + arena_size = 512<<20; + spans_size = MaxArena32 / PageSize * sizeof(runtime·mheap.spans[0]); + if(limit > 0 && arena_size+bitmap_size+spans_size > limit) { + bitmap_size = (limit / 9) & ~((1<<PageShift) - 1); + arena_size = bitmap_size * 8; + spans_size = arena_size / PageSize * sizeof(runtime·mheap.spans[0]); + } + spans_size = ROUND(spans_size, PageSize); + + // SysReserve treats the address we ask for, end, as a hint, + // not as an absolute requirement. If we ask for the end + // of the data segment but the operating system requires + // a little more space before we can start allocating, it will + // give out a slightly higher pointer. Except QEMU, which + // is buggy, as usual: it won't adjust the pointer upward. + // So adjust it upward a little bit ourselves: 1/4 MB to get + // away from the running binary image and then round up + // to a MB boundary. + p = (byte*)ROUND((uintptr)runtime·end + (1<<18), 1<<20); + p_size = bitmap_size + spans_size + arena_size + PageSize; + p = runtime·SysReserve(p, p_size, &reserved); + if(p == nil) + runtime·throw("runtime: cannot reserve arena virtual address space"); + } + + // PageSize can be larger than OS definition of page size, + // so SysReserve can give us a PageSize-unaligned pointer. + // To overcome this we ask for PageSize more and round up the pointer. + p1 = (byte*)ROUND((uintptr)p, PageSize); + + runtime·mheap.spans = (MSpan**)p1; + runtime·mheap.bitmap = p1 + spans_size; + runtime·mheap.arena_start = p1 + spans_size + bitmap_size; + runtime·mheap.arena_used = runtime·mheap.arena_start; + runtime·mheap.arena_end = p + p_size; + runtime·mheap.arena_reserved = reserved; + + if(((uintptr)runtime·mheap.arena_start & (PageSize-1)) != 0) + runtime·throw("misrounded allocation in mallocinit"); + + // Initialize the rest of the allocator. 
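+	// Worked sizing example (an illustrative note, not upstream code): on a
+	// typical amd64 build with MaxMem = 128 GB and 8 kB pages, the values
+	// computed above come out to roughly
+	//	bitmap_size = 128 GB / (8*8/4)        = 8 GB
+	//	spans_size  = 128 GB / 8 kB * 8 bytes = 128 MB (rounded up to a page)
+	// so the single SysReserve call asks for about 136 GB of address space,
+	// which is why the comment above speaks of a 136 GB reservation.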
+ runtime·MHeap_Init(&runtime·mheap); + g->m->mcache = runtime·allocmcache(); +} + +void* +runtime·MHeap_SysAlloc(MHeap *h, uintptr n) +{ + byte *p, *p_end; + uintptr p_size; + bool reserved; + + if(n > h->arena_end - h->arena_used) { + // We are in 32-bit mode, maybe we didn't use all possible address space yet. + // Reserve some more space. + byte *new_end; + + p_size = ROUND(n + PageSize, 256<<20); + new_end = h->arena_end + p_size; + if(new_end <= h->arena_start + MaxArena32) { + // TODO: It would be bad if part of the arena + // is reserved and part is not. + p = runtime·SysReserve(h->arena_end, p_size, &reserved); + if(p == h->arena_end) { + h->arena_end = new_end; + h->arena_reserved = reserved; + } + else if(p+p_size <= h->arena_start + MaxArena32) { + // Keep everything page-aligned. + // Our pages are bigger than hardware pages. + h->arena_end = p+p_size; + h->arena_used = p + (-(uintptr)p&(PageSize-1)); + h->arena_reserved = reserved; + } else { + uint64 stat; + stat = 0; + runtime·SysFree(p, p_size, &stat); + } + } + } + if(n <= h->arena_end - h->arena_used) { + // Keep taking from our reservation. + p = h->arena_used; + runtime·SysMap(p, n, h->arena_reserved, &mstats.heap_sys); + h->arena_used += n; + runtime·MHeap_MapBits(h); + runtime·MHeap_MapSpans(h); + if(raceenabled) + runtime·racemapshadow(p, n); + + if(((uintptr)p & (PageSize-1)) != 0) + runtime·throw("misrounded allocation in MHeap_SysAlloc"); + return p; + } + + // If using 64-bit, our reservation is all we have. + if(h->arena_end - h->arena_start >= MaxArena32) + return nil; + + // On 32-bit, once the reservation is gone we can + // try to get memory at a location chosen by the OS + // and hope that it is in the range we allocated bitmap for. + p_size = ROUND(n, PageSize) + PageSize; + p = runtime·sysAlloc(p_size, &mstats.heap_sys); + if(p == nil) + return nil; + + if(p < h->arena_start || p+p_size - h->arena_start >= MaxArena32) { + runtime·printf("runtime: memory allocated by OS (%p) not in usable range [%p,%p)\n", + p, h->arena_start, h->arena_start+MaxArena32); + runtime·SysFree(p, p_size, &mstats.heap_sys); + return nil; + } + + p_end = p + p_size; + p += -(uintptr)p & (PageSize-1); + if(p+n > h->arena_used) { + h->arena_used = p+n; + if(p_end > h->arena_end) + h->arena_end = p_end; + runtime·MHeap_MapBits(h); + runtime·MHeap_MapSpans(h); + if(raceenabled) + runtime·racemapshadow(p, n); + } + + if(((uintptr)p & (PageSize-1)) != 0) + runtime·throw("misrounded allocation in MHeap_SysAlloc"); + return p; +} + +void +runtime·setFinalizer_m(void) +{ + FuncVal *fn; + void *arg; + uintptr nret; + Type *fint; + PtrType *ot; + + fn = g->m->ptrarg[0]; + arg = g->m->ptrarg[1]; + nret = g->m->scalararg[0]; + fint = g->m->ptrarg[2]; + ot = g->m->ptrarg[3]; + g->m->ptrarg[0] = nil; + g->m->ptrarg[1] = nil; + g->m->ptrarg[2] = nil; + g->m->ptrarg[3] = nil; + + g->m->scalararg[0] = runtime·addfinalizer(arg, fn, nret, fint, ot); +} + +void +runtime·removeFinalizer_m(void) +{ + void *p; + + p = g->m->ptrarg[0]; + g->m->ptrarg[0] = nil; + runtime·removefinalizer(p); +} + +// mcallable cache refill +void +runtime·mcacheRefill_m(void) +{ + runtime·MCache_Refill(g->m->mcache, (int32)g->m->scalararg[0]); +} + +void +runtime·largeAlloc_m(void) +{ + uintptr npages, size; + MSpan *s; + void *v; + int32 flag; + + //runtime·printf("largeAlloc size=%D\n", g->m->scalararg[0]); + // Allocate directly from heap. 
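+	// Page-count arithmetic below, worked through for one case (an
+	// illustrative aside, not upstream code): with PageShift = 13 the page
+	// size is 8 kB, so a 33 kB request gives size>>PageShift = 4 with a
+	// nonzero remainder (size & PageMask), and npages is bumped to 5.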
+ size = g->m->scalararg[0]; + flag = (int32)g->m->scalararg[1]; + if(size + PageSize < size) + runtime·throw("out of memory"); + npages = size >> PageShift; + if((size & PageMask) != 0) + npages++; + s = runtime·MHeap_Alloc(&runtime·mheap, npages, 0, 1, !(flag & FlagNoZero)); + if(s == nil) + runtime·throw("out of memory"); + s->limit = (byte*)(s->start<<PageShift) + size; + v = (void*)(s->start << PageShift); + // setup for mark sweep + runtime·markspan(v, 0, 0, true); + g->m->ptrarg[0] = s; +} diff --git a/src/runtime/malloc.go b/src/runtime/malloc.go new file mode 100644 index 000000000..117044944 --- /dev/null +++ b/src/runtime/malloc.go @@ -0,0 +1,837 @@ +// Copyright 2014 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package runtime + +import ( + "unsafe" +) + +const ( + debugMalloc = false + + flagNoScan = _FlagNoScan + flagNoZero = _FlagNoZero + + maxTinySize = _TinySize + tinySizeClass = _TinySizeClass + maxSmallSize = _MaxSmallSize + + pageShift = _PageShift + pageSize = _PageSize + pageMask = _PageMask + + bitsPerPointer = _BitsPerPointer + bitsMask = _BitsMask + pointersPerByte = _PointersPerByte + maxGCMask = _MaxGCMask + bitsDead = _BitsDead + bitsPointer = _BitsPointer + + mSpanInUse = _MSpanInUse + + concurrentSweep = _ConcurrentSweep != 0 +) + +// Page number (address>>pageShift) +type pageID uintptr + +// base address for all 0-byte allocations +var zerobase uintptr + +// Allocate an object of size bytes. +// Small objects are allocated from the per-P cache's free lists. +// Large objects (> 32 kB) are allocated straight from the heap. +func mallocgc(size uintptr, typ *_type, flags uint32) unsafe.Pointer { + if size == 0 { + return unsafe.Pointer(&zerobase) + } + size0 := size + + if flags&flagNoScan == 0 && typ == nil { + gothrow("malloc missing type") + } + + // This function must be atomic wrt GC, but for performance reasons + // we don't acquirem/releasem on fast path. The code below does not have + // split stack checks, so it can't be preempted by GC. + // Functions like roundup/add are inlined. And onM/racemalloc are nosplit. + // If debugMalloc = true, these assumptions are checked below. + if debugMalloc { + mp := acquirem() + if mp.mallocing != 0 { + gothrow("malloc deadlock") + } + mp.mallocing = 1 + if mp.curg != nil { + mp.curg.stackguard0 = ^uintptr(0xfff) | 0xbad + } + } + + c := gomcache() + var s *mspan + var x unsafe.Pointer + if size <= maxSmallSize { + if flags&flagNoScan != 0 && size < maxTinySize { + // Tiny allocator. + // + // Tiny allocator combines several tiny allocation requests + // into a single memory block. The resulting memory block + // is freed when all subobjects are unreachable. The subobjects + // must be FlagNoScan (don't have pointers), this ensures that + // the amount of potentially wasted memory is bounded. + // + // Size of the memory block used for combining (maxTinySize) is tunable. + // Current setting is 16 bytes, which relates to 2x worst case memory + // wastage (when all but one subobjects are unreachable). + // 8 bytes would result in no wastage at all, but provides less + // opportunities for combining. + // 32 bytes provides more opportunities for combining, + // but can lead to 4x worst case wastage. + // The best case winning is 8x regardless of block size. + // + // Objects obtained from tiny allocator must not be freed explicitly. 
+ // So when an object will be freed explicitly, we ensure that + // its size >= maxTinySize. + // + // SetFinalizer has a special case for objects potentially coming + // from tiny allocator, it such case it allows to set finalizers + // for an inner byte of a memory block. + // + // The main targets of tiny allocator are small strings and + // standalone escaping variables. On a json benchmark + // the allocator reduces number of allocations by ~12% and + // reduces heap size by ~20%. + tinysize := uintptr(c.tinysize) + if size <= tinysize { + tiny := unsafe.Pointer(c.tiny) + // Align tiny pointer for required (conservative) alignment. + if size&7 == 0 { + tiny = roundup(tiny, 8) + } else if size&3 == 0 { + tiny = roundup(tiny, 4) + } else if size&1 == 0 { + tiny = roundup(tiny, 2) + } + size1 := size + (uintptr(tiny) - uintptr(unsafe.Pointer(c.tiny))) + if size1 <= tinysize { + // The object fits into existing tiny block. + x = tiny + c.tiny = (*byte)(add(x, size)) + c.tinysize -= uintptr(size1) + c.local_tinyallocs++ + if debugMalloc { + mp := acquirem() + if mp.mallocing == 0 { + gothrow("bad malloc") + } + mp.mallocing = 0 + if mp.curg != nil { + mp.curg.stackguard0 = mp.curg.stack.lo + _StackGuard + } + // Note: one releasem for the acquirem just above. + // The other for the acquirem at start of malloc. + releasem(mp) + releasem(mp) + } + return x + } + } + // Allocate a new maxTinySize block. + s = c.alloc[tinySizeClass] + v := s.freelist + if v == nil { + mp := acquirem() + mp.scalararg[0] = tinySizeClass + onM(mcacheRefill_m) + releasem(mp) + s = c.alloc[tinySizeClass] + v = s.freelist + } + s.freelist = v.next + s.ref++ + //TODO: prefetch v.next + x = unsafe.Pointer(v) + (*[2]uint64)(x)[0] = 0 + (*[2]uint64)(x)[1] = 0 + // See if we need to replace the existing tiny block with the new one + // based on amount of remaining free space. + if maxTinySize-size > tinysize { + c.tiny = (*byte)(add(x, size)) + c.tinysize = uintptr(maxTinySize - size) + } + size = maxTinySize + } else { + var sizeclass int8 + if size <= 1024-8 { + sizeclass = size_to_class8[(size+7)>>3] + } else { + sizeclass = size_to_class128[(size-1024+127)>>7] + } + size = uintptr(class_to_size[sizeclass]) + s = c.alloc[sizeclass] + v := s.freelist + if v == nil { + mp := acquirem() + mp.scalararg[0] = uintptr(sizeclass) + onM(mcacheRefill_m) + releasem(mp) + s = c.alloc[sizeclass] + v = s.freelist + } + s.freelist = v.next + s.ref++ + //TODO: prefetch + x = unsafe.Pointer(v) + if flags&flagNoZero == 0 { + v.next = nil + if size > 2*ptrSize && ((*[2]uintptr)(x))[1] != 0 { + memclr(unsafe.Pointer(v), size) + } + } + } + c.local_cachealloc += intptr(size) + } else { + mp := acquirem() + mp.scalararg[0] = uintptr(size) + mp.scalararg[1] = uintptr(flags) + onM(largeAlloc_m) + s = (*mspan)(mp.ptrarg[0]) + mp.ptrarg[0] = nil + releasem(mp) + x = unsafe.Pointer(uintptr(s.start << pageShift)) + size = uintptr(s.elemsize) + } + + if flags&flagNoScan != 0 { + // All objects are pre-marked as noscan. + goto marked + } + + // If allocating a defer+arg block, now that we've picked a malloc size + // large enough to hold everything, cut the "asked for" size down to + // just the defer header, so that the GC bitmap will record the arg block + // as containing nothing at all (as if it were unused space at the end of + // a malloc block caused by size rounding). + // The defer arg areas are scanned as part of scanstack. 
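+	// A worked example for the tiny-allocator path above (an illustrative
+	// aside, not part of the original source): with maxTinySize = 16, a
+	// 4-byte noscan allocation followed by a 12-byte noscan allocation can
+	// share one block: the first leaves c.tinysize = 12, and the second
+	// needs exactly 12 bytes at an already 4-aligned offset, so it is
+	// served from the same block and counted in local_tinyallocs.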
+ if typ == deferType { + size0 = unsafe.Sizeof(_defer{}) + } + + // From here till marked label marking the object as allocated + // and storing type info in the GC bitmap. + { + arena_start := uintptr(unsafe.Pointer(mheap_.arena_start)) + off := (uintptr(x) - arena_start) / ptrSize + xbits := (*uint8)(unsafe.Pointer(arena_start - off/wordsPerBitmapByte - 1)) + shift := (off % wordsPerBitmapByte) * gcBits + if debugMalloc && ((*xbits>>shift)&(bitMask|bitPtrMask)) != bitBoundary { + println("runtime: bits =", (*xbits>>shift)&(bitMask|bitPtrMask)) + gothrow("bad bits in markallocated") + } + + var ti, te uintptr + var ptrmask *uint8 + if size == ptrSize { + // It's one word and it has pointers, it must be a pointer. + *xbits |= (bitsPointer << 2) << shift + goto marked + } + if typ.kind&kindGCProg != 0 { + nptr := (uintptr(typ.size) + ptrSize - 1) / ptrSize + masksize := nptr + if masksize%2 != 0 { + masksize *= 2 // repeated + } + masksize = masksize * pointersPerByte / 8 // 4 bits per word + masksize++ // unroll flag in the beginning + if masksize > maxGCMask && typ.gc[1] != 0 { + // If the mask is too large, unroll the program directly + // into the GC bitmap. It's 7 times slower than copying + // from the pre-unrolled mask, but saves 1/16 of type size + // memory for the mask. + mp := acquirem() + mp.ptrarg[0] = x + mp.ptrarg[1] = unsafe.Pointer(typ) + mp.scalararg[0] = uintptr(size) + mp.scalararg[1] = uintptr(size0) + onM(unrollgcproginplace_m) + releasem(mp) + goto marked + } + ptrmask = (*uint8)(unsafe.Pointer(uintptr(typ.gc[0]))) + // Check whether the program is already unrolled. + if uintptr(atomicloadp(unsafe.Pointer(ptrmask)))&0xff == 0 { + mp := acquirem() + mp.ptrarg[0] = unsafe.Pointer(typ) + onM(unrollgcprog_m) + releasem(mp) + } + ptrmask = (*uint8)(add(unsafe.Pointer(ptrmask), 1)) // skip the unroll flag byte + } else { + ptrmask = (*uint8)(unsafe.Pointer(typ.gc[0])) // pointer to unrolled mask + } + if size == 2*ptrSize { + *xbits = *ptrmask | bitBoundary + goto marked + } + te = uintptr(typ.size) / ptrSize + // If the type occupies odd number of words, its mask is repeated. + if te%2 == 0 { + te /= 2 + } + // Copy pointer bitmask into the bitmap. + for i := uintptr(0); i < size0; i += 2 * ptrSize { + v := *(*uint8)(add(unsafe.Pointer(ptrmask), ti)) + ti++ + if ti == te { + ti = 0 + } + if i == 0 { + v |= bitBoundary + } + if i+ptrSize == size0 { + v &^= uint8(bitPtrMask << 4) + } + + *xbits = v + xbits = (*byte)(add(unsafe.Pointer(xbits), ^uintptr(0))) + } + if size0%(2*ptrSize) == 0 && size0 < size { + // Mark the word after last object's word as bitsDead. + *xbits = bitsDead << 2 + } + } +marked: + if raceenabled { + racemalloc(x, size) + } + + if debugMalloc { + mp := acquirem() + if mp.mallocing == 0 { + gothrow("bad malloc") + } + mp.mallocing = 0 + if mp.curg != nil { + mp.curg.stackguard0 = mp.curg.stack.lo + _StackGuard + } + // Note: one releasem for the acquirem just above. + // The other for the acquirem at start of malloc. 
+ releasem(mp) + releasem(mp) + } + + if debug.allocfreetrace != 0 { + tracealloc(x, size, typ) + } + + if rate := MemProfileRate; rate > 0 { + if size < uintptr(rate) && int32(size) < c.next_sample { + c.next_sample -= int32(size) + } else { + mp := acquirem() + profilealloc(mp, x, size) + releasem(mp) + } + } + + if memstats.heap_alloc >= memstats.next_gc { + gogc(0) + } + + return x +} + +// implementation of new builtin +func newobject(typ *_type) unsafe.Pointer { + flags := uint32(0) + if typ.kind&kindNoPointers != 0 { + flags |= flagNoScan + } + return mallocgc(uintptr(typ.size), typ, flags) +} + +// implementation of make builtin for slices +func newarray(typ *_type, n uintptr) unsafe.Pointer { + flags := uint32(0) + if typ.kind&kindNoPointers != 0 { + flags |= flagNoScan + } + if int(n) < 0 || (typ.size > 0 && n > maxmem/uintptr(typ.size)) { + panic("runtime: allocation size out of range") + } + return mallocgc(uintptr(typ.size)*n, typ, flags) +} + +// rawmem returns a chunk of pointerless memory. It is +// not zeroed. +func rawmem(size uintptr) unsafe.Pointer { + return mallocgc(size, nil, flagNoScan|flagNoZero) +} + +// round size up to next size class +func goroundupsize(size uintptr) uintptr { + if size < maxSmallSize { + if size <= 1024-8 { + return uintptr(class_to_size[size_to_class8[(size+7)>>3]]) + } + return uintptr(class_to_size[size_to_class128[(size-1024+127)>>7]]) + } + if size+pageSize < size { + return size + } + return (size + pageSize - 1) &^ pageMask +} + +func profilealloc(mp *m, x unsafe.Pointer, size uintptr) { + c := mp.mcache + rate := MemProfileRate + if size < uintptr(rate) { + // pick next profile time + // If you change this, also change allocmcache. + if rate > 0x3fffffff { // make 2*rate not overflow + rate = 0x3fffffff + } + next := int32(fastrand1()) % (2 * int32(rate)) + // Subtract the "remainder" of the current allocation. + // Otherwise objects that are close in size to sampling rate + // will be under-sampled, because we consistently discard this remainder. + next -= (int32(size) - c.next_sample) + if next < 0 { + next = 0 + } + c.next_sample = next + } + + mProf_Malloc(x, size) +} + +// force = 1 - do GC regardless of current heap usage +// force = 2 - go GC and eager sweep +func gogc(force int32) { + // The gc is turned off (via enablegc) until the bootstrap has completed. + // Also, malloc gets called in the guts of a number of libraries that might be + // holding locks. To avoid deadlocks during stoptheworld, don't bother + // trying to run gc while holding a lock. The next mallocgc without a lock + // will do the gc instead. + mp := acquirem() + if gp := getg(); gp == mp.g0 || mp.locks > 1 || !memstats.enablegc || panicking != 0 || gcpercent < 0 { + releasem(mp) + return + } + releasem(mp) + mp = nil + + semacquire(&worldsema, false) + + if force == 0 && memstats.heap_alloc < memstats.next_gc { + // typically threads which lost the race to grab + // worldsema exit here when gc is done. + semrelease(&worldsema) + return + } + + // Ok, we're doing it! Stop everybody else + startTime := nanotime() + mp = acquirem() + mp.gcing = 1 + releasem(mp) + onM(stoptheworld) + if mp != acquirem() { + gothrow("gogc: rescheduled") + } + + clearpools() + + // Run gc on the g0 stack. We do this so that the g stack + // we're currently running on will no longer change. Cuts + // the root set down a bit (g0 stacks are not scanned, and + // we don't need to scan gc's internal state). We also + // need to switch to g0 so we can shrink the stack. 
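+	// (Explanatory note, not upstream code:) the loop below runs the
+	// collection once, or twice when gctrace > 1, presumably so the second
+	// run's trace describes a heap that has already been collected once.
+	// startTime is split across two scalararg slots because scalararg holds
+	// uintptr values, which may be only 32 bits wide; gc_m reassembles the
+	// 64-bit value on the other side.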
+ n := 1 + if debug.gctrace > 1 { + n = 2 + } + for i := 0; i < n; i++ { + if i > 0 { + startTime = nanotime() + } + // switch to g0, call gc, then switch back + mp.scalararg[0] = uintptr(uint32(startTime)) // low 32 bits + mp.scalararg[1] = uintptr(startTime >> 32) // high 32 bits + if force >= 2 { + mp.scalararg[2] = 1 // eagersweep + } else { + mp.scalararg[2] = 0 + } + onM(gc_m) + } + + // all done + mp.gcing = 0 + semrelease(&worldsema) + onM(starttheworld) + releasem(mp) + mp = nil + + // now that gc is done, kick off finalizer thread if needed + if !concurrentSweep { + // give the queued finalizers, if any, a chance to run + Gosched() + } +} + +// GC runs a garbage collection. +func GC() { + gogc(2) +} + +// linker-provided +var noptrdata struct{} +var enoptrdata struct{} +var noptrbss struct{} +var enoptrbss struct{} + +// SetFinalizer sets the finalizer associated with x to f. +// When the garbage collector finds an unreachable block +// with an associated finalizer, it clears the association and runs +// f(x) in a separate goroutine. This makes x reachable again, but +// now without an associated finalizer. Assuming that SetFinalizer +// is not called again, the next time the garbage collector sees +// that x is unreachable, it will free x. +// +// SetFinalizer(x, nil) clears any finalizer associated with x. +// +// The argument x must be a pointer to an object allocated by +// calling new or by taking the address of a composite literal. +// The argument f must be a function that takes a single argument +// to which x's type can be assigned, and can have arbitrary ignored return +// values. If either of these is not true, SetFinalizer aborts the +// program. +// +// Finalizers are run in dependency order: if A points at B, both have +// finalizers, and they are otherwise unreachable, only the finalizer +// for A runs; once A is freed, the finalizer for B can run. +// If a cyclic structure includes a block with a finalizer, that +// cycle is not guaranteed to be garbage collected and the finalizer +// is not guaranteed to run, because there is no ordering that +// respects the dependencies. +// +// The finalizer for x is scheduled to run at some arbitrary time after +// x becomes unreachable. +// There is no guarantee that finalizers will run before a program exits, +// so typically they are useful only for releasing non-memory resources +// associated with an object during a long-running program. +// For example, an os.File object could use a finalizer to close the +// associated operating system file descriptor when a program discards +// an os.File without calling Close, but it would be a mistake +// to depend on a finalizer to flush an in-memory I/O buffer such as a +// bufio.Writer, because the buffer would not be flushed at program exit. +// +// It is not guaranteed that a finalizer will run if the size of *x is +// zero bytes. +// +// It is not guaranteed that a finalizer will run for objects allocated +// in initializers for package-level variables. Such objects may be +// linker-allocated, not heap-allocated. +// +// A single goroutine runs all finalizers for a program, sequentially. +// If a finalizer must run for a long time, it should do so by starting +// a new goroutine. 
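+//
+// Illustrative usage (a sketch added for exposition, not part of the
+// original documentation; openHandle and closeHandle are hypothetical
+// helpers):
+//
+//	type resource struct{ handle uintptr }
+//	r := &resource{handle: openHandle()}
+//	runtime.SetFinalizer(r, func(r *resource) { closeHandle(r.handle) })
+//
+// SetFinalizer(r, nil) would clear the association again; otherwise the
+// finalizer goroutine eventually calls the function with r after r becomes
+// unreachable.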
+func SetFinalizer(obj interface{}, finalizer interface{}) { + e := (*eface)(unsafe.Pointer(&obj)) + etyp := e._type + if etyp == nil { + gothrow("runtime.SetFinalizer: first argument is nil") + } + if etyp.kind&kindMask != kindPtr { + gothrow("runtime.SetFinalizer: first argument is " + *etyp._string + ", not pointer") + } + ot := (*ptrtype)(unsafe.Pointer(etyp)) + if ot.elem == nil { + gothrow("nil elem type!") + } + + // find the containing object + _, base, _ := findObject(e.data) + + if base == nil { + // 0-length objects are okay. + if e.data == unsafe.Pointer(&zerobase) { + return + } + + // Global initializers might be linker-allocated. + // var Foo = &Object{} + // func main() { + // runtime.SetFinalizer(Foo, nil) + // } + // The relevant segments are: noptrdata, data, bss, noptrbss. + // We cannot assume they are in any order or even contiguous, + // due to external linking. + if uintptr(unsafe.Pointer(&noptrdata)) <= uintptr(e.data) && uintptr(e.data) < uintptr(unsafe.Pointer(&enoptrdata)) || + uintptr(unsafe.Pointer(&data)) <= uintptr(e.data) && uintptr(e.data) < uintptr(unsafe.Pointer(&edata)) || + uintptr(unsafe.Pointer(&bss)) <= uintptr(e.data) && uintptr(e.data) < uintptr(unsafe.Pointer(&ebss)) || + uintptr(unsafe.Pointer(&noptrbss)) <= uintptr(e.data) && uintptr(e.data) < uintptr(unsafe.Pointer(&enoptrbss)) { + return + } + gothrow("runtime.SetFinalizer: pointer not in allocated block") + } + + if e.data != base { + // As an implementation detail we allow to set finalizers for an inner byte + // of an object if it could come from tiny alloc (see mallocgc for details). + if ot.elem == nil || ot.elem.kind&kindNoPointers == 0 || ot.elem.size >= maxTinySize { + gothrow("runtime.SetFinalizer: pointer not at beginning of allocated block") + } + } + + f := (*eface)(unsafe.Pointer(&finalizer)) + ftyp := f._type + if ftyp == nil { + // switch to M stack and remove finalizer + mp := acquirem() + mp.ptrarg[0] = e.data + onM(removeFinalizer_m) + releasem(mp) + return + } + + if ftyp.kind&kindMask != kindFunc { + gothrow("runtime.SetFinalizer: second argument is " + *ftyp._string + ", not a function") + } + ft := (*functype)(unsafe.Pointer(ftyp)) + ins := *(*[]*_type)(unsafe.Pointer(&ft.in)) + if ft.dotdotdot || len(ins) != 1 { + gothrow("runtime.SetFinalizer: cannot pass " + *etyp._string + " to finalizer " + *ftyp._string) + } + fint := ins[0] + switch { + case fint == etyp: + // ok - same type + goto okarg + case fint.kind&kindMask == kindPtr: + if (fint.x == nil || fint.x.name == nil || etyp.x == nil || etyp.x.name == nil) && (*ptrtype)(unsafe.Pointer(fint)).elem == ot.elem { + // ok - not same type, but both pointers, + // one or the other is unnamed, and same element type, so assignable. 
+ goto okarg + } + case fint.kind&kindMask == kindInterface: + ityp := (*interfacetype)(unsafe.Pointer(fint)) + if len(ityp.mhdr) == 0 { + // ok - satisfies empty interface + goto okarg + } + if _, ok := assertE2I2(ityp, obj); ok { + goto okarg + } + } + gothrow("runtime.SetFinalizer: cannot pass " + *etyp._string + " to finalizer " + *ftyp._string) +okarg: + // compute size needed for return parameters + nret := uintptr(0) + for _, t := range *(*[]*_type)(unsafe.Pointer(&ft.out)) { + nret = round(nret, uintptr(t.align)) + uintptr(t.size) + } + nret = round(nret, ptrSize) + + // make sure we have a finalizer goroutine + createfing() + + // switch to M stack to add finalizer record + mp := acquirem() + mp.ptrarg[0] = f.data + mp.ptrarg[1] = e.data + mp.scalararg[0] = nret + mp.ptrarg[2] = unsafe.Pointer(fint) + mp.ptrarg[3] = unsafe.Pointer(ot) + onM(setFinalizer_m) + if mp.scalararg[0] != 1 { + gothrow("runtime.SetFinalizer: finalizer already set") + } + releasem(mp) +} + +// round n up to a multiple of a. a must be a power of 2. +func round(n, a uintptr) uintptr { + return (n + a - 1) &^ (a - 1) +} + +// Look up pointer v in heap. Return the span containing the object, +// the start of the object, and the size of the object. If the object +// does not exist, return nil, nil, 0. +func findObject(v unsafe.Pointer) (s *mspan, x unsafe.Pointer, n uintptr) { + c := gomcache() + c.local_nlookup++ + if ptrSize == 4 && c.local_nlookup >= 1<<30 { + // purge cache stats to prevent overflow + lock(&mheap_.lock) + purgecachedstats(c) + unlock(&mheap_.lock) + } + + // find span + arena_start := uintptr(unsafe.Pointer(mheap_.arena_start)) + arena_used := uintptr(unsafe.Pointer(mheap_.arena_used)) + if uintptr(v) < arena_start || uintptr(v) >= arena_used { + return + } + p := uintptr(v) >> pageShift + q := p - arena_start>>pageShift + s = *(**mspan)(add(unsafe.Pointer(mheap_.spans), q*ptrSize)) + if s == nil { + return + } + x = unsafe.Pointer(uintptr(s.start) << pageShift) + + if uintptr(v) < uintptr(x) || uintptr(v) >= uintptr(unsafe.Pointer(s.limit)) || s.state != mSpanInUse { + s = nil + x = nil + return + } + + n = uintptr(s.elemsize) + if s.sizeclass != 0 { + x = add(x, (uintptr(v)-uintptr(x))/n*n) + } + return +} + +var fingCreate uint32 + +func createfing() { + // start the finalizer goroutine exactly once + if fingCreate == 0 && cas(&fingCreate, 0, 1) { + go runfinq() + } +} + +// This is the goroutine that runs all of the finalizers +func runfinq() { + var ( + frame unsafe.Pointer + framecap uintptr + ) + + for { + lock(&finlock) + fb := finq + finq = nil + if fb == nil { + gp := getg() + fing = gp + fingwait = true + gp.issystem = true + goparkunlock(&finlock, "finalizer wait") + gp.issystem = false + continue + } + unlock(&finlock) + if raceenabled { + racefingo() + } + for fb != nil { + for i := int32(0); i < fb.cnt; i++ { + f := (*finalizer)(add(unsafe.Pointer(&fb.fin), uintptr(i)*unsafe.Sizeof(finalizer{}))) + + framesz := unsafe.Sizeof((interface{})(nil)) + uintptr(f.nret) + if framecap < framesz { + // The frame does not contain pointers interesting for GC, + // all not yet finalized objects are stored in finq. + // If we do not mark it as FlagNoScan, + // the last finalized object is not collected. 
+ frame = mallocgc(framesz, nil, flagNoScan) + framecap = framesz + } + + if f.fint == nil { + gothrow("missing type in runfinq") + } + switch f.fint.kind & kindMask { + case kindPtr: + // direct use of pointer + *(*unsafe.Pointer)(frame) = f.arg + case kindInterface: + ityp := (*interfacetype)(unsafe.Pointer(f.fint)) + // set up with empty interface + (*eface)(frame)._type = &f.ot.typ + (*eface)(frame).data = f.arg + if len(ityp.mhdr) != 0 { + // convert to interface with methods + // this conversion is guaranteed to succeed - we checked in SetFinalizer + *(*fInterface)(frame) = assertE2I(ityp, *(*interface{})(frame)) + } + default: + gothrow("bad kind in runfinq") + } + reflectcall(unsafe.Pointer(f.fn), frame, uint32(framesz), uint32(framesz)) + + // drop finalizer queue references to finalized object + f.fn = nil + f.arg = nil + f.ot = nil + } + fb.cnt = 0 + next := fb.next + lock(&finlock) + fb.next = finc + finc = fb + unlock(&finlock) + fb = next + } + } +} + +var persistent struct { + lock mutex + pos unsafe.Pointer + end unsafe.Pointer +} + +// Wrapper around sysAlloc that can allocate small chunks. +// There is no associated free operation. +// Intended for things like function/type/debug-related persistent data. +// If align is 0, uses default align (currently 8). +func persistentalloc(size, align uintptr, stat *uint64) unsafe.Pointer { + const ( + chunk = 256 << 10 + maxBlock = 64 << 10 // VM reservation granularity is 64K on windows + ) + + if align != 0 { + if align&(align-1) != 0 { + gothrow("persistentalloc: align is not a power of 2") + } + if align > _PageSize { + gothrow("persistentalloc: align is too large") + } + } else { + align = 8 + } + + if size >= maxBlock { + return sysAlloc(size, stat) + } + + lock(&persistent.lock) + persistent.pos = roundup(persistent.pos, align) + if uintptr(persistent.pos)+size > uintptr(persistent.end) { + persistent.pos = sysAlloc(chunk, &memstats.other_sys) + if persistent.pos == nil { + unlock(&persistent.lock) + gothrow("runtime: cannot allocate memory") + } + persistent.end = add(persistent.pos, chunk) + } + p := persistent.pos + persistent.pos = add(persistent.pos, size) + unlock(&persistent.lock) + + if stat != &memstats.other_sys { + xadd64(stat, int64(size)) + xadd64(&memstats.other_sys, -int64(size)) + } + return p +} diff --git a/src/runtime/malloc.h b/src/runtime/malloc.h new file mode 100644 index 000000000..adb8d3d67 --- /dev/null +++ b/src/runtime/malloc.h @@ -0,0 +1,621 @@ +// Copyright 2009 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// Memory allocator, based on tcmalloc. +// http://goog-perftools.sourceforge.net/doc/tcmalloc.html + +// The main allocator works in runs of pages. +// Small allocation sizes (up to and including 32 kB) are +// rounded to one of about 100 size classes, each of which +// has its own free list of objects of exactly that size. +// Any free page of memory can be split into a set of objects +// of one size class, which are then managed using free list +// allocators. +// +// The allocator's data structures are: +// +// FixAlloc: a free-list allocator for fixed-size objects, +// used to manage storage used by the allocator. +// MHeap: the malloc heap, managed at page (4096-byte) granularity. +// MSpan: a run of pages managed by the MHeap. +// MCentral: a shared free list for a given size class. +// MCache: a per-thread (in Go, per-P) cache for small objects. +// MStats: allocation statistics. 
+// +// Allocating a small object proceeds up a hierarchy of caches: +// +// 1. Round the size up to one of the small size classes +// and look in the corresponding MCache free list. +// If the list is not empty, allocate an object from it. +// This can all be done without acquiring a lock. +// +// 2. If the MCache free list is empty, replenish it by +// taking a bunch of objects from the MCentral free list. +// Moving a bunch amortizes the cost of acquiring the MCentral lock. +// +// 3. If the MCentral free list is empty, replenish it by +// allocating a run of pages from the MHeap and then +// chopping that memory into a objects of the given size. +// Allocating many objects amortizes the cost of locking +// the heap. +// +// 4. If the MHeap is empty or has no page runs large enough, +// allocate a new group of pages (at least 1MB) from the +// operating system. Allocating a large run of pages +// amortizes the cost of talking to the operating system. +// +// Freeing a small object proceeds up the same hierarchy: +// +// 1. Look up the size class for the object and add it to +// the MCache free list. +// +// 2. If the MCache free list is too long or the MCache has +// too much memory, return some to the MCentral free lists. +// +// 3. If all the objects in a given span have returned to +// the MCentral list, return that span to the page heap. +// +// 4. If the heap has too much memory, return some to the +// operating system. +// +// TODO(rsc): Step 4 is not implemented. +// +// Allocating and freeing a large object uses the page heap +// directly, bypassing the MCache and MCentral free lists. +// +// The small objects on the MCache and MCentral free lists +// may or may not be zeroed. They are zeroed if and only if +// the second word of the object is zero. A span in the +// page heap is zeroed unless s->needzero is set. When a span +// is allocated to break into small objects, it is zeroed if needed +// and s->needzero is set. There are two main benefits to delaying the +// zeroing this way: +// +// 1. stack frames allocated from the small object lists +// or the page heap can avoid zeroing altogether. +// 2. the cost of zeroing when reusing a small object is +// charged to the mutator, not the garbage collector. +// +// This C code was written with an eye toward translating to Go +// in the future. Methods have the form Type_Method(Type *t, ...). + +typedef struct MCentral MCentral; +typedef struct MHeap MHeap; +typedef struct MSpan MSpan; +typedef struct MStats MStats; +typedef struct MLink MLink; +typedef struct GCStats GCStats; + +enum +{ + PageShift = 13, + PageSize = 1<<PageShift, + PageMask = PageSize - 1, +}; +typedef uintptr pageID; // address >> PageShift + +enum +{ + // Computed constant. The definition of MaxSmallSize and the + // algorithm in msize.c produce some number of different allocation + // size classes. NumSizeClasses is that number. It's needed here + // because there are static arrays of this length; when msize runs its + // size choosing algorithm it double-checks that NumSizeClasses agrees. + NumSizeClasses = 67, + + // Tunable constants. + MaxSmallSize = 32<<10, + + // Tiny allocator parameters, see "Tiny allocator" comment in malloc.goc. + TinySize = 16, + TinySizeClass = 2, + + FixAllocChunk = 16<<10, // Chunk size for FixAlloc + MaxMHeapList = 1<<(20 - PageShift), // Maximum page length for fixed-size list in MHeap. + HeapAllocChunk = 1<<20, // Chunk size for heap growth + + // Per-P, per order stack segment cache size. 
+ StackCacheSize = 32*1024, + // Number of orders that get caching. Order 0 is FixedStack + // and each successive order is twice as large. + NumStackOrders = 3, + + // Number of bits in page to span calculations (4k pages). + // On Windows 64-bit we limit the arena to 32GB or 35 bits (see below for reason). + // On other 64-bit platforms, we limit the arena to 128GB, or 37 bits. + // On 32-bit, we don't bother limiting anything, so we use the full 32-bit address. +#ifdef _64BIT +#ifdef GOOS_windows + // Windows counts memory used by page table into committed memory + // of the process, so we can't reserve too much memory. + // See http://golang.org/issue/5402 and http://golang.org/issue/5236. + MHeapMap_Bits = 35 - PageShift, +#else + MHeapMap_Bits = 37 - PageShift, +#endif +#else + MHeapMap_Bits = 32 - PageShift, +#endif + + // Max number of threads to run garbage collection. + // 2, 3, and 4 are all plausible maximums depending + // on the hardware details of the machine. The garbage + // collector scales well to 32 cpus. + MaxGcproc = 32, +}; + +// Maximum memory allocation size, a hint for callers. +// This must be a #define instead of an enum because it +// is so large. +#ifdef _64BIT +#define MaxMem (1ULL<<(MHeapMap_Bits+PageShift)) /* 128 GB or 32 GB */ +#else +#define MaxMem ((uintptr)-1) +#endif + +// A generic linked list of blocks. (Typically the block is bigger than sizeof(MLink).) +struct MLink +{ + MLink *next; +}; + +// sysAlloc obtains a large chunk of zeroed memory from the +// operating system, typically on the order of a hundred kilobytes +// or a megabyte. +// NOTE: sysAlloc returns OS-aligned memory, but the heap allocator +// may use larger alignment, so the caller must be careful to realign the +// memory obtained by sysAlloc. +// +// SysUnused notifies the operating system that the contents +// of the memory region are no longer needed and can be reused +// for other purposes. +// SysUsed notifies the operating system that the contents +// of the memory region are needed again. +// +// SysFree returns it unconditionally; this is only used if +// an out-of-memory error has been detected midway through +// an allocation. It is okay if SysFree is a no-op. +// +// SysReserve reserves address space without allocating memory. +// If the pointer passed to it is non-nil, the caller wants the +// reservation there, but SysReserve can still choose another +// location if that one is unavailable. On some systems and in some +// cases SysReserve will simply check that the address space is +// available and not actually reserve it. If SysReserve returns +// non-nil, it sets *reserved to true if the address space is +// reserved, false if it has merely been checked. +// NOTE: SysReserve returns OS-aligned memory, but the heap allocator +// may use larger alignment, so the caller must be careful to realign the +// memory obtained by sysAlloc. +// +// SysMap maps previously reserved address space for use. +// The reserved argument is true if the address space was really +// reserved, not merely checked. +// +// SysFault marks a (already sysAlloc'd) region to fault +// if accessed. Used only for debugging the runtime. 
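+// Putting the primitives above together (a rough summary for the reader,
+// not additional API): mallocinit calls SysReserve once to claim the arena
+// address space, MHeap_SysAlloc calls SysMap to back pages as the heap
+// grows, the scavenger uses SysUnused (and later SysUsed) for idle spans,
+// and SysFree only backs out of a failed allocation. The reserve/map
+// pattern, as used in malloc.c, looks roughly like:
+//
+//	p = runtime·SysReserve(hint, size, &reserved);     // address space only
+//	runtime·SysMap(p, n, reserved, &mstats.heap_sys);  // commit when needed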
+ +void* runtime·sysAlloc(uintptr nbytes, uint64 *stat); +void runtime·SysFree(void *v, uintptr nbytes, uint64 *stat); +void runtime·SysUnused(void *v, uintptr nbytes); +void runtime·SysUsed(void *v, uintptr nbytes); +void runtime·SysMap(void *v, uintptr nbytes, bool reserved, uint64 *stat); +void* runtime·SysReserve(void *v, uintptr nbytes, bool *reserved); +void runtime·SysFault(void *v, uintptr nbytes); + +// FixAlloc is a simple free-list allocator for fixed size objects. +// Malloc uses a FixAlloc wrapped around sysAlloc to manages its +// MCache and MSpan objects. +// +// Memory returned by FixAlloc_Alloc is not zeroed. +// The caller is responsible for locking around FixAlloc calls. +// Callers can keep state in the object but the first word is +// smashed by freeing and reallocating. +struct FixAlloc +{ + uintptr size; + void (*first)(void *arg, byte *p); // called first time p is returned + void* arg; + MLink* list; + byte* chunk; + uint32 nchunk; + uintptr inuse; // in-use bytes now + uint64* stat; +}; + +void runtime·FixAlloc_Init(FixAlloc *f, uintptr size, void (*first)(void*, byte*), void *arg, uint64 *stat); +void* runtime·FixAlloc_Alloc(FixAlloc *f); +void runtime·FixAlloc_Free(FixAlloc *f, void *p); + + +// Statistics. +// Shared with Go: if you edit this structure, also edit type MemStats in mem.go. +struct MStats +{ + // General statistics. + uint64 alloc; // bytes allocated and still in use + uint64 total_alloc; // bytes allocated (even if freed) + uint64 sys; // bytes obtained from system (should be sum of xxx_sys below, no locking, approximate) + uint64 nlookup; // number of pointer lookups + uint64 nmalloc; // number of mallocs + uint64 nfree; // number of frees + + // Statistics about malloc heap. + // protected by mheap.lock + uint64 heap_alloc; // bytes allocated and still in use + uint64 heap_sys; // bytes obtained from system + uint64 heap_idle; // bytes in idle spans + uint64 heap_inuse; // bytes in non-idle spans + uint64 heap_released; // bytes released to the OS + uint64 heap_objects; // total number of allocated objects + + // Statistics about allocation of low-level fixed-size structures. + // Protected by FixAlloc locks. + uint64 stacks_inuse; // this number is included in heap_inuse above + uint64 stacks_sys; // always 0 in mstats + uint64 mspan_inuse; // MSpan structures + uint64 mspan_sys; + uint64 mcache_inuse; // MCache structures + uint64 mcache_sys; + uint64 buckhash_sys; // profiling bucket hash table + uint64 gc_sys; + uint64 other_sys; + + // Statistics about garbage collector. + // Protected by mheap or stopping the world during GC. + uint64 next_gc; // next GC (in heap_alloc time) + uint64 last_gc; // last GC (in absolute time) + uint64 pause_total_ns; + uint64 pause_ns[256]; // circular buffer of recent GC pause lengths + uint64 pause_end[256]; // circular buffer of recent GC end times (nanoseconds since 1970) + uint32 numgc; + bool enablegc; + bool debuggc; + + // Statistics about allocation size classes. + + struct MStatsBySize { + uint32 size; + uint64 nmalloc; + uint64 nfree; + } by_size[NumSizeClasses]; + + uint64 tinyallocs; // number of tiny allocations that didn't cause actual allocation; not exported to Go directly +}; + + +#define mstats runtime·memstats +extern MStats mstats; +void runtime·updatememstats(GCStats *stats); +void runtime·ReadMemStats(MStats *stats); + +// Size classes. Computed and initialized by InitSizes. +// +// SizeToClass(0 <= n <= MaxSmallSize) returns the size class, +// 1 <= sizeclass < NumSizeClasses, for n. 
+// Size class 0 is reserved to mean "not small". +// +// class_to_size[i] = largest size in class i +// class_to_allocnpages[i] = number of pages to allocate when +// making new objects in class i + +int32 runtime·SizeToClass(int32); +uintptr runtime·roundupsize(uintptr); +extern int32 runtime·class_to_size[NumSizeClasses]; +extern int32 runtime·class_to_allocnpages[NumSizeClasses]; +extern int8 runtime·size_to_class8[1024/8 + 1]; +extern int8 runtime·size_to_class128[(MaxSmallSize-1024)/128 + 1]; +extern void runtime·InitSizes(void); + +typedef struct MCacheList MCacheList; +struct MCacheList +{ + MLink *list; + uint32 nlist; +}; + +typedef struct StackFreeList StackFreeList; +struct StackFreeList +{ + MLink *list; // linked list of free stacks + uintptr size; // total size of stacks in list +}; + +typedef struct SudoG SudoG; + +// Per-thread (in Go, per-P) cache for small objects. +// No locking needed because it is per-thread (per-P). +struct MCache +{ + // The following members are accessed on every malloc, + // so they are grouped here for better caching. + int32 next_sample; // trigger heap sample after allocating this many bytes + intptr local_cachealloc; // bytes allocated (or freed) from cache since last lock of heap + // Allocator cache for tiny objects w/o pointers. + // See "Tiny allocator" comment in malloc.goc. + byte* tiny; + uintptr tinysize; + uintptr local_tinyallocs; // number of tiny allocs not counted in other stats + // The rest is not accessed on every malloc. + MSpan* alloc[NumSizeClasses]; // spans to allocate from + + StackFreeList stackcache[NumStackOrders]; + + SudoG* sudogcache; + + void* gcworkbuf; + + // Local allocator stats, flushed during GC. + uintptr local_nlookup; // number of pointer lookups + uintptr local_largefree; // bytes freed for large objects (>MaxSmallSize) + uintptr local_nlargefree; // number of frees for large objects (>MaxSmallSize) + uintptr local_nsmallfree[NumSizeClasses]; // number of frees for small objects (<=MaxSmallSize) +}; + +MSpan* runtime·MCache_Refill(MCache *c, int32 sizeclass); +void runtime·MCache_ReleaseAll(MCache *c); +void runtime·stackcache_clear(MCache *c); +void runtime·gcworkbuffree(void *b); + +enum +{ + KindSpecialFinalizer = 1, + KindSpecialProfile = 2, + // Note: The finalizer special must be first because if we're freeing + // an object, a finalizer special will cause the freeing operation + // to abort, and we want to keep the other special records around + // if that happens. +}; + +typedef struct Special Special; +struct Special +{ + Special* next; // linked list in span + uint16 offset; // span offset of object + byte kind; // kind of Special +}; + +// The described object has a finalizer set for it. +typedef struct SpecialFinalizer SpecialFinalizer; +struct SpecialFinalizer +{ + Special special; + FuncVal* fn; + uintptr nret; + Type* fint; + PtrType* ot; +}; + +// The described object is being heap profiled. +typedef struct Bucket Bucket; // from mprof.h +typedef struct SpecialProfile SpecialProfile; +struct SpecialProfile +{ + Special special; + Bucket* b; +}; + +// An MSpan is a run of pages. 
+enum +{ + MSpanInUse = 0, // allocated for garbage collected heap + MSpanStack, // allocated for use by stack allocator + MSpanFree, + MSpanListHead, + MSpanDead, +}; +struct MSpan +{ + MSpan *next; // in a span linked list + MSpan *prev; // in a span linked list + pageID start; // starting page number + uintptr npages; // number of pages in span + MLink *freelist; // list of free objects + // sweep generation: + // if sweepgen == h->sweepgen - 2, the span needs sweeping + // if sweepgen == h->sweepgen - 1, the span is currently being swept + // if sweepgen == h->sweepgen, the span is swept and ready to use + // h->sweepgen is incremented by 2 after every GC + uint32 sweepgen; + uint16 ref; // capacity - number of objects in freelist + uint8 sizeclass; // size class + bool incache; // being used by an MCache + uint8 state; // MSpanInUse etc + uint8 needzero; // needs to be zeroed before allocation + uintptr elemsize; // computed from sizeclass or from npages + int64 unusedsince; // First time spotted by GC in MSpanFree state + uintptr npreleased; // number of pages released to the OS + byte *limit; // end of data in span + Mutex specialLock; // guards specials list + Special *specials; // linked list of special records sorted by offset. +}; + +void runtime·MSpan_Init(MSpan *span, pageID start, uintptr npages); +void runtime·MSpan_EnsureSwept(MSpan *span); +bool runtime·MSpan_Sweep(MSpan *span, bool preserve); + +// Every MSpan is in one doubly-linked list, +// either one of the MHeap's free lists or one of the +// MCentral's span lists. We use empty MSpan structures as list heads. +void runtime·MSpanList_Init(MSpan *list); +bool runtime·MSpanList_IsEmpty(MSpan *list); +void runtime·MSpanList_Insert(MSpan *list, MSpan *span); +void runtime·MSpanList_InsertBack(MSpan *list, MSpan *span); +void runtime·MSpanList_Remove(MSpan *span); // from whatever list it is in + + +// Central list of free objects of a given size. +struct MCentral +{ + Mutex lock; + int32 sizeclass; + MSpan nonempty; // list of spans with a free object + MSpan empty; // list of spans with no free objects (or cached in an MCache) +}; + +void runtime·MCentral_Init(MCentral *c, int32 sizeclass); +MSpan* runtime·MCentral_CacheSpan(MCentral *c); +void runtime·MCentral_UncacheSpan(MCentral *c, MSpan *s); +bool runtime·MCentral_FreeSpan(MCentral *c, MSpan *s, int32 n, MLink *start, MLink *end, bool preserve); + +// Main malloc heap. +// The heap itself is the "free[]" and "large" arrays, +// but all the other global data is here too. +struct MHeap +{ + Mutex lock; + MSpan free[MaxMHeapList]; // free lists of given length + MSpan freelarge; // free lists length >= MaxMHeapList + MSpan busy[MaxMHeapList]; // busy lists of large objects of given length + MSpan busylarge; // busy lists of large objects length >= MaxMHeapList + MSpan **allspans; // all spans out there + MSpan **gcspans; // copy of allspans referenced by GC marker or sweeper + uint32 nspan; + uint32 nspancap; + uint32 sweepgen; // sweep generation, see comment in MSpan + uint32 sweepdone; // all spans are swept + + // span lookup + MSpan** spans; + uintptr spans_mapped; + + // range of addresses we might see in the heap + byte *bitmap; + uintptr bitmap_mapped; + byte *arena_start; + byte *arena_used; + byte *arena_end; + bool arena_reserved; + + // central free lists for small size classes. + // the padding makes sure that the MCentrals are + // spaced CacheLineSize bytes apart, so that each MCentral.lock + // gets its own cache line. 
+ struct MHeapCentral { + MCentral mcentral; + byte pad[CacheLineSize]; + } central[NumSizeClasses]; + + FixAlloc spanalloc; // allocator for Span* + FixAlloc cachealloc; // allocator for MCache* + FixAlloc specialfinalizeralloc; // allocator for SpecialFinalizer* + FixAlloc specialprofilealloc; // allocator for SpecialProfile* + Mutex speciallock; // lock for sepcial record allocators. + + // Malloc stats. + uint64 largefree; // bytes freed for large objects (>MaxSmallSize) + uint64 nlargefree; // number of frees for large objects (>MaxSmallSize) + uint64 nsmallfree[NumSizeClasses]; // number of frees for small objects (<=MaxSmallSize) +}; +#define runtime·mheap runtime·mheap_ +extern MHeap runtime·mheap; + +void runtime·MHeap_Init(MHeap *h); +MSpan* runtime·MHeap_Alloc(MHeap *h, uintptr npage, int32 sizeclass, bool large, bool needzero); +MSpan* runtime·MHeap_AllocStack(MHeap *h, uintptr npage); +void runtime·MHeap_Free(MHeap *h, MSpan *s, int32 acct); +void runtime·MHeap_FreeStack(MHeap *h, MSpan *s); +MSpan* runtime·MHeap_Lookup(MHeap *h, void *v); +MSpan* runtime·MHeap_LookupMaybe(MHeap *h, void *v); +void* runtime·MHeap_SysAlloc(MHeap *h, uintptr n); +void runtime·MHeap_MapBits(MHeap *h); +void runtime·MHeap_MapSpans(MHeap *h); +void runtime·MHeap_Scavenge(int32 k, uint64 now, uint64 limit); + +void* runtime·persistentalloc(uintptr size, uintptr align, uint64 *stat); +int32 runtime·mlookup(void *v, byte **base, uintptr *size, MSpan **s); +uintptr runtime·sweepone(void); +void runtime·markspan(void *v, uintptr size, uintptr n, bool leftover); +void runtime·unmarkspan(void *v, uintptr size); +void runtime·purgecachedstats(MCache*); +void runtime·tracealloc(void*, uintptr, Type*); +void runtime·tracefree(void*, uintptr); +void runtime·tracegc(void); + +int32 runtime·gcpercent; +int32 runtime·readgogc(void); +void runtime·clearpools(void); + +enum +{ + // flags to malloc + FlagNoScan = 1<<0, // GC doesn't have to scan object + FlagNoZero = 1<<1, // don't zero memory +}; + +void runtime·mProf_Malloc(void*, uintptr); +void runtime·mProf_Free(Bucket*, uintptr, bool); +void runtime·mProf_GC(void); +void runtime·iterate_memprof(void (**callback)(Bucket*, uintptr, uintptr*, uintptr, uintptr, uintptr)); +int32 runtime·gcprocs(void); +void runtime·helpgc(int32 nproc); +void runtime·gchelper(void); +void runtime·createfing(void); +G* runtime·wakefing(void); +void runtime·getgcmask(byte*, Type*, byte**, uintptr*); + +// NOTE: Layout known to queuefinalizer. 
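+// (Explanatory note added for readability, not upstream text:) Finalizer
+// records are batched into FinBlocks; finq below is the list of blocks
+// whose finalizers are waiting to run and finc is a cache of free blocks,
+// both drained and refilled by runfinq in malloc.go.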
+typedef struct Finalizer Finalizer; +struct Finalizer +{ + FuncVal *fn; // function to call + void *arg; // ptr to object + uintptr nret; // bytes of return values from fn + Type *fint; // type of first argument of fn + PtrType *ot; // type of ptr to object +}; + +typedef struct FinBlock FinBlock; +struct FinBlock +{ + FinBlock *alllink; + FinBlock *next; + int32 cnt; + int32 cap; + Finalizer fin[1]; +}; +extern Mutex runtime·finlock; // protects the following variables +extern G* runtime·fing; +extern bool runtime·fingwait; +extern bool runtime·fingwake; +extern FinBlock *runtime·finq; // list of finalizers that are to be executed +extern FinBlock *runtime·finc; // cache of free blocks + +void runtime·setprofilebucket_m(void); + +bool runtime·addfinalizer(void*, FuncVal *fn, uintptr, Type*, PtrType*); +void runtime·removefinalizer(void*); +void runtime·queuefinalizer(byte *p, FuncVal *fn, uintptr nret, Type *fint, PtrType *ot); +bool runtime·freespecial(Special *s, void *p, uintptr size, bool freed); + +// Information from the compiler about the layout of stack frames. +struct BitVector +{ + int32 n; // # of bits + uint8 *bytedata; +}; +typedef struct StackMap StackMap; +struct StackMap +{ + int32 n; // number of bitmaps + int32 nbit; // number of bits in each bitmap + uint8 bytedata[]; // bitmaps, each starting on a 32-bit boundary +}; +// Returns pointer map data for the given stackmap index +// (the index is encoded in PCDATA_StackMapIndex). +BitVector runtime·stackmapdata(StackMap *stackmap, int32 n); + +extern BitVector runtime·gcdatamask; +extern BitVector runtime·gcbssmask; + +// defined in mgc0.go +void runtime·gc_m_ptr(Eface*); +void runtime·gc_g_ptr(Eface*); +void runtime·gc_itab_ptr(Eface*); + +void runtime·setgcpercent_m(void); + +// Value we use to mark dead pointers when GODEBUG=gcdead=1. +#define PoisonGC ((uintptr)0xf969696969696969ULL) +#define PoisonStack ((uintptr)0x6868686868686868ULL) diff --git a/src/runtime/malloc_test.go b/src/runtime/malloc_test.go new file mode 100644 index 000000000..b7795aa1d --- /dev/null +++ b/src/runtime/malloc_test.go @@ -0,0 +1,189 @@ +// Copyright 2013 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package runtime_test + +import ( + "flag" + . "runtime" + "testing" + "time" + "unsafe" +) + +func TestMemStats(t *testing.T) { + // Test that MemStats has sane values. + st := new(MemStats) + ReadMemStats(st) + + // Everything except HeapReleased and HeapIdle, because they indeed can be 0. 
+ if st.Alloc == 0 || st.TotalAlloc == 0 || st.Sys == 0 || st.Lookups == 0 || + st.Mallocs == 0 || st.Frees == 0 || st.HeapAlloc == 0 || st.HeapSys == 0 || + st.HeapInuse == 0 || st.HeapObjects == 0 || st.StackInuse == 0 || + st.StackSys == 0 || st.MSpanInuse == 0 || st.MSpanSys == 0 || st.MCacheInuse == 0 || + st.MCacheSys == 0 || st.BuckHashSys == 0 || st.GCSys == 0 || st.OtherSys == 0 || + st.NextGC == 0 || st.NumGC == 0 { + t.Fatalf("Zero value: %+v", *st) + } + + if st.Alloc > 1e10 || st.TotalAlloc > 1e11 || st.Sys > 1e10 || st.Lookups > 1e10 || + st.Mallocs > 1e10 || st.Frees > 1e10 || st.HeapAlloc > 1e10 || st.HeapSys > 1e10 || + st.HeapIdle > 1e10 || st.HeapInuse > 1e10 || st.HeapObjects > 1e10 || st.StackInuse > 1e10 || + st.StackSys > 1e10 || st.MSpanInuse > 1e10 || st.MSpanSys > 1e10 || st.MCacheInuse > 1e10 || + st.MCacheSys > 1e10 || st.BuckHashSys > 1e10 || st.GCSys > 1e10 || st.OtherSys > 1e10 || + st.NextGC > 1e10 || st.NumGC > 1e9 { + t.Fatalf("Insanely high value (overflow?): %+v", *st) + } + + if st.Sys != st.HeapSys+st.StackSys+st.MSpanSys+st.MCacheSys+ + st.BuckHashSys+st.GCSys+st.OtherSys { + t.Fatalf("Bad sys value: %+v", *st) + } + + if st.HeapIdle+st.HeapInuse != st.HeapSys { + t.Fatalf("HeapIdle(%d) + HeapInuse(%d) should be equal to HeapSys(%d), but isn't.", st.HeapIdle, st.HeapInuse, st.HeapSys) + } +} + +var mallocSink uintptr + +func BenchmarkMalloc8(b *testing.B) { + var x uintptr + for i := 0; i < b.N; i++ { + p := new(int64) + x ^= uintptr(unsafe.Pointer(p)) + } + mallocSink = x +} + +func BenchmarkMalloc16(b *testing.B) { + var x uintptr + for i := 0; i < b.N; i++ { + p := new([2]int64) + x ^= uintptr(unsafe.Pointer(p)) + } + mallocSink = x +} + +func BenchmarkMallocTypeInfo8(b *testing.B) { + var x uintptr + for i := 0; i < b.N; i++ { + p := new(struct { + p [8 / unsafe.Sizeof(uintptr(0))]*int + }) + x ^= uintptr(unsafe.Pointer(p)) + } + mallocSink = x +} + +func BenchmarkMallocTypeInfo16(b *testing.B) { + var x uintptr + for i := 0; i < b.N; i++ { + p := new(struct { + p [16 / unsafe.Sizeof(uintptr(0))]*int + }) + x ^= uintptr(unsafe.Pointer(p)) + } + mallocSink = x +} + +type LargeStruct struct { + x [16][]byte +} + +func BenchmarkMallocLargeStruct(b *testing.B) { + var x uintptr + for i := 0; i < b.N; i++ { + p := make([]LargeStruct, 2) + x ^= uintptr(unsafe.Pointer(&p[0])) + } + mallocSink = x +} + +var n = flag.Int("n", 1000, "number of goroutines") + +func BenchmarkGoroutineSelect(b *testing.B) { + quit := make(chan struct{}) + read := func(ch chan struct{}) { + for { + select { + case _, ok := <-ch: + if !ok { + return + } + case <-quit: + return + } + } + } + benchHelper(b, *n, read) +} + +func BenchmarkGoroutineBlocking(b *testing.B) { + read := func(ch chan struct{}) { + for { + if _, ok := <-ch; !ok { + return + } + } + } + benchHelper(b, *n, read) +} + +func BenchmarkGoroutineForRange(b *testing.B) { + read := func(ch chan struct{}) { + for range ch { + } + } + benchHelper(b, *n, read) +} + +func benchHelper(b *testing.B, n int, read func(chan struct{})) { + m := make([]chan struct{}, n) + for i := range m { + m[i] = make(chan struct{}, 1) + go read(m[i]) + } + b.StopTimer() + b.ResetTimer() + GC() + + for i := 0; i < b.N; i++ { + for _, ch := range m { + if ch != nil { + ch <- struct{}{} + } + } + time.Sleep(10 * time.Millisecond) + b.StartTimer() + GC() + b.StopTimer() + } + + for _, ch := range m { + close(ch) + } + time.Sleep(10 * time.Millisecond) +} + +func BenchmarkGoroutineIdle(b *testing.B) { + quit := make(chan struct{}) + fn := 
func() { + <-quit + } + for i := 0; i < *n; i++ { + go fn() + } + + GC() + b.ResetTimer() + + for i := 0; i < b.N; i++ { + GC() + } + + b.StopTimer() + close(quit) + time.Sleep(10 * time.Millisecond) +} diff --git a/src/runtime/map_test.go b/src/runtime/map_test.go new file mode 100644 index 000000000..92da2d820 --- /dev/null +++ b/src/runtime/map_test.go @@ -0,0 +1,537 @@ +// Copyright 2013 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package runtime_test + +import ( + "fmt" + "math" + "reflect" + "runtime" + "sort" + "strings" + "sync" + "testing" +) + +// negative zero is a good test because: +// 1) 0 and -0 are equal, yet have distinct representations. +// 2) 0 is represented as all zeros, -0 isn't. +// I'm not sure the language spec actually requires this behavior, +// but it's what the current map implementation does. +func TestNegativeZero(t *testing.T) { + m := make(map[float64]bool, 0) + + m[+0.0] = true + m[math.Copysign(0.0, -1.0)] = true // should overwrite +0 entry + + if len(m) != 1 { + t.Error("length wrong") + } + + for k := range m { + if math.Copysign(1.0, k) > 0 { + t.Error("wrong sign") + } + } + + m = make(map[float64]bool, 0) + m[math.Copysign(0.0, -1.0)] = true + m[+0.0] = true // should overwrite -0.0 entry + + if len(m) != 1 { + t.Error("length wrong") + } + + for k := range m { + if math.Copysign(1.0, k) < 0 { + t.Error("wrong sign") + } + } +} + +// nan is a good test because nan != nan, and nan has +// a randomized hash value. +func TestNan(t *testing.T) { + m := make(map[float64]int, 0) + nan := math.NaN() + m[nan] = 1 + m[nan] = 2 + m[nan] = 4 + if len(m) != 3 { + t.Error("length wrong") + } + s := 0 + for k, v := range m { + if k == k { + t.Error("nan disappeared") + } + if (v & (v - 1)) != 0 { + t.Error("value wrong") + } + s |= v + } + if s != 7 { + t.Error("values wrong") + } +} + +// Maps aren't actually copied on assignment. +func TestAlias(t *testing.T) { + m := make(map[int]int, 0) + m[0] = 5 + n := m + n[0] = 6 + if m[0] != 6 { + t.Error("alias didn't work") + } +} + +func TestGrowWithNaN(t *testing.T) { + m := make(map[float64]int, 4) + nan := math.NaN() + m[nan] = 1 + m[nan] = 2 + m[nan] = 4 + cnt := 0 + s := 0 + growflag := true + for k, v := range m { + if growflag { + // force a hashtable resize + for i := 0; i < 100; i++ { + m[float64(i)] = i + } + growflag = false + } + if k != k { + cnt++ + s |= v + } + } + if cnt != 3 { + t.Error("NaN keys lost during grow") + } + if s != 7 { + t.Error("NaN values lost during grow") + } +} + +type FloatInt struct { + x float64 + y int +} + +func TestGrowWithNegativeZero(t *testing.T) { + negzero := math.Copysign(0.0, -1.0) + m := make(map[FloatInt]int, 4) + m[FloatInt{0.0, 0}] = 1 + m[FloatInt{0.0, 1}] = 2 + m[FloatInt{0.0, 2}] = 4 + m[FloatInt{0.0, 3}] = 8 + growflag := true + s := 0 + cnt := 0 + negcnt := 0 + // The first iteration should return the +0 key. + // The subsequent iterations should return the -0 key. + // I'm not really sure this is required by the spec, + // but it makes sense. + // TODO: are we allowed to get the first entry returned again??? 
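The NaN behavior these map tests rely on is visible from ordinary user code as well; a small illustrative sketch (not part of the test file): because NaN != NaN, every insertion under a NaN key creates a new, distinct entry, and no lookup can ever find one again:

    package main

    import (
        "fmt"
        "math"
    )

    func main() {
        m := map[float64]int{}
        nan := math.NaN()
        m[nan] = 1
        m[nan] = 2          // a second, distinct entry: NaN != NaN
        fmt.Println(len(m)) // 2
        v, ok := m[nan]
        fmt.Println(v, ok)  // 0 false: NaN keys are unreachable by lookup
    }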
+ for k, v := range m { + if v == 0 { + continue + } // ignore entries added to grow table + cnt++ + if math.Copysign(1.0, k.x) < 0 { + if v&16 == 0 { + t.Error("key/value not updated together 1") + } + negcnt++ + s |= v & 15 + } else { + if v&16 == 16 { + t.Error("key/value not updated together 2", k, v) + } + s |= v + } + if growflag { + // force a hashtable resize + for i := 0; i < 100; i++ { + m[FloatInt{3.0, i}] = 0 + } + // then change all the entries + // to negative zero + m[FloatInt{negzero, 0}] = 1 | 16 + m[FloatInt{negzero, 1}] = 2 | 16 + m[FloatInt{negzero, 2}] = 4 | 16 + m[FloatInt{negzero, 3}] = 8 | 16 + growflag = false + } + } + if s != 15 { + t.Error("entry missing", s) + } + if cnt != 4 { + t.Error("wrong number of entries returned by iterator", cnt) + } + if negcnt != 3 { + t.Error("update to negzero missed by iteration", negcnt) + } +} + +func TestIterGrowAndDelete(t *testing.T) { + m := make(map[int]int, 4) + for i := 0; i < 100; i++ { + m[i] = i + } + growflag := true + for k := range m { + if growflag { + // grow the table + for i := 100; i < 1000; i++ { + m[i] = i + } + // delete all odd keys + for i := 1; i < 1000; i += 2 { + delete(m, i) + } + growflag = false + } else { + if k&1 == 1 { + t.Error("odd value returned") + } + } + } +} + +// make sure old bucket arrays don't get GCd while +// an iterator is still using them. +func TestIterGrowWithGC(t *testing.T) { + m := make(map[int]int, 4) + for i := 0; i < 16; i++ { + m[i] = i + } + growflag := true + bitmask := 0 + for k := range m { + if k < 16 { + bitmask |= 1 << uint(k) + } + if growflag { + // grow the table + for i := 100; i < 1000; i++ { + m[i] = i + } + // trigger a gc + runtime.GC() + growflag = false + } + } + if bitmask != 1<<16-1 { + t.Error("missing key", bitmask) + } +} + +func testConcurrentReadsAfterGrowth(t *testing.T, useReflect bool) { + if runtime.GOMAXPROCS(-1) == 1 { + defer runtime.GOMAXPROCS(runtime.GOMAXPROCS(16)) + } + numLoop := 10 + numGrowStep := 250 + numReader := 16 + if testing.Short() { + numLoop, numGrowStep = 2, 500 + } + for i := 0; i < numLoop; i++ { + m := make(map[int]int, 0) + for gs := 0; gs < numGrowStep; gs++ { + m[gs] = gs + var wg sync.WaitGroup + wg.Add(numReader * 2) + for nr := 0; nr < numReader; nr++ { + go func() { + defer wg.Done() + for range m { + } + }() + go func() { + defer wg.Done() + for key := 0; key < gs; key++ { + _ = m[key] + } + }() + if useReflect { + wg.Add(1) + go func() { + defer wg.Done() + mv := reflect.ValueOf(m) + keys := mv.MapKeys() + for _, k := range keys { + mv.MapIndex(k) + } + }() + } + } + wg.Wait() + } + } +} + +func TestConcurrentReadsAfterGrowth(t *testing.T) { + testConcurrentReadsAfterGrowth(t, false) +} + +func TestConcurrentReadsAfterGrowthReflect(t *testing.T) { + testConcurrentReadsAfterGrowth(t, true) +} + +func TestBigItems(t *testing.T) { + var key [256]string + for i := 0; i < 256; i++ { + key[i] = "foo" + } + m := make(map[[256]string][256]string, 4) + for i := 0; i < 100; i++ { + key[37] = fmt.Sprintf("string%02d", i) + m[key] = key + } + var keys [100]string + var values [100]string + i := 0 + for k, v := range m { + keys[i] = k[37] + values[i] = v[37] + i++ + } + sort.Strings(keys[:]) + sort.Strings(values[:]) + for i := 0; i < 100; i++ { + if keys[i] != fmt.Sprintf("string%02d", i) { + t.Errorf("#%d: missing key: %v", i, keys[i]) + } + if values[i] != fmt.Sprintf("string%02d", i) { + t.Errorf("#%d: missing value: %v", i, values[i]) + } + } +} + +type empty struct { +} + +func TestEmptyKeyAndValue(t *testing.T) { + a 
:= make(map[int]empty, 4) + b := make(map[empty]int, 4) + c := make(map[empty]empty, 4) + a[0] = empty{} + b[empty{}] = 0 + b[empty{}] = 1 + c[empty{}] = empty{} + + if len(a) != 1 { + t.Errorf("empty value insert problem") + } + if b[empty{}] != 1 { + t.Errorf("empty key returned wrong value") + } +} + +// Tests a map with a single bucket, with same-lengthed short keys +// ("quick keys") as well as long keys. +func TestSingleBucketMapStringKeys_DupLen(t *testing.T) { + testMapLookups(t, map[string]string{ + "x": "x1val", + "xx": "x2val", + "foo": "fooval", + "bar": "barval", // same key length as "foo" + "xxxx": "x4val", + strings.Repeat("x", 128): "longval1", + strings.Repeat("y", 128): "longval2", + }) +} + +// Tests a map with a single bucket, with all keys having different lengths. +func TestSingleBucketMapStringKeys_NoDupLen(t *testing.T) { + testMapLookups(t, map[string]string{ + "x": "x1val", + "xx": "x2val", + "foo": "fooval", + "xxxx": "x4val", + "xxxxx": "x5val", + "xxxxxx": "x6val", + strings.Repeat("x", 128): "longval", + }) +} + +func testMapLookups(t *testing.T, m map[string]string) { + for k, v := range m { + if m[k] != v { + t.Fatalf("m[%q] = %q; want %q", k, m[k], v) + } + } +} + +// Tests whether the iterator returns the right elements when +// started in the middle of a grow, when the keys are NaNs. +func TestMapNanGrowIterator(t *testing.T) { + m := make(map[float64]int) + nan := math.NaN() + const nBuckets = 16 + // To fill nBuckets buckets takes LOAD * nBuckets keys. + nKeys := int(nBuckets * *runtime.HashLoad) + + // Get map to full point with nan keys. + for i := 0; i < nKeys; i++ { + m[nan] = i + } + // Trigger grow + m[1.0] = 1 + delete(m, 1.0) + + // Run iterator + found := make(map[int]struct{}) + for _, v := range m { + if v != -1 { + if _, repeat := found[v]; repeat { + t.Fatalf("repeat of value %d", v) + } + found[v] = struct{}{} + } + if len(found) == nKeys/2 { + // Halfway through iteration, finish grow. + for i := 0; i < nBuckets; i++ { + delete(m, 1.0) + } + } + } + if len(found) != nKeys { + t.Fatalf("missing value") + } +} + +func TestMapIterOrder(t *testing.T) { + for _, n := range [...]int{3, 7, 9, 15} { + for i := 0; i < 1000; i++ { + // Make m be {0: true, 1: true, ..., n-1: true}. + m := make(map[int]bool) + for i := 0; i < n; i++ { + m[i] = true + } + // Check that iterating over the map produces at least two different orderings. + ord := func() []int { + var s []int + for key := range m { + s = append(s, key) + } + return s + } + first := ord() + ok := false + for try := 0; try < 100; try++ { + if !reflect.DeepEqual(first, ord()) { + ok = true + break + } + } + if !ok { + t.Errorf("Map with n=%d elements had consistent iteration order: %v", n, first) + break + } + } + } +} + +// Issue 8410 +func TestMapSparseIterOrder(t *testing.T) { + // Run several rounds to increase the probability + // of failure. One is not enough. +NextRound: + for round := 0; round < 10; round++ { + m := make(map[int]bool) + // Add 1000 items, remove 980. + for i := 0; i < 1000; i++ { + m[i] = true + } + for i := 20; i < 1000; i++ { + delete(m, i) + } + + var first []int + for i := range m { + first = append(first, i) + } + + // 800 chances to get a different iteration order. + // See bug 8736 for why we need so many tries. + for n := 0; n < 800; n++ { + idx := 0 + for i := range m { + if i != first[idx] { + // iteration order changed. 
+ continue NextRound + } + idx++ + } + } + t.Fatalf("constant iteration order on round %d: %v", round, first) + } +} + +func TestMapStringBytesLookup(t *testing.T) { + // Use large string keys to avoid small-allocation coalescing, + // which can cause AllocsPerRun to report lower counts than it should. + m := map[string]int{ + "1000000000000000000000000000000000000000000000000": 1, + "2000000000000000000000000000000000000000000000000": 2, + } + buf := []byte("1000000000000000000000000000000000000000000000000") + if x := m[string(buf)]; x != 1 { + t.Errorf(`m[string([]byte("1"))] = %d, want 1`, x) + } + buf[0] = '2' + if x := m[string(buf)]; x != 2 { + t.Errorf(`m[string([]byte("2"))] = %d, want 2`, x) + } + + var x int + n := testing.AllocsPerRun(100, func() { + x += m[string(buf)] + }) + if n != 0 { + t.Errorf("AllocsPerRun for m[string(buf)] = %v, want 0", n) + } + + x = 0 + n = testing.AllocsPerRun(100, func() { + y, ok := m[string(buf)] + if !ok { + panic("!ok") + } + x += y + }) + if n != 0 { + t.Errorf("AllocsPerRun for x,ok = m[string(buf)] = %v, want 0", n) + } +} + +func benchmarkMapPop(b *testing.B, n int) { + m := map[int]int{} + for i := 0; i < b.N; i++ { + for j := 0; j < n; j++ { + m[j] = j + } + for j := 0; j < n; j++ { + // Use iterator to pop an element. + // We want this to be fast, see issue 8412. + for k := range m { + delete(m, k) + break + } + } + } +} + +func BenchmarkMapPop100(b *testing.B) { benchmarkMapPop(b, 100) } +func BenchmarkMapPop1000(b *testing.B) { benchmarkMapPop(b, 1000) } +func BenchmarkMapPop10000(b *testing.B) { benchmarkMapPop(b, 10000) } diff --git a/src/runtime/mapspeed_test.go b/src/runtime/mapspeed_test.go new file mode 100644 index 000000000..119eb3f39 --- /dev/null +++ b/src/runtime/mapspeed_test.go @@ -0,0 +1,300 @@ +// Copyright 2013 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. 
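The zero-allocation lookups asserted in TestMapStringBytesLookup above depend on the compiler recognizing the m[string(b)] pattern; a minimal sketch of that pattern in ordinary code (assuming the gc compiler's optimization, as the test itself does):

    package main

    import "fmt"

    func main() {
        m := map[string]int{"key": 1}
        b := []byte("key")
        // Converting inside the index expression lets the compiler look up the
        // map without allocating a temporary string for the conversion.
        fmt.Println(m[string(b)])
    }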
+package runtime_test + +import ( + "fmt" + "strings" + "testing" +) + +const size = 10 + +func BenchmarkHashStringSpeed(b *testing.B) { + strings := make([]string, size) + for i := 0; i < size; i++ { + strings[i] = fmt.Sprintf("string#%d", i) + } + sum := 0 + m := make(map[string]int, size) + for i := 0; i < size; i++ { + m[strings[i]] = 0 + } + idx := 0 + b.ResetTimer() + for i := 0; i < b.N; i++ { + sum += m[strings[idx]] + idx++ + if idx == size { + idx = 0 + } + } +} + +type chunk [17]byte + +func BenchmarkHashBytesSpeed(b *testing.B) { + // a bunch of chunks, each with a different alignment mod 16 + var chunks [size]chunk + // initialize each to a different value + for i := 0; i < size; i++ { + chunks[i][0] = byte(i) + } + // put into a map + m := make(map[chunk]int, size) + for i, c := range chunks { + m[c] = i + } + idx := 0 + b.ResetTimer() + for i := 0; i < b.N; i++ { + if m[chunks[idx]] != idx { + b.Error("bad map entry for chunk") + } + idx++ + if idx == size { + idx = 0 + } + } +} + +func BenchmarkHashInt32Speed(b *testing.B) { + ints := make([]int32, size) + for i := 0; i < size; i++ { + ints[i] = int32(i) + } + sum := 0 + m := make(map[int32]int, size) + for i := 0; i < size; i++ { + m[ints[i]] = 0 + } + idx := 0 + b.ResetTimer() + for i := 0; i < b.N; i++ { + sum += m[ints[idx]] + idx++ + if idx == size { + idx = 0 + } + } +} + +func BenchmarkHashInt64Speed(b *testing.B) { + ints := make([]int64, size) + for i := 0; i < size; i++ { + ints[i] = int64(i) + } + sum := 0 + m := make(map[int64]int, size) + for i := 0; i < size; i++ { + m[ints[i]] = 0 + } + idx := 0 + b.ResetTimer() + for i := 0; i < b.N; i++ { + sum += m[ints[idx]] + idx++ + if idx == size { + idx = 0 + } + } +} +func BenchmarkHashStringArraySpeed(b *testing.B) { + stringpairs := make([][2]string, size) + for i := 0; i < size; i++ { + for j := 0; j < 2; j++ { + stringpairs[i][j] = fmt.Sprintf("string#%d/%d", i, j) + } + } + sum := 0 + m := make(map[[2]string]int, size) + for i := 0; i < size; i++ { + m[stringpairs[i]] = 0 + } + idx := 0 + b.ResetTimer() + for i := 0; i < b.N; i++ { + sum += m[stringpairs[idx]] + idx++ + if idx == size { + idx = 0 + } + } +} + +func BenchmarkMegMap(b *testing.B) { + m := make(map[string]bool) + for suffix := 'A'; suffix <= 'G'; suffix++ { + m[strings.Repeat("X", 1<<20-1)+fmt.Sprint(suffix)] = true + } + key := strings.Repeat("X", 1<<20-1) + "k" + b.ResetTimer() + for i := 0; i < b.N; i++ { + _, _ = m[key] + } +} + +func BenchmarkMegOneMap(b *testing.B) { + m := make(map[string]bool) + m[strings.Repeat("X", 1<<20)] = true + key := strings.Repeat("Y", 1<<20) + b.ResetTimer() + for i := 0; i < b.N; i++ { + _, _ = m[key] + } +} + +func BenchmarkMegEqMap(b *testing.B) { + m := make(map[string]bool) + key1 := strings.Repeat("X", 1<<20) + key2 := strings.Repeat("X", 1<<20) // equal but different instance + m[key1] = true + b.ResetTimer() + for i := 0; i < b.N; i++ { + _, _ = m[key2] + } +} + +func BenchmarkMegEmptyMap(b *testing.B) { + m := make(map[string]bool) + key := strings.Repeat("X", 1<<20) + b.ResetTimer() + for i := 0; i < b.N; i++ { + _, _ = m[key] + } +} + +func BenchmarkSmallStrMap(b *testing.B) { + m := make(map[string]bool) + for suffix := 'A'; suffix <= 'G'; suffix++ { + m[fmt.Sprint(suffix)] = true + } + key := "k" + b.ResetTimer() + for i := 0; i < b.N; i++ { + _, _ = m[key] + } +} + +func BenchmarkMapStringKeysEight_16(b *testing.B) { benchmarkMapStringKeysEight(b, 16) } +func BenchmarkMapStringKeysEight_32(b *testing.B) { benchmarkMapStringKeysEight(b, 32) } +func 
BenchmarkMapStringKeysEight_64(b *testing.B) { benchmarkMapStringKeysEight(b, 64) } +func BenchmarkMapStringKeysEight_1M(b *testing.B) { benchmarkMapStringKeysEight(b, 1<<20) } + +func benchmarkMapStringKeysEight(b *testing.B, keySize int) { + m := make(map[string]bool) + for i := 0; i < 8; i++ { + m[strings.Repeat("K", i+1)] = true + } + key := strings.Repeat("K", keySize) + b.ResetTimer() + for i := 0; i < b.N; i++ { + _ = m[key] + } +} + +func BenchmarkIntMap(b *testing.B) { + m := make(map[int]bool) + for i := 0; i < 8; i++ { + m[i] = true + } + b.ResetTimer() + for i := 0; i < b.N; i++ { + _, _ = m[7] + } +} + +// Accessing the same keys in a row. +func benchmarkRepeatedLookup(b *testing.B, lookupKeySize int) { + m := make(map[string]bool) + // At least bigger than a single bucket: + for i := 0; i < 64; i++ { + m[fmt.Sprintf("some key %d", i)] = true + } + base := strings.Repeat("x", lookupKeySize-1) + key1 := base + "1" + key2 := base + "2" + b.ResetTimer() + for i := 0; i < b.N/4; i++ { + _ = m[key1] + _ = m[key1] + _ = m[key2] + _ = m[key2] + } +} + +func BenchmarkRepeatedLookupStrMapKey32(b *testing.B) { benchmarkRepeatedLookup(b, 32) } +func BenchmarkRepeatedLookupStrMapKey1M(b *testing.B) { benchmarkRepeatedLookup(b, 1<<20) } + +func BenchmarkNewEmptyMap(b *testing.B) { + b.ReportAllocs() + for i := 0; i < b.N; i++ { + _ = make(map[int]int) + } +} + +func BenchmarkMapIter(b *testing.B) { + m := make(map[int]bool) + for i := 0; i < 8; i++ { + m[i] = true + } + b.ResetTimer() + for i := 0; i < b.N; i++ { + for range m { + } + } +} + +func BenchmarkMapIterEmpty(b *testing.B) { + m := make(map[int]bool) + b.ResetTimer() + for i := 0; i < b.N; i++ { + for range m { + } + } +} + +func BenchmarkSameLengthMap(b *testing.B) { + // long strings, same length, differ in first few + // and last few bytes. + m := make(map[string]bool) + s1 := "foo" + strings.Repeat("-", 100) + "bar" + s2 := "goo" + strings.Repeat("-", 100) + "ber" + m[s1] = true + m[s2] = true + b.ResetTimer() + for i := 0; i < b.N; i++ { + _ = m[s1] + } +} + +type BigKey [3]int64 + +func BenchmarkBigKeyMap(b *testing.B) { + m := make(map[BigKey]bool) + k := BigKey{3, 4, 5} + m[k] = true + for i := 0; i < b.N; i++ { + _ = m[k] + } +} + +type BigVal [3]int64 + +func BenchmarkBigValMap(b *testing.B) { + m := make(map[BigKey]BigVal) + k := BigKey{3, 4, 5} + m[k] = BigVal{6, 7, 8} + for i := 0; i < b.N; i++ { + _ = m[k] + } +} + +func BenchmarkSmallKeyMap(b *testing.B) { + m := make(map[int16]bool) + m[5] = true + for i := 0; i < b.N; i++ { + _ = m[5] + } +} diff --git a/src/runtime/mcache.c b/src/runtime/mcache.c new file mode 100644 index 000000000..5fdbe3266 --- /dev/null +++ b/src/runtime/mcache.c @@ -0,0 +1,115 @@ +// Copyright 2009 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// Per-P malloc cache for small objects. +// +// See malloc.h for an overview. + +#include "runtime.h" +#include "arch_GOARCH.h" +#include "malloc.h" + +extern volatile intgo runtime·MemProfileRate; + +// dummy MSpan that contains no free objects. +MSpan runtime·emptymspan; + +MCache* +runtime·allocmcache(void) +{ + intgo rate; + MCache *c; + int32 i; + + runtime·lock(&runtime·mheap.lock); + c = runtime·FixAlloc_Alloc(&runtime·mheap.cachealloc); + runtime·unlock(&runtime·mheap.lock); + runtime·memclr((byte*)c, sizeof(*c)); + for(i = 0; i < NumSizeClasses; i++) + c->alloc[i] = &runtime·emptymspan; + + // Set first allocation sample size. 
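The sampling rate consulted just below is the exported runtime.MemProfileRate; a tiny sketch (outside the runtime) of the knob it reflects: on average one allocation per MemProfileRate bytes is recorded in the memory profile.

    package main

    import (
        "fmt"
        "runtime"
    )

    func main() {
        // Set early, before the allocations of interest, so sampling is uniform.
        runtime.MemProfileRate = 64 * 1024
        fmt.Println("profiling roughly one allocation per", runtime.MemProfileRate, "bytes")
    }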
+ rate = runtime·MemProfileRate; + if(rate > 0x3fffffff) // make 2*rate not overflow + rate = 0x3fffffff; + if(rate != 0) + c->next_sample = runtime·fastrand1() % (2*rate); + + return c; +} + +static void +freemcache(MCache *c) +{ + runtime·MCache_ReleaseAll(c); + runtime·stackcache_clear(c); + runtime·gcworkbuffree(c->gcworkbuf); + runtime·lock(&runtime·mheap.lock); + runtime·purgecachedstats(c); + runtime·FixAlloc_Free(&runtime·mheap.cachealloc, c); + runtime·unlock(&runtime·mheap.lock); +} + +static void +freemcache_m(void) +{ + MCache *c; + + c = g->m->ptrarg[0]; + g->m->ptrarg[0] = nil; + freemcache(c); +} + +void +runtime·freemcache(MCache *c) +{ + void (*fn)(void); + + g->m->ptrarg[0] = c; + fn = freemcache_m; + runtime·onM(&fn); +} + +// Gets a span that has a free object in it and assigns it +// to be the cached span for the given sizeclass. Returns this span. +MSpan* +runtime·MCache_Refill(MCache *c, int32 sizeclass) +{ + MSpan *s; + + g->m->locks++; + // Return the current cached span to the central lists. + s = c->alloc[sizeclass]; + if(s->freelist != nil) + runtime·throw("refill on a nonempty span"); + if(s != &runtime·emptymspan) + s->incache = false; + + // Get a new cached span from the central lists. + s = runtime·MCentral_CacheSpan(&runtime·mheap.central[sizeclass].mcentral); + if(s == nil) + runtime·throw("out of memory"); + if(s->freelist == nil) { + runtime·printf("%d %d\n", s->ref, (int32)((s->npages << PageShift) / s->elemsize)); + runtime·throw("empty span"); + } + c->alloc[sizeclass] = s; + g->m->locks--; + return s; +} + +void +runtime·MCache_ReleaseAll(MCache *c) +{ + int32 i; + MSpan *s; + + for(i=0; i<NumSizeClasses; i++) { + s = c->alloc[i]; + if(s != &runtime·emptymspan) { + runtime·MCentral_UncacheSpan(&runtime·mheap.central[i].mcentral, s); + c->alloc[i] = &runtime·emptymspan; + } + } +} diff --git a/src/runtime/mcentral.c b/src/runtime/mcentral.c new file mode 100644 index 000000000..fe6bcfeb1 --- /dev/null +++ b/src/runtime/mcentral.c @@ -0,0 +1,214 @@ +// Copyright 2009 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// Central free lists. +// +// See malloc.h for an overview. +// +// The MCentral doesn't actually contain the list of free objects; the MSpan does. +// Each MCentral is two lists of MSpans: those with free objects (c->nonempty) +// and those that are completely allocated (c->empty). + +#include "runtime.h" +#include "arch_GOARCH.h" +#include "malloc.h" + +static MSpan* MCentral_Grow(MCentral *c); + +// Initialize a single central free list. +void +runtime·MCentral_Init(MCentral *c, int32 sizeclass) +{ + c->sizeclass = sizeclass; + runtime·MSpanList_Init(&c->nonempty); + runtime·MSpanList_Init(&c->empty); +} + +// Allocate a span to use in an MCache. 
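Schematically, the two-list structure described in the mcentral.c header comment above can be pictured as below; the Go types are purely illustrative stand-ins, not the runtime's own:

    package main

    import "fmt"

    // span and central are illustrative stand-ins for MSpan and MCentral.
    type span struct{ free int } // free objects remaining in the span

    type central struct {
        nonempty []*span // spans with at least one free object
        empty    []*span // fully allocated spans, and spans handed to an MCache
    }

    // cacheSpan models the hand-off performed by MCentral_CacheSpan below:
    // a span with free space moves to the empty list while an MCache uses it.
    func (c *central) cacheSpan() *span {
        if len(c.nonempty) == 0 {
            return nil // the real code would grow the central list from the heap
        }
        s := c.nonempty[0]
        c.nonempty = c.nonempty[1:]
        c.empty = append(c.empty, s)
        return s
    }

    func main() {
        c := &central{nonempty: []*span{{free: 3}}}
        fmt.Println(c.cacheSpan() != nil, len(c.nonempty), len(c.empty)) // true 0 1
    }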
+MSpan* +runtime·MCentral_CacheSpan(MCentral *c) +{ + MSpan *s; + int32 cap, n; + uint32 sg; + + runtime·lock(&c->lock); + sg = runtime·mheap.sweepgen; +retry: + for(s = c->nonempty.next; s != &c->nonempty; s = s->next) { + if(s->sweepgen == sg-2 && runtime·cas(&s->sweepgen, sg-2, sg-1)) { + runtime·MSpanList_Remove(s); + runtime·MSpanList_InsertBack(&c->empty, s); + runtime·unlock(&c->lock); + runtime·MSpan_Sweep(s, true); + goto havespan; + } + if(s->sweepgen == sg-1) { + // the span is being swept by background sweeper, skip + continue; + } + // we have a nonempty span that does not require sweeping, allocate from it + runtime·MSpanList_Remove(s); + runtime·MSpanList_InsertBack(&c->empty, s); + runtime·unlock(&c->lock); + goto havespan; + } + + for(s = c->empty.next; s != &c->empty; s = s->next) { + if(s->sweepgen == sg-2 && runtime·cas(&s->sweepgen, sg-2, sg-1)) { + // we have an empty span that requires sweeping, + // sweep it and see if we can free some space in it + runtime·MSpanList_Remove(s); + // swept spans are at the end of the list + runtime·MSpanList_InsertBack(&c->empty, s); + runtime·unlock(&c->lock); + runtime·MSpan_Sweep(s, true); + if(s->freelist != nil) + goto havespan; + runtime·lock(&c->lock); + // the span is still empty after sweep + // it is already in the empty list, so just retry + goto retry; + } + if(s->sweepgen == sg-1) { + // the span is being swept by background sweeper, skip + continue; + } + // already swept empty span, + // all subsequent ones must also be either swept or in process of sweeping + break; + } + runtime·unlock(&c->lock); + + // Replenish central list if empty. + s = MCentral_Grow(c); + if(s == nil) + return nil; + runtime·lock(&c->lock); + runtime·MSpanList_InsertBack(&c->empty, s); + runtime·unlock(&c->lock); + +havespan: + // At this point s is a non-empty span, queued at the end of the empty list, + // c is unlocked. + cap = (s->npages << PageShift) / s->elemsize; + n = cap - s->ref; + if(n == 0) + runtime·throw("empty span"); + if(s->freelist == nil) + runtime·throw("freelist empty"); + s->incache = true; + return s; +} + +// Return span from an MCache. +void +runtime·MCentral_UncacheSpan(MCentral *c, MSpan *s) +{ + int32 cap, n; + + runtime·lock(&c->lock); + + s->incache = false; + + if(s->ref == 0) + runtime·throw("uncaching full span"); + + cap = (s->npages << PageShift) / s->elemsize; + n = cap - s->ref; + if(n > 0) { + runtime·MSpanList_Remove(s); + runtime·MSpanList_Insert(&c->nonempty, s); + } + runtime·unlock(&c->lock); +} + +// Free n objects from a span s back into the central free list c. +// Called during sweep. +// Returns true if the span was returned to heap. Sets sweepgen to +// the latest generation. +// If preserve=true, don't return the span to heap nor relink in MCentral lists; +// caller takes care of it. +bool +runtime·MCentral_FreeSpan(MCentral *c, MSpan *s, int32 n, MLink *start, MLink *end, bool preserve) +{ + bool wasempty; + + if(s->incache) + runtime·throw("freespan into cached span"); + + // Add the objects back to s's free list. + wasempty = s->freelist == nil; + end->next = s->freelist; + s->freelist = start; + s->ref -= n; + + if(preserve) { + // preserve is set only when called from MCentral_CacheSpan above, + // the span must be in the empty list. + if(s->next == nil) + runtime·throw("can't preserve unlinked span"); + runtime·atomicstore(&s->sweepgen, runtime·mheap.sweepgen); + return false; + } + + runtime·lock(&c->lock); + + // Move to nonempty if necessary. 
+ if(wasempty) { + runtime·MSpanList_Remove(s); + runtime·MSpanList_Insert(&c->nonempty, s); + } + + // delay updating sweepgen until here. This is the signal that + // the span may be used in an MCache, so it must come after the + // linked list operations above (actually, just after the + // lock of c above.) + runtime·atomicstore(&s->sweepgen, runtime·mheap.sweepgen); + + if(s->ref != 0) { + runtime·unlock(&c->lock); + return false; + } + + // s is completely freed, return it to the heap. + runtime·MSpanList_Remove(s); + s->needzero = 1; + s->freelist = nil; + runtime·unlock(&c->lock); + runtime·unmarkspan((byte*)(s->start<<PageShift), s->npages<<PageShift); + runtime·MHeap_Free(&runtime·mheap, s, 0); + return true; +} + +// Fetch a new span from the heap and carve into objects for the free list. +static MSpan* +MCentral_Grow(MCentral *c) +{ + uintptr size, npages, i, n; + MLink **tailp, *v; + byte *p; + MSpan *s; + + npages = runtime·class_to_allocnpages[c->sizeclass]; + size = runtime·class_to_size[c->sizeclass]; + n = (npages << PageShift) / size; + s = runtime·MHeap_Alloc(&runtime·mheap, npages, c->sizeclass, 0, 1); + if(s == nil) + return nil; + + // Carve span into sequence of blocks. + tailp = &s->freelist; + p = (byte*)(s->start << PageShift); + s->limit = p + size*n; + for(i=0; i<n; i++) { + v = (MLink*)p; + *tailp = v; + tailp = &v->next; + p += size; + } + *tailp = nil; + runtime·markspan((byte*)(s->start<<PageShift), size, n, size*n < (s->npages<<PageShift)); + return s; +} diff --git a/src/runtime/mem.go b/src/runtime/mem.go new file mode 100644 index 000000000..e6f1eb0e6 --- /dev/null +++ b/src/runtime/mem.go @@ -0,0 +1,108 @@ +// Copyright 2009 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package runtime + +import "unsafe" + +// Note: the MemStats struct should be kept in sync with +// struct MStats in malloc.h + +// A MemStats records statistics about the memory allocator. +type MemStats struct { + // General statistics. + Alloc uint64 // bytes allocated and still in use + TotalAlloc uint64 // bytes allocated (even if freed) + Sys uint64 // bytes obtained from system (sum of XxxSys below) + Lookups uint64 // number of pointer lookups + Mallocs uint64 // number of mallocs + Frees uint64 // number of frees + + // Main allocation heap statistics. + HeapAlloc uint64 // bytes allocated and still in use + HeapSys uint64 // bytes obtained from system + HeapIdle uint64 // bytes in idle spans + HeapInuse uint64 // bytes in non-idle span + HeapReleased uint64 // bytes released to the OS + HeapObjects uint64 // total number of allocated objects + + // Low-level fixed-size structure allocator statistics. + // Inuse is bytes used now. + // Sys is bytes obtained from system. + StackInuse uint64 // bytes used by stack allocator + StackSys uint64 + MSpanInuse uint64 // mspan structures + MSpanSys uint64 + MCacheInuse uint64 // mcache structures + MCacheSys uint64 + BuckHashSys uint64 // profiling bucket hash table + GCSys uint64 // GC metadata + OtherSys uint64 // other system allocations + + // Garbage collector statistics. 
+ NextGC uint64 // next collection will happen when HeapAlloc ≥ this amount + LastGC uint64 // end time of last collection (nanoseconds since 1970) + PauseTotalNs uint64 + PauseNs [256]uint64 // circular buffer of recent GC pause durations, most recent at [(NumGC+255)%256] + PauseEnd [256]uint64 // circular buffer of recent GC pause end times + NumGC uint32 + EnableGC bool + DebugGC bool + + // Per-size allocation statistics. + // 61 is NumSizeClasses in the C code. + BySize [61]struct { + Size uint32 + Mallocs uint64 + Frees uint64 + } +} + +var sizeof_C_MStats uintptr // filled in by malloc.goc + +func init() { + var memStats MemStats + if sizeof_C_MStats != unsafe.Sizeof(memStats) { + println(sizeof_C_MStats, unsafe.Sizeof(memStats)) + gothrow("MStats vs MemStatsType size mismatch") + } +} + +// ReadMemStats populates m with memory allocator statistics. +func ReadMemStats(m *MemStats) { + // Have to acquire worldsema to stop the world, + // because stoptheworld can only be used by + // one goroutine at a time, and there might be + // a pending garbage collection already calling it. + semacquire(&worldsema, false) + gp := getg() + gp.m.gcing = 1 + onM(stoptheworld) + + gp.m.ptrarg[0] = noescape(unsafe.Pointer(m)) + onM(readmemstats_m) + + gp.m.gcing = 0 + gp.m.locks++ + semrelease(&worldsema) + onM(starttheworld) + gp.m.locks-- +} + +// Implementation of runtime/debug.WriteHeapDump +func writeHeapDump(fd uintptr) { + semacquire(&worldsema, false) + gp := getg() + gp.m.gcing = 1 + onM(stoptheworld) + + gp.m.scalararg[0] = fd + onM(writeheapdump_m) + + gp.m.gcing = 0 + gp.m.locks++ + semrelease(&worldsema) + onM(starttheworld) + gp.m.locks-- +} diff --git a/src/runtime/mem_darwin.c b/src/runtime/mem_darwin.c new file mode 100644 index 000000000..bf3ede577 --- /dev/null +++ b/src/runtime/mem_darwin.c @@ -0,0 +1,82 @@ +// Copyright 2010 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#include "runtime.h" +#include "arch_GOARCH.h" +#include "defs_GOOS_GOARCH.h" +#include "os_GOOS.h" +#include "malloc.h" +#include "textflag.h" + +#pragma textflag NOSPLIT +void* +runtime·sysAlloc(uintptr n, uint64 *stat) +{ + void *v; + + v = runtime·mmap(nil, n, PROT_READ|PROT_WRITE, MAP_ANON|MAP_PRIVATE, -1, 0); + if(v < (void*)4096) + return nil; + runtime·xadd64(stat, n); + return v; +} + +void +runtime·SysUnused(void *v, uintptr n) +{ + // Linux's MADV_DONTNEED is like BSD's MADV_FREE. 
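The writeHeapDump helper in mem.go above backs the exported runtime/debug.WriteHeapDump; a minimal caller, as a sketch assuming a writable working directory:

    package main

    import (
        "os"
        "runtime/debug"
    )

    func main() {
        f, err := os.Create("heap.dump")
        if err != nil {
            panic(err)
        }
        defer f.Close()
        // WriteHeapDump stops the world and streams the heap dump to the file descriptor.
        debug.WriteHeapDump(f.Fd())
    }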
+ runtime·madvise(v, n, MADV_FREE); +} + +void +runtime·SysUsed(void *v, uintptr n) +{ + USED(v); + USED(n); +} + +void +runtime·SysFree(void *v, uintptr n, uint64 *stat) +{ + runtime·xadd64(stat, -(uint64)n); + runtime·munmap(v, n); +} + +void +runtime·SysFault(void *v, uintptr n) +{ + runtime·mmap(v, n, PROT_NONE, MAP_ANON|MAP_PRIVATE|MAP_FIXED, -1, 0); +} + +void* +runtime·SysReserve(void *v, uintptr n, bool *reserved) +{ + void *p; + + *reserved = true; + p = runtime·mmap(v, n, PROT_NONE, MAP_ANON|MAP_PRIVATE, -1, 0); + if(p < (void*)4096) + return nil; + return p; +} + +enum +{ + ENOMEM = 12, +}; + +void +runtime·SysMap(void *v, uintptr n, bool reserved, uint64 *stat) +{ + void *p; + + USED(reserved); + + runtime·xadd64(stat, n); + p = runtime·mmap(v, n, PROT_READ|PROT_WRITE, MAP_ANON|MAP_FIXED|MAP_PRIVATE, -1, 0); + if(p == (void*)ENOMEM) + runtime·throw("runtime: out of memory"); + if(p != v) + runtime·throw("runtime: cannot map pages in arena address space"); +} diff --git a/src/runtime/mem_dragonfly.c b/src/runtime/mem_dragonfly.c new file mode 100644 index 000000000..11457b2c0 --- /dev/null +++ b/src/runtime/mem_dragonfly.c @@ -0,0 +1,105 @@ +// Copyright 2010 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#include "runtime.h" +#include "arch_GOARCH.h" +#include "defs_GOOS_GOARCH.h" +#include "os_GOOS.h" +#include "malloc.h" +#include "textflag.h" + +enum +{ + ENOMEM = 12, +}; + +#pragma textflag NOSPLIT +void* +runtime·sysAlloc(uintptr n, uint64 *stat) +{ + void *v; + + v = runtime·mmap(nil, n, PROT_READ|PROT_WRITE, MAP_ANON|MAP_PRIVATE, -1, 0); + if(v < (void*)4096) + return nil; + runtime·xadd64(stat, n); + return v; +} + +void +runtime·SysUnused(void *v, uintptr n) +{ + runtime·madvise(v, n, MADV_FREE); +} + +void +runtime·SysUsed(void *v, uintptr n) +{ + USED(v); + USED(n); +} + +void +runtime·SysFree(void *v, uintptr n, uint64 *stat) +{ + runtime·xadd64(stat, -(uint64)n); + runtime·munmap(v, n); +} + +void +runtime·SysFault(void *v, uintptr n) +{ + runtime·mmap(v, n, PROT_NONE, MAP_ANON|MAP_PRIVATE|MAP_FIXED, -1, 0); +} + +void* +runtime·SysReserve(void *v, uintptr n, bool *reserved) +{ + void *p; + + // On 64-bit, people with ulimit -v set complain if we reserve too + // much address space. Instead, assume that the reservation is okay + // and check the assumption in SysMap. + if(sizeof(void*) == 8 && n > 1LL<<32) { + *reserved = false; + return v; + } + + *reserved = true; + p = runtime·mmap(v, n, PROT_NONE, MAP_ANON|MAP_PRIVATE, -1, 0); + if(p < (void*)4096) + return nil; + return p; +} + +void +runtime·SysMap(void *v, uintptr n, bool reserved, uint64 *stat) +{ + void *p; + + runtime·xadd64(stat, n); + + // On 64-bit, we don't actually have v reserved, so tread carefully. + if(!reserved) { + // TODO(jsing): For some reason DragonFly seems to return + // memory at a different address than we requested, even when + // there should be no reason for it to do so. This can be + // avoided by using MAP_FIXED, but I'm not sure we should need + // to do this - we do not on other platforms. 
+ p = runtime·mmap(v, n, PROT_READ|PROT_WRITE, MAP_ANON|MAP_FIXED|MAP_PRIVATE, -1, 0); + if(p == (void*)ENOMEM) + runtime·throw("runtime: out of memory"); + if(p != v) { + runtime·printf("runtime: address space conflict: map(%p) = %p\n", v, p); + runtime·throw("runtime: address space conflict"); + } + return; + } + + p = runtime·mmap(v, n, PROT_READ|PROT_WRITE, MAP_ANON|MAP_FIXED|MAP_PRIVATE, -1, 0); + if(p == (void*)ENOMEM) + runtime·throw("runtime: out of memory"); + if(p != v) + runtime·throw("runtime: cannot map pages in arena address space"); +} diff --git a/src/runtime/mem_freebsd.c b/src/runtime/mem_freebsd.c new file mode 100644 index 000000000..18a9a2f5b --- /dev/null +++ b/src/runtime/mem_freebsd.c @@ -0,0 +1,100 @@ +// Copyright 2010 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#include "runtime.h" +#include "arch_GOARCH.h" +#include "defs_GOOS_GOARCH.h" +#include "os_GOOS.h" +#include "malloc.h" +#include "textflag.h" + +enum +{ + ENOMEM = 12, +}; + +#pragma textflag NOSPLIT +void* +runtime·sysAlloc(uintptr n, uint64 *stat) +{ + void *v; + + v = runtime·mmap(nil, n, PROT_READ|PROT_WRITE, MAP_ANON|MAP_PRIVATE, -1, 0); + if(v < (void*)4096) + return nil; + runtime·xadd64(stat, n); + return v; +} + +void +runtime·SysUnused(void *v, uintptr n) +{ + runtime·madvise(v, n, MADV_FREE); +} + +void +runtime·SysUsed(void *v, uintptr n) +{ + USED(v); + USED(n); +} + +void +runtime·SysFree(void *v, uintptr n, uint64 *stat) +{ + runtime·xadd64(stat, -(uint64)n); + runtime·munmap(v, n); +} + +void +runtime·SysFault(void *v, uintptr n) +{ + runtime·mmap(v, n, PROT_NONE, MAP_ANON|MAP_PRIVATE|MAP_FIXED, -1, 0); +} + +void* +runtime·SysReserve(void *v, uintptr n, bool *reserved) +{ + void *p; + + // On 64-bit, people with ulimit -v set complain if we reserve too + // much address space. Instead, assume that the reservation is okay + // and check the assumption in SysMap. + if(sizeof(void*) == 8 && n > 1LL<<32) { + *reserved = false; + return v; + } + + *reserved = true; + p = runtime·mmap(v, n, PROT_NONE, MAP_ANON|MAP_PRIVATE, -1, 0); + if(p < (void*)4096) + return nil; + return p; +} + +void +runtime·SysMap(void *v, uintptr n, bool reserved, uint64 *stat) +{ + void *p; + + runtime·xadd64(stat, n); + + // On 64-bit, we don't actually have v reserved, so tread carefully. + if(!reserved) { + p = runtime·mmap(v, n, PROT_READ|PROT_WRITE, MAP_ANON|MAP_PRIVATE, -1, 0); + if(p == (void*)ENOMEM) + runtime·throw("runtime: out of memory"); + if(p != v) { + runtime·printf("runtime: address space conflict: map(%p) = %p\n", v, p); + runtime·throw("runtime: address space conflict"); + } + return; + } + + p = runtime·mmap(v, n, PROT_READ|PROT_WRITE, MAP_ANON|MAP_FIXED|MAP_PRIVATE, -1, 0); + if(p == (void*)ENOMEM) + runtime·throw("runtime: out of memory"); + if(p != v) + runtime·throw("runtime: cannot map pages in arena address space"); +} diff --git a/src/runtime/mem_linux.c b/src/runtime/mem_linux.c new file mode 100644 index 000000000..bfb405607 --- /dev/null +++ b/src/runtime/mem_linux.c @@ -0,0 +1,162 @@ +// Copyright 2010 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. 
+ +#include "runtime.h" +#include "arch_GOARCH.h" +#include "defs_GOOS_GOARCH.h" +#include "os_GOOS.h" +#include "malloc.h" +#include "textflag.h" + +enum +{ + _PAGE_SIZE = 4096, + EACCES = 13, +}; + +static int32 +addrspace_free(void *v, uintptr n) +{ + int32 errval; + uintptr chunk; + uintptr off; + + // NOTE: vec must be just 1 byte long here. + // Mincore returns ENOMEM if any of the pages are unmapped, + // but we want to know that all of the pages are unmapped. + // To make these the same, we can only ask about one page + // at a time. See golang.org/issue/7476. + static byte vec[1]; + + for(off = 0; off < n; off += chunk) { + chunk = _PAGE_SIZE * sizeof vec; + if(chunk > (n - off)) + chunk = n - off; + errval = runtime·mincore((int8*)v + off, chunk, vec); + // ENOMEM means unmapped, which is what we want. + // Anything else we assume means the pages are mapped. + if (errval != -ENOMEM) + return 0; + } + return 1; +} + +static void * +mmap_fixed(byte *v, uintptr n, int32 prot, int32 flags, int32 fd, uint32 offset) +{ + void *p; + + p = runtime·mmap(v, n, prot, flags, fd, offset); + if(p != v && addrspace_free(v, n)) { + // On some systems, mmap ignores v without + // MAP_FIXED, so retry if the address space is free. + if(p > (void*)4096) + runtime·munmap(p, n); + p = runtime·mmap(v, n, prot, flags|MAP_FIXED, fd, offset); + } + return p; +} + +#pragma textflag NOSPLIT +void* +runtime·sysAlloc(uintptr n, uint64 *stat) +{ + void *p; + + p = runtime·mmap(nil, n, PROT_READ|PROT_WRITE, MAP_ANON|MAP_PRIVATE, -1, 0); + if(p < (void*)4096) { + if(p == (void*)EACCES) { + runtime·printf("runtime: mmap: access denied\n"); + runtime·printf("if you're running SELinux, enable execmem for this process.\n"); + runtime·exit(2); + } + if(p == (void*)EAGAIN) { + runtime·printf("runtime: mmap: too much locked memory (check 'ulimit -l').\n"); + runtime·exit(2); + } + return nil; + } + runtime·xadd64(stat, n); + return p; +} + +void +runtime·SysUnused(void *v, uintptr n) +{ + runtime·madvise(v, n, MADV_DONTNEED); +} + +void +runtime·SysUsed(void *v, uintptr n) +{ + USED(v); + USED(n); +} + +void +runtime·SysFree(void *v, uintptr n, uint64 *stat) +{ + runtime·xadd64(stat, -(uint64)n); + runtime·munmap(v, n); +} + +void +runtime·SysFault(void *v, uintptr n) +{ + runtime·mmap(v, n, PROT_NONE, MAP_ANON|MAP_PRIVATE|MAP_FIXED, -1, 0); +} + +void* +runtime·SysReserve(void *v, uintptr n, bool *reserved) +{ + void *p; + + // On 64-bit, people with ulimit -v set complain if we reserve too + // much address space. Instead, assume that the reservation is okay + // if we can reserve at least 64K and check the assumption in SysMap. + // Only user-mode Linux (UML) rejects these requests. + if(sizeof(void*) == 8 && n > 1LL<<32) { + p = mmap_fixed(v, 64<<10, PROT_NONE, MAP_ANON|MAP_PRIVATE, -1, 0); + if (p != v) { + if(p >= (void*)4096) + runtime·munmap(p, 64<<10); + return nil; + } + runtime·munmap(p, 64<<10); + *reserved = false; + return v; + } + + p = runtime·mmap(v, n, PROT_NONE, MAP_ANON|MAP_PRIVATE, -1, 0); + if((uintptr)p < 4096) + return nil; + *reserved = true; + return p; +} + +void +runtime·SysMap(void *v, uintptr n, bool reserved, uint64 *stat) +{ + void *p; + + runtime·xadd64(stat, n); + + // On 64-bit, we don't actually have v reserved, so tread carefully. 
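The reserve-then-commit pattern used by SysReserve and SysMap here can be imitated from user code; a rough sketch for Linux using the syscall package (mprotect stands in for the MAP_FIXED remap the runtime actually performs):

    package main

    import (
        "fmt"
        "syscall"
    )

    func main() {
        const n = 1 << 20
        // Reserve address space with no access rights, as SysReserve does.
        b, err := syscall.Mmap(-1, 0, n, syscall.PROT_NONE,
            syscall.MAP_PRIVATE|syscall.MAP_ANONYMOUS)
        if err != nil {
            panic(err)
        }
        // Commit it for use, analogous in effect to SysMap/SysUsed.
        if err := syscall.Mprotect(b, syscall.PROT_READ|syscall.PROT_WRITE); err != nil {
            panic(err)
        }
        b[0] = 42
        fmt.Println("reserved and committed", len(b), "bytes")
        _ = syscall.Munmap(b)
    }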
+ if(!reserved) { + p = mmap_fixed(v, n, PROT_READ|PROT_WRITE, MAP_ANON|MAP_PRIVATE, -1, 0); + if(p == (void*)ENOMEM) + runtime·throw("runtime: out of memory"); + if(p != v) { + runtime·printf("runtime: address space conflict: map(%p) = %p\n", v, p); + runtime·throw("runtime: address space conflict"); + } + return; + } + + p = runtime·mmap(v, n, PROT_READ|PROT_WRITE, MAP_ANON|MAP_FIXED|MAP_PRIVATE, -1, 0); + if(p == (void*)ENOMEM) + runtime·throw("runtime: out of memory"); + if(p != v) + runtime·throw("runtime: cannot map pages in arena address space"); +} diff --git a/src/runtime/mem_nacl.c b/src/runtime/mem_nacl.c new file mode 100644 index 000000000..6c836f18a --- /dev/null +++ b/src/runtime/mem_nacl.c @@ -0,0 +1,120 @@ +// Copyright 2010 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#include "runtime.h" +#include "arch_GOARCH.h" +#include "defs_GOOS_GOARCH.h" +#include "os_GOOS.h" +#include "malloc.h" +#include "textflag.h" + +enum +{ + Debug = 0, +}; + +#pragma textflag NOSPLIT +void* +runtime·sysAlloc(uintptr n, uint64 *stat) +{ + void *v; + + v = runtime·mmap(nil, n, PROT_READ|PROT_WRITE, MAP_ANON|MAP_PRIVATE, -1, 0); + if(v < (void*)4096) { + if(Debug) + runtime·printf("sysAlloc(%p): %p\n", n, v); + return nil; + } + runtime·xadd64(stat, n); + if(Debug) + runtime·printf("sysAlloc(%p) = %p\n", n, v); + return v; +} + +void +runtime·SysUnused(void *v, uintptr n) +{ + if(Debug) + runtime·printf("SysUnused(%p, %p)\n", v, n); +} + +void +runtime·SysUsed(void *v, uintptr n) +{ + USED(v); + USED(n); +} + +void +runtime·SysFree(void *v, uintptr n, uint64 *stat) +{ + if(Debug) + runtime·printf("SysFree(%p, %p)\n", v, n); + runtime·xadd64(stat, -(uint64)n); + runtime·munmap(v, n); +} + +void +runtime·SysFault(void *v, uintptr n) +{ + runtime·mmap(v, n, PROT_NONE, MAP_ANON|MAP_PRIVATE|MAP_FIXED, -1, 0); +} + +void* +runtime·SysReserve(void *v, uintptr n, bool *reserved) +{ + void *p; + + // On 64-bit, people with ulimit -v set complain if we reserve too + // much address space. Instead, assume that the reservation is okay + // and check the assumption in SysMap. + if(NaCl || sizeof(void*) == 8) { + *reserved = false; + return v; + } + + p = runtime·mmap(v, n, PROT_NONE, MAP_ANON|MAP_PRIVATE, -1, 0); + if(p < (void*)4096) + return nil; + *reserved = true; + return p; +} + +void +runtime·SysMap(void *v, uintptr n, bool reserved, uint64 *stat) +{ + void *p; + + runtime·xadd64(stat, n); + + // On 64-bit, we don't actually have v reserved, so tread carefully. 
+ if(!reserved) { + p = runtime·mmap(v, n, PROT_READ|PROT_WRITE, MAP_ANON|MAP_PRIVATE, -1, 0); + if(p == (void*)ENOMEM) { + runtime·printf("SysMap(%p, %p): %p\n", v, n, p); + runtime·throw("runtime: out of memory"); + } + if(p != v) { + runtime·printf("SysMap(%p, %p): %p\n", v, n, p); + runtime·printf("runtime: address space conflict: map(%p) = %p\n", v, p); + runtime·throw("runtime: address space conflict"); + } + if(Debug) + runtime·printf("SysMap(%p, %p) = %p\n", v, n, p); + return; + } + + p = runtime·mmap(v, n, PROT_READ|PROT_WRITE, MAP_ANON|MAP_FIXED|MAP_PRIVATE, -1, 0); + if(p == (void*)ENOMEM) { + runtime·printf("SysMap(%p, %p): %p\n", v, n, p); + runtime·throw("runtime: out of memory"); + } + if(p != v) { + runtime·printf("SysMap(%p, %p): %p\n", v, n, p); + runtime·printf("mmap MAP_FIXED %p returned %p\n", v, p); + runtime·throw("runtime: cannot map pages in arena address space"); + } + if(Debug) + runtime·printf("SysMap(%p, %p) = %p\n", v, n, p); +} diff --git a/src/runtime/mem_netbsd.c b/src/runtime/mem_netbsd.c new file mode 100644 index 000000000..31820e517 --- /dev/null +++ b/src/runtime/mem_netbsd.c @@ -0,0 +1,100 @@ +// Copyright 2010 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#include "runtime.h" +#include "arch_GOARCH.h" +#include "defs_GOOS_GOARCH.h" +#include "os_GOOS.h" +#include "malloc.h" +#include "textflag.h" + +enum +{ + ENOMEM = 12, +}; + +#pragma textflag NOSPLIT +void* +runtime·sysAlloc(uintptr n, uint64 *stat) +{ + void *v; + + v = runtime·mmap(nil, n, PROT_READ|PROT_WRITE, MAP_ANON|MAP_PRIVATE, -1, 0); + if(v < (void*)4096) + return nil; + runtime·xadd64(stat, n); + return v; +} + +void +runtime·SysUnused(void *v, uintptr n) +{ + runtime·madvise(v, n, MADV_FREE); +} + +void +runtime·SysUsed(void *v, uintptr n) +{ + USED(v); + USED(n); +} + +void +runtime·SysFree(void *v, uintptr n, uint64 *stat) +{ + runtime·xadd64(stat, -(uint64)n); + runtime·munmap(v, n); +} + +void +runtime·SysFault(void *v, uintptr n) +{ + runtime·mmap(v, n, PROT_NONE, MAP_ANON|MAP_PRIVATE|MAP_FIXED, -1, 0); +} + +void* +runtime·SysReserve(void *v, uintptr n, bool *reserved) +{ + void *p; + + // On 64-bit, people with ulimit -v set complain if we reserve too + // much address space. Instead, assume that the reservation is okay + // and check the assumption in SysMap. + if(sizeof(void*) == 8 && n > 1LL<<32) { + *reserved = false; + return v; + } + + p = runtime·mmap(v, n, PROT_NONE, MAP_ANON|MAP_PRIVATE, -1, 0); + if(p < (void*)4096) + return nil; + *reserved = true; + return p; +} + +void +runtime·SysMap(void *v, uintptr n, bool reserved, uint64 *stat) +{ + void *p; + + runtime·xadd64(stat, n); + + // On 64-bit, we don't actually have v reserved, so tread carefully. 
+ if(!reserved) { + p = runtime·mmap(v, n, PROT_READ|PROT_WRITE, MAP_ANON|MAP_PRIVATE, -1, 0); + if(p == (void*)ENOMEM) + runtime·throw("runtime: out of memory"); + if(p != v) { + runtime·printf("runtime: address space conflict: map(%p) = %p\n", v, p); + runtime·throw("runtime: address space conflict"); + } + return; + } + + p = runtime·mmap(v, n, PROT_READ|PROT_WRITE, MAP_ANON|MAP_FIXED|MAP_PRIVATE, -1, 0); + if(p == (void*)ENOMEM) + runtime·throw("runtime: out of memory"); + if(p != v) + runtime·throw("runtime: cannot map pages in arena address space"); +} diff --git a/src/runtime/mem_openbsd.c b/src/runtime/mem_openbsd.c new file mode 100644 index 000000000..31820e517 --- /dev/null +++ b/src/runtime/mem_openbsd.c @@ -0,0 +1,100 @@ +// Copyright 2010 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#include "runtime.h" +#include "arch_GOARCH.h" +#include "defs_GOOS_GOARCH.h" +#include "os_GOOS.h" +#include "malloc.h" +#include "textflag.h" + +enum +{ + ENOMEM = 12, +}; + +#pragma textflag NOSPLIT +void* +runtime·sysAlloc(uintptr n, uint64 *stat) +{ + void *v; + + v = runtime·mmap(nil, n, PROT_READ|PROT_WRITE, MAP_ANON|MAP_PRIVATE, -1, 0); + if(v < (void*)4096) + return nil; + runtime·xadd64(stat, n); + return v; +} + +void +runtime·SysUnused(void *v, uintptr n) +{ + runtime·madvise(v, n, MADV_FREE); +} + +void +runtime·SysUsed(void *v, uintptr n) +{ + USED(v); + USED(n); +} + +void +runtime·SysFree(void *v, uintptr n, uint64 *stat) +{ + runtime·xadd64(stat, -(uint64)n); + runtime·munmap(v, n); +} + +void +runtime·SysFault(void *v, uintptr n) +{ + runtime·mmap(v, n, PROT_NONE, MAP_ANON|MAP_PRIVATE|MAP_FIXED, -1, 0); +} + +void* +runtime·SysReserve(void *v, uintptr n, bool *reserved) +{ + void *p; + + // On 64-bit, people with ulimit -v set complain if we reserve too + // much address space. Instead, assume that the reservation is okay + // and check the assumption in SysMap. + if(sizeof(void*) == 8 && n > 1LL<<32) { + *reserved = false; + return v; + } + + p = runtime·mmap(v, n, PROT_NONE, MAP_ANON|MAP_PRIVATE, -1, 0); + if(p < (void*)4096) + return nil; + *reserved = true; + return p; +} + +void +runtime·SysMap(void *v, uintptr n, bool reserved, uint64 *stat) +{ + void *p; + + runtime·xadd64(stat, n); + + // On 64-bit, we don't actually have v reserved, so tread carefully. + if(!reserved) { + p = runtime·mmap(v, n, PROT_READ|PROT_WRITE, MAP_ANON|MAP_PRIVATE, -1, 0); + if(p == (void*)ENOMEM) + runtime·throw("runtime: out of memory"); + if(p != v) { + runtime·printf("runtime: address space conflict: map(%p) = %p\n", v, p); + runtime·throw("runtime: address space conflict"); + } + return; + } + + p = runtime·mmap(v, n, PROT_READ|PROT_WRITE, MAP_ANON|MAP_FIXED|MAP_PRIVATE, -1, 0); + if(p == (void*)ENOMEM) + runtime·throw("runtime: out of memory"); + if(p != v) + runtime·throw("runtime: cannot map pages in arena address space"); +} diff --git a/src/runtime/mem_plan9.c b/src/runtime/mem_plan9.c new file mode 100644 index 000000000..d673d6f83 --- /dev/null +++ b/src/runtime/mem_plan9.c @@ -0,0 +1,121 @@ +// Copyright 2010 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. 
+ +#include "runtime.h" +#include "defs_GOOS_GOARCH.h" +#include "arch_GOARCH.h" +#include "malloc.h" +#include "os_GOOS.h" +#include "textflag.h" + +extern byte runtime·end[]; +#pragma dataflag NOPTR +static byte *bloc = { runtime·end }; +static Mutex memlock; + +enum +{ + Round = PAGESIZE-1 +}; + +static void* +brk(uintptr nbytes) +{ + uintptr bl; + + runtime·lock(&memlock); + // Plan 9 sbrk from /sys/src/libc/9sys/sbrk.c + bl = ((uintptr)bloc + Round) & ~Round; + if(runtime·brk_((void*)(bl + nbytes)) < 0) { + runtime·unlock(&memlock); + return nil; + } + bloc = (byte*)bl + nbytes; + runtime·unlock(&memlock); + return (void*)bl; +} + +static void +sysalloc(void) +{ + uintptr nbytes; + uint64 *stat; + void *p; + + nbytes = g->m->scalararg[0]; + stat = g->m->ptrarg[0]; + g->m->scalararg[0] = 0; + g->m->ptrarg[0] = nil; + + p = brk(nbytes); + if(p != nil) + runtime·xadd64(stat, nbytes); + + g->m->ptrarg[0] = p; +} + +#pragma textflag NOSPLIT +void* +runtime·sysAlloc(uintptr nbytes, uint64 *stat) +{ + void (*fn)(void); + void *p; + + g->m->scalararg[0] = nbytes; + g->m->ptrarg[0] = stat; + fn = sysalloc; + runtime·onM(&fn); + p = g->m->ptrarg[0]; + g->m->ptrarg[0] = nil; + return p; +} + +void +runtime·SysFree(void *v, uintptr nbytes, uint64 *stat) +{ + runtime·xadd64(stat, -(uint64)nbytes); + runtime·lock(&memlock); + // from tiny/mem.c + // Push pointer back if this is a free + // of the most recent sysAlloc. + nbytes += (nbytes + Round) & ~Round; + if(bloc == (byte*)v+nbytes) + bloc -= nbytes; + runtime·unlock(&memlock); +} + +void +runtime·SysUnused(void *v, uintptr nbytes) +{ + USED(v, nbytes); +} + +void +runtime·SysUsed(void *v, uintptr nbytes) +{ + USED(v, nbytes); +} + +void +runtime·SysMap(void *v, uintptr nbytes, bool reserved, uint64 *stat) +{ + // SysReserve has already allocated all heap memory, + // but has not adjusted stats. + USED(v, reserved); + runtime·xadd64(stat, nbytes); +} + +void +runtime·SysFault(void *v, uintptr nbytes) +{ + USED(v, nbytes); +} + +void* +runtime·SysReserve(void *v, uintptr nbytes, bool *reserved) +{ + USED(v); + *reserved = true; + return brk(nbytes); +} diff --git a/src/runtime/mem_solaris.c b/src/runtime/mem_solaris.c new file mode 100644 index 000000000..8e90ba1d9 --- /dev/null +++ b/src/runtime/mem_solaris.c @@ -0,0 +1,101 @@ +// Copyright 2010 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#include "runtime.h" +#include "arch_GOARCH.h" +#include "defs_GOOS_GOARCH.h" +#include "os_GOOS.h" +#include "malloc.h" +#include "textflag.h" + +enum +{ + ENOMEM = 12, +}; + +#pragma textflag NOSPLIT +void* +runtime·sysAlloc(uintptr n, uint64 *stat) +{ + void *v; + + v = runtime·mmap(nil, n, PROT_READ|PROT_WRITE, MAP_ANON|MAP_PRIVATE, -1, 0); + if(v < (void*)4096) + return nil; + runtime·xadd64(stat, n); + return v; +} + +void +runtime·SysUnused(void *v, uintptr n) +{ + USED(v); + USED(n); +} + +void +runtime·SysUsed(void *v, uintptr n) +{ + USED(v); + USED(n); +} + +void +runtime·SysFree(void *v, uintptr n, uint64 *stat) +{ + runtime·xadd64(stat, -(uint64)n); + runtime·munmap(v, n); +} + +void +runtime·SysFault(void *v, uintptr n) +{ + runtime·mmap(v, n, PROT_NONE, MAP_ANON|MAP_PRIVATE|MAP_FIXED, -1, 0); +} + +void* +runtime·SysReserve(void *v, uintptr n, bool *reserved) +{ + void *p; + + // On 64-bit, people with ulimit -v set complain if we reserve too + // much address space. 
Instead, assume that the reservation is okay + // and check the assumption in SysMap. + if(sizeof(void*) == 8 && n > 1LL<<32) { + *reserved = false; + return v; + } + + p = runtime·mmap(v, n, PROT_NONE, MAP_ANON|MAP_PRIVATE, -1, 0); + if(p < (void*)4096) + return nil; + *reserved = true; + return p; +} + +void +runtime·SysMap(void *v, uintptr n, bool reserved, uint64 *stat) +{ + void *p; + + runtime·xadd64(stat, n); + + // On 64-bit, we don't actually have v reserved, so tread carefully. + if(!reserved) { + p = runtime·mmap(v, n, PROT_READ|PROT_WRITE, MAP_ANON|MAP_PRIVATE, -1, 0); + if(p == (void*)ENOMEM) + runtime·throw("runtime: out of memory"); + if(p != v) { + runtime·printf("runtime: address space conflict: map(%p) = %p\n", v, p); + runtime·throw("runtime: address space conflict"); + } + return; + } + + p = runtime·mmap(v, n, PROT_READ|PROT_WRITE, MAP_ANON|MAP_FIXED|MAP_PRIVATE, -1, 0); + if(p == (void*)ENOMEM) + runtime·throw("runtime: out of memory"); + if(p != v) + runtime·throw("runtime: cannot map pages in arena address space"); +} diff --git a/src/runtime/mem_windows.c b/src/runtime/mem_windows.c new file mode 100644 index 000000000..6ea992020 --- /dev/null +++ b/src/runtime/mem_windows.c @@ -0,0 +1,132 @@ +// Copyright 2010 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#include "runtime.h" +#include "arch_GOARCH.h" +#include "os_GOOS.h" +#include "defs_GOOS_GOARCH.h" +#include "malloc.h" +#include "textflag.h" + +enum { + MEM_COMMIT = 0x1000, + MEM_RESERVE = 0x2000, + MEM_DECOMMIT = 0x4000, + MEM_RELEASE = 0x8000, + + PAGE_READWRITE = 0x0004, + PAGE_NOACCESS = 0x0001, +}; + +#pragma dynimport runtime·VirtualAlloc VirtualAlloc "kernel32.dll" +#pragma dynimport runtime·VirtualFree VirtualFree "kernel32.dll" +#pragma dynimport runtime·VirtualProtect VirtualProtect "kernel32.dll" +extern void *runtime·VirtualAlloc; +extern void *runtime·VirtualFree; +extern void *runtime·VirtualProtect; + +#pragma textflag NOSPLIT +void* +runtime·sysAlloc(uintptr n, uint64 *stat) +{ + runtime·xadd64(stat, n); + return runtime·stdcall4(runtime·VirtualAlloc, 0, n, MEM_COMMIT|MEM_RESERVE, PAGE_READWRITE); +} + +void +runtime·SysUnused(void *v, uintptr n) +{ + void *r; + uintptr small; + + r = runtime·stdcall3(runtime·VirtualFree, (uintptr)v, n, MEM_DECOMMIT); + if(r != nil) + return; + + // Decommit failed. Usual reason is that we've merged memory from two different + // VirtualAlloc calls, and Windows will only let each VirtualFree handle pages from + // a single VirtualAlloc. It is okay to specify a subset of the pages from a single alloc, + // just not pages from multiple allocs. This is a rare case, arising only when we're + // trying to give memory back to the operating system, which happens on a time + // scale of minutes. It doesn't have to be terribly fast. Instead of extra bookkeeping + // on all our VirtualAlloc calls, try freeing successively smaller pieces until + // we manage to free something, and then repeat. This ends up being O(n log n) + // in the worst case, but that's fast enough. 
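As a worked restatement of the shrinking-retry strategy described in that comment (illustrative only, in Go rather than the runtime's C): free the largest page-aligned prefix the OS will accept, advance past it, and repeat.

    package main

    import "fmt"

    const page = 4096

    // decommit mimics the loop below: try the whole range, halve on failure
    // (rounding down to a page boundary), free what succeeds, and repeat.
    func decommit(n uintptr, tryFree func(n uintptr) bool) {
        for n > 0 {
            small := n
            for small >= page && !tryFree(small) {
                small = (small / 2) &^ (page - 1)
            }
            if small < page {
                panic("failed to decommit pages")
            }
            n -= small
        }
    }

    func main() {
        // A pretend OS that only accepts requests of at most 64 pages at a time.
        calls := 0
        decommit(1<<20, func(n uintptr) bool { calls++; return n <= 64*page })
        fmt.Println("decommitted in", calls, "attempts")
    }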
+ while(n > 0) { + small = n; + while(small >= 4096 && runtime·stdcall3(runtime·VirtualFree, (uintptr)v, small, MEM_DECOMMIT) == nil) + small = (small / 2) & ~(4096-1); + if(small < 4096) + runtime·throw("runtime: failed to decommit pages"); + v = (byte*)v + small; + n -= small; + } +} + +void +runtime·SysUsed(void *v, uintptr n) +{ + void *r; + uintptr small; + + r = runtime·stdcall4(runtime·VirtualAlloc, (uintptr)v, n, MEM_COMMIT, PAGE_READWRITE); + if(r != v) + runtime·throw("runtime: failed to commit pages"); + + // Commit failed. See SysUnused. + while(n > 0) { + small = n; + while(small >= 4096 && runtime·stdcall4(runtime·VirtualAlloc, (uintptr)v, small, MEM_COMMIT, PAGE_READWRITE) == nil) + small = (small / 2) & ~(4096-1); + if(small < 4096) + runtime·throw("runtime: failed to decommit pages"); + v = (byte*)v + small; + n -= small; + } +} + +void +runtime·SysFree(void *v, uintptr n, uint64 *stat) +{ + uintptr r; + + runtime·xadd64(stat, -(uint64)n); + r = (uintptr)runtime·stdcall3(runtime·VirtualFree, (uintptr)v, 0, MEM_RELEASE); + if(r == 0) + runtime·throw("runtime: failed to release pages"); +} + +void +runtime·SysFault(void *v, uintptr n) +{ + // SysUnused makes the memory inaccessible and prevents its reuse + runtime·SysUnused(v, n); +} + +void* +runtime·SysReserve(void *v, uintptr n, bool *reserved) +{ + *reserved = true; + // v is just a hint. + // First try at v. + v = runtime·stdcall4(runtime·VirtualAlloc, (uintptr)v, n, MEM_RESERVE, PAGE_READWRITE); + if(v != nil) + return v; + + // Next let the kernel choose the address. + return runtime·stdcall4(runtime·VirtualAlloc, 0, n, MEM_RESERVE, PAGE_READWRITE); +} + +void +runtime·SysMap(void *v, uintptr n, bool reserved, uint64 *stat) +{ + void *p; + + USED(reserved); + + runtime·xadd64(stat, n); + p = runtime·stdcall4(runtime·VirtualAlloc, (uintptr)v, n, MEM_COMMIT, PAGE_READWRITE); + if(p != v) + runtime·throw("runtime: cannot map pages in arena address space"); +} diff --git a/src/runtime/memclr_386.s b/src/runtime/memclr_386.s new file mode 100644 index 000000000..1520aea2e --- /dev/null +++ b/src/runtime/memclr_386.s @@ -0,0 +1,130 @@ +// Copyright 2014 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// +build !plan9 + +#include "textflag.h" + +// NOTE: Windows externalthreadhandler expects memclr to preserve DX. + +// void runtime·memclr(void*, uintptr) +TEXT runtime·memclr(SB), NOSPLIT, $0-8 + MOVL ptr+0(FP), DI + MOVL n+4(FP), BX + XORL AX, AX + + // MOVOU seems always faster than REP STOSL. 
+clr_tail: + TESTL BX, BX + JEQ clr_0 + CMPL BX, $2 + JBE clr_1or2 + CMPL BX, $4 + JBE clr_3or4 + CMPL BX, $8 + JBE clr_5through8 + CMPL BX, $16 + JBE clr_9through16 + TESTL $0x4000000, runtime·cpuid_edx(SB) // check for sse2 + JEQ nosse2 + PXOR X0, X0 + CMPL BX, $32 + JBE clr_17through32 + CMPL BX, $64 + JBE clr_33through64 + CMPL BX, $128 + JBE clr_65through128 + CMPL BX, $256 + JBE clr_129through256 + // TODO: use branch table and BSR to make this just a single dispatch + +clr_loop: + MOVOU X0, 0(DI) + MOVOU X0, 16(DI) + MOVOU X0, 32(DI) + MOVOU X0, 48(DI) + MOVOU X0, 64(DI) + MOVOU X0, 80(DI) + MOVOU X0, 96(DI) + MOVOU X0, 112(DI) + MOVOU X0, 128(DI) + MOVOU X0, 144(DI) + MOVOU X0, 160(DI) + MOVOU X0, 176(DI) + MOVOU X0, 192(DI) + MOVOU X0, 208(DI) + MOVOU X0, 224(DI) + MOVOU X0, 240(DI) + SUBL $256, BX + ADDL $256, DI + CMPL BX, $256 + JAE clr_loop + JMP clr_tail + +clr_1or2: + MOVB AX, (DI) + MOVB AX, -1(DI)(BX*1) + RET +clr_0: + RET +clr_3or4: + MOVW AX, (DI) + MOVW AX, -2(DI)(BX*1) + RET +clr_5through8: + MOVL AX, (DI) + MOVL AX, -4(DI)(BX*1) + RET +clr_9through16: + MOVL AX, (DI) + MOVL AX, 4(DI) + MOVL AX, -8(DI)(BX*1) + MOVL AX, -4(DI)(BX*1) + RET +clr_17through32: + MOVOU X0, (DI) + MOVOU X0, -16(DI)(BX*1) + RET +clr_33through64: + MOVOU X0, (DI) + MOVOU X0, 16(DI) + MOVOU X0, -32(DI)(BX*1) + MOVOU X0, -16(DI)(BX*1) + RET +clr_65through128: + MOVOU X0, (DI) + MOVOU X0, 16(DI) + MOVOU X0, 32(DI) + MOVOU X0, 48(DI) + MOVOU X0, -64(DI)(BX*1) + MOVOU X0, -48(DI)(BX*1) + MOVOU X0, -32(DI)(BX*1) + MOVOU X0, -16(DI)(BX*1) + RET +clr_129through256: + MOVOU X0, (DI) + MOVOU X0, 16(DI) + MOVOU X0, 32(DI) + MOVOU X0, 48(DI) + MOVOU X0, 64(DI) + MOVOU X0, 80(DI) + MOVOU X0, 96(DI) + MOVOU X0, 112(DI) + MOVOU X0, -128(DI)(BX*1) + MOVOU X0, -112(DI)(BX*1) + MOVOU X0, -96(DI)(BX*1) + MOVOU X0, -80(DI)(BX*1) + MOVOU X0, -64(DI)(BX*1) + MOVOU X0, -48(DI)(BX*1) + MOVOU X0, -32(DI)(BX*1) + MOVOU X0, -16(DI)(BX*1) + RET +nosse2: + MOVL BX, CX + SHRL $2, CX + REP + STOSL + ANDL $3, BX + JNE clr_tail + RET diff --git a/src/runtime/memclr_amd64.s b/src/runtime/memclr_amd64.s new file mode 100644 index 000000000..94a2c7f23 --- /dev/null +++ b/src/runtime/memclr_amd64.s @@ -0,0 +1,119 @@ +// Copyright 2014 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// +build !plan9 + +#include "textflag.h" + +// NOTE: Windows externalthreadhandler expects memclr to preserve DX. + +// void runtime·memclr(void*, uintptr) +TEXT runtime·memclr(SB), NOSPLIT, $0-16 + MOVQ ptr+0(FP), DI + MOVQ n+8(FP), BX + XORQ AX, AX + + // MOVOU seems always faster than REP STOSQ. +clr_tail: + TESTQ BX, BX + JEQ clr_0 + CMPQ BX, $2 + JBE clr_1or2 + CMPQ BX, $4 + JBE clr_3or4 + CMPQ BX, $8 + JBE clr_5through8 + CMPQ BX, $16 + JBE clr_9through16 + PXOR X0, X0 + CMPQ BX, $32 + JBE clr_17through32 + CMPQ BX, $64 + JBE clr_33through64 + CMPQ BX, $128 + JBE clr_65through128 + CMPQ BX, $256 + JBE clr_129through256 + // TODO: use branch table and BSR to make this just a single dispatch + // TODO: for really big clears, use MOVNTDQ. 
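Both memclr variants share the same shape: sizes up to 256 bytes fall through to one of the fixed-size cases, and anything larger is cleared 256 bytes per iteration in the loop that follows, with the remainder re-dispatched through clr_tail. A loose Go sketch of that control flow (the real code stays in assembly so it can use the wide MOVOU stores):

    // memclrSketch mirrors the dispatch: large buffers are cleared in
    // 256-byte chunks, then the tail is handled as one of the small cases.
    func memclrSketch(b []byte) {
        for len(b) > 256 {
            for i := 0; i < 256; i++ { // stands in for the sixteen MOVOU stores
                b[i] = 0
            }
            b = b[256:]
        }
        for i := range b { // clr_tail: the fixed-size cases (0, 1-2, ..., 129-256)
            b[i] = 0
        }
    }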
+ +clr_loop: + MOVOU X0, 0(DI) + MOVOU X0, 16(DI) + MOVOU X0, 32(DI) + MOVOU X0, 48(DI) + MOVOU X0, 64(DI) + MOVOU X0, 80(DI) + MOVOU X0, 96(DI) + MOVOU X0, 112(DI) + MOVOU X0, 128(DI) + MOVOU X0, 144(DI) + MOVOU X0, 160(DI) + MOVOU X0, 176(DI) + MOVOU X0, 192(DI) + MOVOU X0, 208(DI) + MOVOU X0, 224(DI) + MOVOU X0, 240(DI) + SUBQ $256, BX + ADDQ $256, DI + CMPQ BX, $256 + JAE clr_loop + JMP clr_tail + +clr_1or2: + MOVB AX, (DI) + MOVB AX, -1(DI)(BX*1) + RET +clr_0: + RET +clr_3or4: + MOVW AX, (DI) + MOVW AX, -2(DI)(BX*1) + RET +clr_5through8: + MOVL AX, (DI) + MOVL AX, -4(DI)(BX*1) + RET +clr_9through16: + MOVQ AX, (DI) + MOVQ AX, -8(DI)(BX*1) + RET +clr_17through32: + MOVOU X0, (DI) + MOVOU X0, -16(DI)(BX*1) + RET +clr_33through64: + MOVOU X0, (DI) + MOVOU X0, 16(DI) + MOVOU X0, -32(DI)(BX*1) + MOVOU X0, -16(DI)(BX*1) + RET +clr_65through128: + MOVOU X0, (DI) + MOVOU X0, 16(DI) + MOVOU X0, 32(DI) + MOVOU X0, 48(DI) + MOVOU X0, -64(DI)(BX*1) + MOVOU X0, -48(DI)(BX*1) + MOVOU X0, -32(DI)(BX*1) + MOVOU X0, -16(DI)(BX*1) + RET +clr_129through256: + MOVOU X0, (DI) + MOVOU X0, 16(DI) + MOVOU X0, 32(DI) + MOVOU X0, 48(DI) + MOVOU X0, 64(DI) + MOVOU X0, 80(DI) + MOVOU X0, 96(DI) + MOVOU X0, 112(DI) + MOVOU X0, -128(DI)(BX*1) + MOVOU X0, -112(DI)(BX*1) + MOVOU X0, -96(DI)(BX*1) + MOVOU X0, -80(DI)(BX*1) + MOVOU X0, -64(DI)(BX*1) + MOVOU X0, -48(DI)(BX*1) + MOVOU X0, -32(DI)(BX*1) + MOVOU X0, -16(DI)(BX*1) + RET diff --git a/src/runtime/memclr_arm.s b/src/runtime/memclr_arm.s new file mode 100644 index 000000000..1824d33b1 --- /dev/null +++ b/src/runtime/memclr_arm.s @@ -0,0 +1,87 @@ +// Inferno's libkern/memset-arm.s +// http://code.google.com/p/inferno-os/source/browse/libkern/memset-arm.s +// +// Copyright © 1994-1999 Lucent Technologies Inc. All rights reserved. +// Revisions Copyright © 2000-2007 Vita Nuova Holdings Limited (www.vitanuova.com). All rights reserved. +// Portions Copyright 2009 The Go Authors. All rights reserved. +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in +// all copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +// THE SOFTWARE. 
+ +#include "textflag.h" + +TO = 8 +TOE = 11 +N = 12 +TMP = 12 /* N and TMP don't overlap */ + +TEXT runtime·memclr(SB),NOSPLIT,$0-8 + MOVW ptr+0(FP), R(TO) + MOVW n+4(FP), R(N) + MOVW $0, R(0) + + ADD R(N), R(TO), R(TOE) /* to end pointer */ + + CMP $4, R(N) /* need at least 4 bytes to copy */ + BLT _1tail + +_4align: /* align on 4 */ + AND.S $3, R(TO), R(TMP) + BEQ _4aligned + + MOVBU.P R(0), 1(R(TO)) /* implicit write back */ + B _4align + +_4aligned: + SUB $31, R(TOE), R(TMP) /* do 32-byte chunks if possible */ + CMP R(TMP), R(TO) + BHS _4tail + + MOVW R0, R1 /* replicate */ + MOVW R0, R2 + MOVW R0, R3 + MOVW R0, R4 + MOVW R0, R5 + MOVW R0, R6 + MOVW R0, R7 + +_f32loop: + CMP R(TMP), R(TO) + BHS _4tail + + MOVM.IA.W [R0-R7], (R(TO)) + B _f32loop + +_4tail: + SUB $3, R(TOE), R(TMP) /* do remaining words if possible */ +_4loop: + CMP R(TMP), R(TO) + BHS _1tail + + MOVW.P R(0), 4(R(TO)) /* implicit write back */ + B _4loop + +_1tail: + CMP R(TO), R(TOE) + BEQ _return + + MOVBU.P R(0), 1(R(TO)) /* implicit write back */ + B _1tail + +_return: + RET diff --git a/src/runtime/memclr_plan9_386.s b/src/runtime/memclr_plan9_386.s new file mode 100644 index 000000000..b4b671f77 --- /dev/null +++ b/src/runtime/memclr_plan9_386.s @@ -0,0 +1,51 @@ +// Copyright 2014 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#include "textflag.h" + +// void runtime·memclr(void*, uintptr) +TEXT runtime·memclr(SB), NOSPLIT, $0-8 + MOVL ptr+0(FP), DI + MOVL n+4(FP), BX + XORL AX, AX + +clr_tail: + TESTL BX, BX + JEQ clr_0 + CMPL BX, $2 + JBE clr_1or2 + CMPL BX, $4 + JBE clr_3or4 + CMPL BX, $8 + JBE clr_5through8 + CMPL BX, $16 + JBE clr_9through16 + MOVL BX, CX + SHRL $2, CX + REP + STOSL + ANDL $3, BX + JNE clr_tail + RET + +clr_1or2: + MOVB AX, (DI) + MOVB AX, -1(DI)(BX*1) + RET +clr_0: + RET +clr_3or4: + MOVW AX, (DI) + MOVW AX, -2(DI)(BX*1) + RET +clr_5through8: + MOVL AX, (DI) + MOVL AX, -4(DI)(BX*1) + RET +clr_9through16: + MOVL AX, (DI) + MOVL AX, 4(DI) + MOVL AX, -8(DI)(BX*1) + MOVL AX, -4(DI)(BX*1) + RET diff --git a/src/runtime/memclr_plan9_amd64.s b/src/runtime/memclr_plan9_amd64.s new file mode 100644 index 000000000..37e61dfbc --- /dev/null +++ b/src/runtime/memclr_plan9_amd64.s @@ -0,0 +1,21 @@ +// Copyright 2014 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#include "textflag.h" + +// void runtime·memclr(void*, uintptr) +TEXT runtime·memclr(SB),NOSPLIT,$0-16 + MOVQ ptr+0(FP), DI + MOVQ n+8(FP), CX + MOVQ CX, BX + ANDQ $7, BX + SHRQ $3, CX + MOVQ $0, AX + CLD + REP + STOSQ + MOVQ BX, CX + REP + STOSB + RET diff --git a/src/runtime/memmove_386.s b/src/runtime/memmove_386.s new file mode 100644 index 000000000..4c0c74c1a --- /dev/null +++ b/src/runtime/memmove_386.s @@ -0,0 +1,176 @@ +// Inferno's libkern/memmove-386.s +// http://code.google.com/p/inferno-os/source/browse/libkern/memmove-386.s +// +// Copyright © 1994-1999 Lucent Technologies Inc. All rights reserved. +// Revisions Copyright © 2000-2007 Vita Nuova Holdings Limited (www.vitanuova.com). All rights reserved. +// Portions Copyright 2009 The Go Authors. All rights reserved. 
+// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in +// all copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +// THE SOFTWARE. + +// +build !plan9 + +#include "textflag.h" + +TEXT runtime·memmove(SB), NOSPLIT, $0-12 + MOVL to+0(FP), DI + MOVL from+4(FP), SI + MOVL n+8(FP), BX + + // REP instructions have a high startup cost, so we handle small sizes + // with some straightline code. The REP MOVSL instruction is really fast + // for large sizes. The cutover is approximately 1K. We implement up to + // 128 because that is the maximum SSE register load (loading all data + // into registers lets us ignore copy direction). +tail: + TESTL BX, BX + JEQ move_0 + CMPL BX, $2 + JBE move_1or2 + CMPL BX, $4 + JBE move_3or4 + CMPL BX, $8 + JBE move_5through8 + CMPL BX, $16 + JBE move_9through16 + TESTL $0x4000000, runtime·cpuid_edx(SB) // check for sse2 + JEQ nosse2 + CMPL BX, $32 + JBE move_17through32 + CMPL BX, $64 + JBE move_33through64 + CMPL BX, $128 + JBE move_65through128 + // TODO: use branch table and BSR to make this just a single dispatch + +nosse2: +/* + * check and set for backwards + */ + CMPL SI, DI + JLS back + +/* + * forward copy loop + */ +forward: + MOVL BX, CX + SHRL $2, CX + ANDL $3, BX + + REP; MOVSL + JMP tail +/* + * check overlap + */ +back: + MOVL SI, CX + ADDL BX, CX + CMPL CX, DI + JLS forward +/* + * whole thing backwards has + * adjusted addresses + */ + + ADDL BX, DI + ADDL BX, SI + STD + +/* + * copy + */ + MOVL BX, CX + SHRL $2, CX + ANDL $3, BX + + SUBL $4, DI + SUBL $4, SI + REP; MOVSL + + CLD + ADDL $4, DI + ADDL $4, SI + SUBL BX, DI + SUBL BX, SI + JMP tail + +move_1or2: + MOVB (SI), AX + MOVB -1(SI)(BX*1), CX + MOVB AX, (DI) + MOVB CX, -1(DI)(BX*1) + RET +move_0: + RET +move_3or4: + MOVW (SI), AX + MOVW -2(SI)(BX*1), CX + MOVW AX, (DI) + MOVW CX, -2(DI)(BX*1) + RET +move_5through8: + MOVL (SI), AX + MOVL -4(SI)(BX*1), CX + MOVL AX, (DI) + MOVL CX, -4(DI)(BX*1) + RET +move_9through16: + MOVL (SI), AX + MOVL 4(SI), CX + MOVL -8(SI)(BX*1), DX + MOVL -4(SI)(BX*1), BP + MOVL AX, (DI) + MOVL CX, 4(DI) + MOVL DX, -8(DI)(BX*1) + MOVL BP, -4(DI)(BX*1) + RET +move_17through32: + MOVOU (SI), X0 + MOVOU -16(SI)(BX*1), X1 + MOVOU X0, (DI) + MOVOU X1, -16(DI)(BX*1) + RET +move_33through64: + MOVOU (SI), X0 + MOVOU 16(SI), X1 + MOVOU -32(SI)(BX*1), X2 + MOVOU -16(SI)(BX*1), X3 + MOVOU X0, (DI) + MOVOU X1, 16(DI) + MOVOU X2, -32(DI)(BX*1) + MOVOU X3, -16(DI)(BX*1) + RET +move_65through128: + MOVOU (SI), X0 + MOVOU 16(SI), X1 + MOVOU 32(SI), X2 + MOVOU 48(SI), X3 + MOVOU -64(SI)(BX*1), X4 + MOVOU -48(SI)(BX*1), X5 + MOVOU 
-32(SI)(BX*1), X6 + MOVOU -16(SI)(BX*1), X7 + MOVOU X0, (DI) + MOVOU X1, 16(DI) + MOVOU X2, 32(DI) + MOVOU X3, 48(DI) + MOVOU X4, -64(DI)(BX*1) + MOVOU X5, -48(DI)(BX*1) + MOVOU X6, -32(DI)(BX*1) + MOVOU X7, -16(DI)(BX*1) + RET diff --git a/src/runtime/memmove_amd64.s b/src/runtime/memmove_amd64.s new file mode 100644 index 000000000..f96843534 --- /dev/null +++ b/src/runtime/memmove_amd64.s @@ -0,0 +1,252 @@ +// Derived from Inferno's libkern/memmove-386.s (adapted for amd64) +// http://code.google.com/p/inferno-os/source/browse/libkern/memmove-386.s +// +// Copyright © 1994-1999 Lucent Technologies Inc. All rights reserved. +// Revisions Copyright © 2000-2007 Vita Nuova Holdings Limited (www.vitanuova.com). All rights reserved. +// Portions Copyright 2009 The Go Authors. All rights reserved. +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in +// all copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +// THE SOFTWARE. + +// +build !plan9 + +#include "textflag.h" + +// void runtime·memmove(void*, void*, uintptr) +TEXT runtime·memmove(SB), NOSPLIT, $0-24 + + MOVQ to+0(FP), DI + MOVQ from+8(FP), SI + MOVQ n+16(FP), BX + + // REP instructions have a high startup cost, so we handle small sizes + // with some straightline code. The REP MOVSQ instruction is really fast + // for large sizes. The cutover is approximately 2K. +tail: + // move_129through256 or smaller work whether or not the source and the + // destination memory regions overlap because they load all data into + // registers before writing it back. move_256through2048 on the other + // hand can be used only when the memory regions don't overlap or the copy + // direction is forward. 
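The comparisons below (CMPQ SI, DI and the overlap test at back:) make that direction choice: copying forward is safe when the destination starts at or below the source, or when the regions are disjoint; otherwise the copy must run backward so bytes are not overwritten before they are read. A hypothetical scalar rendering in Go, assuming the unsafe package:

    // memmoveSketch shows only the direction decision; the assembly then
    // copies with REP MOVSQ or the SSE register cases above.
    func memmoveSketch(dst, src []byte) {
        n := len(src)
        if n == 0 {
            return
        }
        d := uintptr(unsafe.Pointer(&dst[0]))
        s := uintptr(unsafe.Pointer(&src[0]))
        if d <= s || s+uintptr(n) <= d { // forward is safe
            for i := 0; i < n; i++ {
                dst[i] = src[i]
            }
            return
        }
        for i := n - 1; i >= 0; i-- { // dst overlaps the tail of src: go backward
            dst[i] = src[i]
        }
    }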
+ TESTQ BX, BX + JEQ move_0 + CMPQ BX, $2 + JBE move_1or2 + CMPQ BX, $4 + JBE move_3or4 + CMPQ BX, $8 + JBE move_5through8 + CMPQ BX, $16 + JBE move_9through16 + CMPQ BX, $32 + JBE move_17through32 + CMPQ BX, $64 + JBE move_33through64 + CMPQ BX, $128 + JBE move_65through128 + CMPQ BX, $256 + JBE move_129through256 + // TODO: use branch table and BSR to make this just a single dispatch + +/* + * check and set for backwards + */ + CMPQ SI, DI + JLS back + +/* + * forward copy loop + */ +forward: + CMPQ BX, $2048 + JLS move_256through2048 + + MOVQ BX, CX + SHRQ $3, CX + ANDQ $7, BX + REP; MOVSQ + JMP tail + +back: +/* + * check overlap + */ + MOVQ SI, CX + ADDQ BX, CX + CMPQ CX, DI + JLS forward + +/* + * whole thing backwards has + * adjusted addresses + */ + ADDQ BX, DI + ADDQ BX, SI + STD + +/* + * copy + */ + MOVQ BX, CX + SHRQ $3, CX + ANDQ $7, BX + + SUBQ $8, DI + SUBQ $8, SI + REP; MOVSQ + + CLD + ADDQ $8, DI + ADDQ $8, SI + SUBQ BX, DI + SUBQ BX, SI + JMP tail + +move_1or2: + MOVB (SI), AX + MOVB -1(SI)(BX*1), CX + MOVB AX, (DI) + MOVB CX, -1(DI)(BX*1) + RET +move_0: + RET +move_3or4: + MOVW (SI), AX + MOVW -2(SI)(BX*1), CX + MOVW AX, (DI) + MOVW CX, -2(DI)(BX*1) + RET +move_5through8: + MOVL (SI), AX + MOVL -4(SI)(BX*1), CX + MOVL AX, (DI) + MOVL CX, -4(DI)(BX*1) + RET +move_9through16: + MOVQ (SI), AX + MOVQ -8(SI)(BX*1), CX + MOVQ AX, (DI) + MOVQ CX, -8(DI)(BX*1) + RET +move_17through32: + MOVOU (SI), X0 + MOVOU -16(SI)(BX*1), X1 + MOVOU X0, (DI) + MOVOU X1, -16(DI)(BX*1) + RET +move_33through64: + MOVOU (SI), X0 + MOVOU 16(SI), X1 + MOVOU -32(SI)(BX*1), X2 + MOVOU -16(SI)(BX*1), X3 + MOVOU X0, (DI) + MOVOU X1, 16(DI) + MOVOU X2, -32(DI)(BX*1) + MOVOU X3, -16(DI)(BX*1) + RET +move_65through128: + MOVOU (SI), X0 + MOVOU 16(SI), X1 + MOVOU 32(SI), X2 + MOVOU 48(SI), X3 + MOVOU -64(SI)(BX*1), X4 + MOVOU -48(SI)(BX*1), X5 + MOVOU -32(SI)(BX*1), X6 + MOVOU -16(SI)(BX*1), X7 + MOVOU X0, (DI) + MOVOU X1, 16(DI) + MOVOU X2, 32(DI) + MOVOU X3, 48(DI) + MOVOU X4, -64(DI)(BX*1) + MOVOU X5, -48(DI)(BX*1) + MOVOU X6, -32(DI)(BX*1) + MOVOU X7, -16(DI)(BX*1) + RET +move_129through256: + MOVOU (SI), X0 + MOVOU 16(SI), X1 + MOVOU 32(SI), X2 + MOVOU 48(SI), X3 + MOVOU 64(SI), X4 + MOVOU 80(SI), X5 + MOVOU 96(SI), X6 + MOVOU 112(SI), X7 + MOVOU -128(SI)(BX*1), X8 + MOVOU -112(SI)(BX*1), X9 + MOVOU -96(SI)(BX*1), X10 + MOVOU -80(SI)(BX*1), X11 + MOVOU -64(SI)(BX*1), X12 + MOVOU -48(SI)(BX*1), X13 + MOVOU -32(SI)(BX*1), X14 + MOVOU -16(SI)(BX*1), X15 + MOVOU X0, (DI) + MOVOU X1, 16(DI) + MOVOU X2, 32(DI) + MOVOU X3, 48(DI) + MOVOU X4, 64(DI) + MOVOU X5, 80(DI) + MOVOU X6, 96(DI) + MOVOU X7, 112(DI) + MOVOU X8, -128(DI)(BX*1) + MOVOU X9, -112(DI)(BX*1) + MOVOU X10, -96(DI)(BX*1) + MOVOU X11, -80(DI)(BX*1) + MOVOU X12, -64(DI)(BX*1) + MOVOU X13, -48(DI)(BX*1) + MOVOU X14, -32(DI)(BX*1) + MOVOU X15, -16(DI)(BX*1) + RET +move_256through2048: + SUBQ $256, BX + MOVOU (SI), X0 + MOVOU 16(SI), X1 + MOVOU 32(SI), X2 + MOVOU 48(SI), X3 + MOVOU 64(SI), X4 + MOVOU 80(SI), X5 + MOVOU 96(SI), X6 + MOVOU 112(SI), X7 + MOVOU 128(SI), X8 + MOVOU 144(SI), X9 + MOVOU 160(SI), X10 + MOVOU 176(SI), X11 + MOVOU 192(SI), X12 + MOVOU 208(SI), X13 + MOVOU 224(SI), X14 + MOVOU 240(SI), X15 + MOVOU X0, (DI) + MOVOU X1, 16(DI) + MOVOU X2, 32(DI) + MOVOU X3, 48(DI) + MOVOU X4, 64(DI) + MOVOU X5, 80(DI) + MOVOU X6, 96(DI) + MOVOU X7, 112(DI) + MOVOU X8, 128(DI) + MOVOU X9, 144(DI) + MOVOU X10, 160(DI) + MOVOU X11, 176(DI) + MOVOU X12, 192(DI) + MOVOU X13, 208(DI) + MOVOU X14, 224(DI) + MOVOU X15, 240(DI) + CMPQ BX, $256 + LEAQ 
256(SI), SI + LEAQ 256(DI), DI + JGE move_256through2048 + JMP tail diff --git a/src/runtime/memmove_arm.s b/src/runtime/memmove_arm.s new file mode 100644 index 000000000..f187d4267 --- /dev/null +++ b/src/runtime/memmove_arm.s @@ -0,0 +1,261 @@ +// Inferno's libkern/memmove-arm.s +// http://code.google.com/p/inferno-os/source/browse/libkern/memmove-arm.s +// +// Copyright © 1994-1999 Lucent Technologies Inc. All rights reserved. +// Revisions Copyright © 2000-2007 Vita Nuova Holdings Limited (www.vitanuova.com). All rights reserved. +// Portions Copyright 2009 The Go Authors. All rights reserved. +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in +// all copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +// THE SOFTWARE. + +#include "textflag.h" + +// TE or TS are spilled to the stack during bulk register moves. +TS = 0 +TE = 8 + +// Warning: the linker will use R11 to synthesize certain instructions. Please +// take care and double check with objdump. +FROM = 11 +N = 12 +TMP = 12 /* N and TMP don't overlap */ +TMP1 = 5 + +RSHIFT = 5 +LSHIFT = 6 +OFFSET = 7 + +BR0 = 0 /* shared with TS */ +BW0 = 1 +BR1 = 1 +BW1 = 2 +BR2 = 2 +BW2 = 3 +BR3 = 3 +BW3 = 4 + +FW0 = 1 +FR0 = 2 +FW1 = 2 +FR1 = 3 +FW2 = 3 +FR2 = 4 +FW3 = 4 +FR3 = 8 /* shared with TE */ + +TEXT runtime·memmove(SB), NOSPLIT, $4-12 +_memmove: + MOVW to+0(FP), R(TS) + MOVW from+4(FP), R(FROM) + MOVW n+8(FP), R(N) + + ADD R(N), R(TS), R(TE) /* to end pointer */ + + CMP R(FROM), R(TS) + BLS _forward + +_back: + ADD R(N), R(FROM) /* from end pointer */ + CMP $4, R(N) /* need at least 4 bytes to copy */ + BLT _b1tail + +_b4align: /* align destination on 4 */ + AND.S $3, R(TE), R(TMP) + BEQ _b4aligned + + MOVBU.W -1(R(FROM)), R(TMP) /* pre-indexed */ + MOVBU.W R(TMP), -1(R(TE)) /* pre-indexed */ + B _b4align + +_b4aligned: /* is source now aligned? 
*/ + AND.S $3, R(FROM), R(TMP) + BNE _bunaligned + + ADD $31, R(TS), R(TMP) /* do 32-byte chunks if possible */ + MOVW R(TS), savedts-4(SP) +_b32loop: + CMP R(TMP), R(TE) + BLS _b4tail + + MOVM.DB.W (R(FROM)), [R0-R7] + MOVM.DB.W [R0-R7], (R(TE)) + B _b32loop + +_b4tail: /* do remaining words if possible */ + MOVW savedts-4(SP), R(TS) + ADD $3, R(TS), R(TMP) +_b4loop: + CMP R(TMP), R(TE) + BLS _b1tail + + MOVW.W -4(R(FROM)), R(TMP1) /* pre-indexed */ + MOVW.W R(TMP1), -4(R(TE)) /* pre-indexed */ + B _b4loop + +_b1tail: /* remaining bytes */ + CMP R(TE), R(TS) + BEQ _return + + MOVBU.W -1(R(FROM)), R(TMP) /* pre-indexed */ + MOVBU.W R(TMP), -1(R(TE)) /* pre-indexed */ + B _b1tail + +_forward: + CMP $4, R(N) /* need at least 4 bytes to copy */ + BLT _f1tail + +_f4align: /* align destination on 4 */ + AND.S $3, R(TS), R(TMP) + BEQ _f4aligned + + MOVBU.P 1(R(FROM)), R(TMP) /* implicit write back */ + MOVBU.P R(TMP), 1(R(TS)) /* implicit write back */ + B _f4align + +_f4aligned: /* is source now aligned? */ + AND.S $3, R(FROM), R(TMP) + BNE _funaligned + + SUB $31, R(TE), R(TMP) /* do 32-byte chunks if possible */ + MOVW R(TE), savedte-4(SP) +_f32loop: + CMP R(TMP), R(TS) + BHS _f4tail + + MOVM.IA.W (R(FROM)), [R1-R8] + MOVM.IA.W [R1-R8], (R(TS)) + B _f32loop + +_f4tail: + MOVW savedte-4(SP), R(TE) + SUB $3, R(TE), R(TMP) /* do remaining words if possible */ +_f4loop: + CMP R(TMP), R(TS) + BHS _f1tail + + MOVW.P 4(R(FROM)), R(TMP1) /* implicit write back */ + MOVW.P R(TMP1), 4(R(TS)) /* implicit write back */ + B _f4loop + +_f1tail: + CMP R(TS), R(TE) + BEQ _return + + MOVBU.P 1(R(FROM)), R(TMP) /* implicit write back */ + MOVBU.P R(TMP), 1(R(TS)) /* implicit write back */ + B _f1tail + +_return: + MOVW to+0(FP), R0 + RET + +_bunaligned: + CMP $2, R(TMP) /* is R(TMP) < 2 ? 
*/ + + MOVW.LT $8, R(RSHIFT) /* (R(n)<<24)|(R(n-1)>>8) */ + MOVW.LT $24, R(LSHIFT) + MOVW.LT $1, R(OFFSET) + + MOVW.EQ $16, R(RSHIFT) /* (R(n)<<16)|(R(n-1)>>16) */ + MOVW.EQ $16, R(LSHIFT) + MOVW.EQ $2, R(OFFSET) + + MOVW.GT $24, R(RSHIFT) /* (R(n)<<8)|(R(n-1)>>24) */ + MOVW.GT $8, R(LSHIFT) + MOVW.GT $3, R(OFFSET) + + ADD $16, R(TS), R(TMP) /* do 16-byte chunks if possible */ + CMP R(TMP), R(TE) + BLS _b1tail + + BIC $3, R(FROM) /* align source */ + MOVW R(TS), savedts-4(SP) + MOVW (R(FROM)), R(BR0) /* prime first block register */ + +_bu16loop: + CMP R(TMP), R(TE) + BLS _bu1tail + + MOVW R(BR0)<<R(LSHIFT), R(BW3) + MOVM.DB.W (R(FROM)), [R(BR0)-R(BR3)] + ORR R(BR3)>>R(RSHIFT), R(BW3) + + MOVW R(BR3)<<R(LSHIFT), R(BW2) + ORR R(BR2)>>R(RSHIFT), R(BW2) + + MOVW R(BR2)<<R(LSHIFT), R(BW1) + ORR R(BR1)>>R(RSHIFT), R(BW1) + + MOVW R(BR1)<<R(LSHIFT), R(BW0) + ORR R(BR0)>>R(RSHIFT), R(BW0) + + MOVM.DB.W [R(BW0)-R(BW3)], (R(TE)) + B _bu16loop + +_bu1tail: + MOVW savedts-4(SP), R(TS) + ADD R(OFFSET), R(FROM) + B _b1tail + +_funaligned: + CMP $2, R(TMP) + + MOVW.LT $8, R(RSHIFT) /* (R(n+1)<<24)|(R(n)>>8) */ + MOVW.LT $24, R(LSHIFT) + MOVW.LT $3, R(OFFSET) + + MOVW.EQ $16, R(RSHIFT) /* (R(n+1)<<16)|(R(n)>>16) */ + MOVW.EQ $16, R(LSHIFT) + MOVW.EQ $2, R(OFFSET) + + MOVW.GT $24, R(RSHIFT) /* (R(n+1)<<8)|(R(n)>>24) */ + MOVW.GT $8, R(LSHIFT) + MOVW.GT $1, R(OFFSET) + + SUB $16, R(TE), R(TMP) /* do 16-byte chunks if possible */ + CMP R(TMP), R(TS) + BHS _f1tail + + BIC $3, R(FROM) /* align source */ + MOVW R(TE), savedte-4(SP) + MOVW.P 4(R(FROM)), R(FR3) /* prime last block register, implicit write back */ + +_fu16loop: + CMP R(TMP), R(TS) + BHS _fu1tail + + MOVW R(FR3)>>R(RSHIFT), R(FW0) + MOVM.IA.W (R(FROM)), [R(FR0),R(FR1),R(FR2),R(FR3)] + ORR R(FR0)<<R(LSHIFT), R(FW0) + + MOVW R(FR0)>>R(RSHIFT), R(FW1) + ORR R(FR1)<<R(LSHIFT), R(FW1) + + MOVW R(FR1)>>R(RSHIFT), R(FW2) + ORR R(FR2)<<R(LSHIFT), R(FW2) + + MOVW R(FR2)>>R(RSHIFT), R(FW3) + ORR R(FR3)<<R(LSHIFT), R(FW3) + + MOVM.IA.W [R(FW0),R(FW1),R(FW2),R(FW3)], (R(TS)) + B _fu16loop + +_fu1tail: + MOVW savedte-4(SP), R(TE) + SUB R(OFFSET), R(FROM) + B _f1tail diff --git a/src/runtime/memmove_linux_amd64_test.go b/src/runtime/memmove_linux_amd64_test.go new file mode 100644 index 000000000..f7221f4f5 --- /dev/null +++ b/src/runtime/memmove_linux_amd64_test.go @@ -0,0 +1,61 @@ +// Copyright 2013 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package runtime_test + +import ( + "io/ioutil" + "os" + "reflect" + "syscall" + "testing" + "unsafe" +) + +// TestMemmoveOverflow maps 3GB of memory and calls memmove on +// the corresponding slice. +func TestMemmoveOverflow(t *testing.T) { + // Create a temporary file. + tmp, err := ioutil.TempFile("", "go-memmovetest") + if err != nil { + t.Fatal(err) + } + _, err = tmp.Write(make([]byte, 65536)) + if err != nil { + t.Fatal(err) + } + defer os.Remove(tmp.Name()) + defer tmp.Close() + + // Set up mappings. 
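Every 64 KiB window mapped below is MAP_SHARED at offset 0 of the same temporary file, so the whole 3 GB region aliases a single 64 KiB of backing store and the test stays cheap; the large length is what matters, not the data. A hypothetical check of that aliasing, usable once the slice s is set up further down:

    // checkAliasing reports whether two adjacent 64 KiB windows really map
    // the same file pages: a store through one must be visible through the next.
    func checkAliasing(s []byte) bool {
        s[0] = 42
        return s[1<<16] == 42
    }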
+ base, _, errno := syscall.Syscall6(syscall.SYS_MMAP, + 0xa0<<32, 3<<30, syscall.PROT_READ|syscall.PROT_WRITE, syscall.MAP_PRIVATE|syscall.MAP_ANONYMOUS, ^uintptr(0), 0) + if errno != 0 { + t.Skipf("could not create memory mapping: %s", errno) + } + syscall.Syscall(syscall.SYS_MUNMAP, base, 3<<30, 0) + + for off := uintptr(0); off < 3<<30; off += 65536 { + _, _, errno := syscall.Syscall6(syscall.SYS_MMAP, + base+off, 65536, syscall.PROT_READ|syscall.PROT_WRITE, syscall.MAP_SHARED|syscall.MAP_FIXED, tmp.Fd(), 0) + if errno != 0 { + t.Fatalf("could not map a page at requested 0x%x: %s", base+off, errno) + } + defer syscall.Syscall(syscall.SYS_MUNMAP, base+off, 65536, 0) + } + + var s []byte + sp := (*reflect.SliceHeader)(unsafe.Pointer(&s)) + sp.Data = base + sp.Len, sp.Cap = 3<<30, 3<<30 + + n := copy(s[1:], s) + if n != 3<<30-1 { + t.Fatalf("copied %d bytes, expected %d", n, 3<<30-1) + } + n = copy(s, s[1:]) + if n != 3<<30-1 { + t.Fatalf("copied %d bytes, expected %d", n, 3<<30-1) + } +} diff --git a/src/runtime/memmove_nacl_amd64p32.s b/src/runtime/memmove_nacl_amd64p32.s new file mode 100644 index 000000000..373607afe --- /dev/null +++ b/src/runtime/memmove_nacl_amd64p32.s @@ -0,0 +1,46 @@ +// Copyright 2013 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#include "textflag.h" + +TEXT runtime·memmove(SB), NOSPLIT, $0-12 + MOVL to+0(FP), DI + MOVL from+4(FP), SI + MOVL n+8(FP), BX + + CMPL SI, DI + JLS back + +forward: + MOVL BX, CX + SHRL $3, CX + ANDL $7, BX + REP; MOVSQ + MOVL BX, CX + REP; MOVSB + RET + +back: + MOVL SI, CX + ADDL BX, CX + CMPL CX, DI + JLS forward + + ADDL BX, DI + ADDL BX, SI + STD + + MOVL BX, CX + SHRL $3, CX + ANDL $7, BX + SUBL $8, DI + SUBL $8, SI + REP; MOVSQ + ADDL $7, DI + ADDL $7, SI + MOVL BX, CX + REP; MOVSB + CLD + + RET diff --git a/src/runtime/memmove_plan9_386.s b/src/runtime/memmove_plan9_386.s new file mode 100644 index 000000000..025d4ce1b --- /dev/null +++ b/src/runtime/memmove_plan9_386.s @@ -0,0 +1,128 @@ +// Inferno's libkern/memmove-386.s +// http://code.google.com/p/inferno-os/source/browse/libkern/memmove-386.s +// +// Copyright © 1994-1999 Lucent Technologies Inc. All rights reserved. +// Revisions Copyright © 2000-2007 Vita Nuova Holdings Limited (www.vitanuova.com). All rights reserved. +// Portions Copyright 2009 The Go Authors. All rights reserved. +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in +// all copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +// THE SOFTWARE. 
+ +#include "textflag.h" + +TEXT runtime·memmove(SB), NOSPLIT, $0-12 + MOVL to+0(FP), DI + MOVL from+4(FP), SI + MOVL n+8(FP), BX + + // REP instructions have a high startup cost, so we handle small sizes + // with some straightline code. The REP MOVSL instruction is really fast + // for large sizes. The cutover is approximately 1K. +tail: + TESTL BX, BX + JEQ move_0 + CMPL BX, $2 + JBE move_1or2 + CMPL BX, $4 + JBE move_3or4 + CMPL BX, $8 + JBE move_5through8 + CMPL BX, $16 + JBE move_9through16 + +/* + * check and set for backwards + */ + CMPL SI, DI + JLS back + +/* + * forward copy loop + */ +forward: + MOVL BX, CX + SHRL $2, CX + ANDL $3, BX + + REP; MOVSL + JMP tail +/* + * check overlap + */ +back: + MOVL SI, CX + ADDL BX, CX + CMPL CX, DI + JLS forward +/* + * whole thing backwards has + * adjusted addresses + */ + + ADDL BX, DI + ADDL BX, SI + STD + +/* + * copy + */ + MOVL BX, CX + SHRL $2, CX + ANDL $3, BX + + SUBL $4, DI + SUBL $4, SI + REP; MOVSL + + CLD + ADDL $4, DI + ADDL $4, SI + SUBL BX, DI + SUBL BX, SI + JMP tail + +move_1or2: + MOVB (SI), AX + MOVB -1(SI)(BX*1), CX + MOVB AX, (DI) + MOVB CX, -1(DI)(BX*1) + RET +move_0: + RET +move_3or4: + MOVW (SI), AX + MOVW -2(SI)(BX*1), CX + MOVW AX, (DI) + MOVW CX, -2(DI)(BX*1) + RET +move_5through8: + MOVL (SI), AX + MOVL -4(SI)(BX*1), CX + MOVL AX, (DI) + MOVL CX, -4(DI)(BX*1) + RET +move_9through16: + MOVL (SI), AX + MOVL 4(SI), CX + MOVL -8(SI)(BX*1), DX + MOVL -4(SI)(BX*1), BP + MOVL AX, (DI) + MOVL CX, 4(DI) + MOVL DX, -8(DI)(BX*1) + MOVL BP, -4(DI)(BX*1) + RET diff --git a/src/runtime/memmove_plan9_amd64.s b/src/runtime/memmove_plan9_amd64.s new file mode 100644 index 000000000..8e96b8717 --- /dev/null +++ b/src/runtime/memmove_plan9_amd64.s @@ -0,0 +1,127 @@ +// Derived from Inferno's libkern/memmove-386.s (adapted for amd64) +// http://code.google.com/p/inferno-os/source/browse/libkern/memmove-386.s +// +// Copyright © 1994-1999 Lucent Technologies Inc. All rights reserved. +// Revisions Copyright © 2000-2007 Vita Nuova Holdings Limited (www.vitanuova.com). All rights reserved. +// Portions Copyright 2009 The Go Authors. All rights reserved. +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in +// all copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +// THE SOFTWARE. + +#include "textflag.h" + +// void runtime·memmove(void*, void*, uintptr) +TEXT runtime·memmove(SB), NOSPLIT, $0-24 + + MOVQ to+0(FP), DI + MOVQ from+8(FP), SI + MOVQ n+16(FP), BX + + // REP instructions have a high startup cost, so we handle small sizes + // with some straightline code. 
The REP MOVSQ instruction is really fast + // for large sizes. The cutover is approximately 1K. +tail: + TESTQ BX, BX + JEQ move_0 + CMPQ BX, $2 + JBE move_1or2 + CMPQ BX, $4 + JBE move_3or4 + CMPQ BX, $8 + JBE move_5through8 + CMPQ BX, $16 + JBE move_9through16 + +/* + * check and set for backwards + */ + CMPQ SI, DI + JLS back + +/* + * forward copy loop + */ +forward: + MOVQ BX, CX + SHRQ $3, CX + ANDQ $7, BX + + REP; MOVSQ + JMP tail + +back: +/* + * check overlap + */ + MOVQ SI, CX + ADDQ BX, CX + CMPQ CX, DI + JLS forward + +/* + * whole thing backwards has + * adjusted addresses + */ + ADDQ BX, DI + ADDQ BX, SI + STD + +/* + * copy + */ + MOVQ BX, CX + SHRQ $3, CX + ANDQ $7, BX + + SUBQ $8, DI + SUBQ $8, SI + REP; MOVSQ + + CLD + ADDQ $8, DI + ADDQ $8, SI + SUBQ BX, DI + SUBQ BX, SI + JMP tail + +move_1or2: + MOVB (SI), AX + MOVB -1(SI)(BX*1), CX + MOVB AX, (DI) + MOVB CX, -1(DI)(BX*1) + RET +move_0: + RET +move_3or4: + MOVW (SI), AX + MOVW -2(SI)(BX*1), CX + MOVW AX, (DI) + MOVW CX, -2(DI)(BX*1) + RET +move_5through8: + MOVL (SI), AX + MOVL -4(SI)(BX*1), CX + MOVL AX, (DI) + MOVL CX, -4(DI)(BX*1) + RET +move_9through16: + MOVQ (SI), AX + MOVQ -8(SI)(BX*1), CX + MOVQ AX, (DI) + MOVQ CX, -8(DI)(BX*1) + RET diff --git a/src/runtime/memmove_test.go b/src/runtime/memmove_test.go new file mode 100644 index 000000000..ffda4fe6c --- /dev/null +++ b/src/runtime/memmove_test.go @@ -0,0 +1,295 @@ +// Copyright 2013 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package runtime_test + +import ( + . "runtime" + "testing" +) + +func TestMemmove(t *testing.T) { + size := 256 + if testing.Short() { + size = 128 + 16 + } + src := make([]byte, size) + dst := make([]byte, size) + for i := 0; i < size; i++ { + src[i] = byte(128 + (i & 127)) + } + for i := 0; i < size; i++ { + dst[i] = byte(i & 127) + } + for n := 0; n <= size; n++ { + for x := 0; x <= size-n; x++ { // offset in src + for y := 0; y <= size-n; y++ { // offset in dst + copy(dst[y:y+n], src[x:x+n]) + for i := 0; i < y; i++ { + if dst[i] != byte(i&127) { + t.Fatalf("prefix dst[%d] = %d", i, dst[i]) + } + } + for i := y; i < y+n; i++ { + if dst[i] != byte(128+((i-y+x)&127)) { + t.Fatalf("copied dst[%d] = %d", i, dst[i]) + } + dst[i] = byte(i & 127) // reset dst + } + for i := y + n; i < size; i++ { + if dst[i] != byte(i&127) { + t.Fatalf("suffix dst[%d] = %d", i, dst[i]) + } + } + } + } + } +} + +func TestMemmoveAlias(t *testing.T) { + size := 256 + if testing.Short() { + size = 128 + 16 + } + buf := make([]byte, size) + for i := 0; i < size; i++ { + buf[i] = byte(i) + } + for n := 0; n <= size; n++ { + for x := 0; x <= size-n; x++ { // src offset + for y := 0; y <= size-n; y++ { // dst offset + copy(buf[y:y+n], buf[x:x+n]) + for i := 0; i < y; i++ { + if buf[i] != byte(i) { + t.Fatalf("prefix buf[%d] = %d", i, buf[i]) + } + } + for i := y; i < y+n; i++ { + if buf[i] != byte(i-y+x) { + t.Fatalf("copied buf[%d] = %d", i, buf[i]) + } + buf[i] = byte(i) // reset buf + } + for i := y + n; i < size; i++ { + if buf[i] != byte(i) { + t.Fatalf("suffix buf[%d] = %d", i, buf[i]) + } + } + } + } + } +} + +func bmMemmove(b *testing.B, n int) { + x := make([]byte, n) + y := make([]byte, n) + b.SetBytes(int64(n)) + for i := 0; i < b.N; i++ { + copy(x, y) + } +} + +func BenchmarkMemmove0(b *testing.B) { bmMemmove(b, 0) } +func BenchmarkMemmove1(b *testing.B) { bmMemmove(b, 1) } +func BenchmarkMemmove2(b *testing.B) { bmMemmove(b, 2) } +func 
BenchmarkMemmove3(b *testing.B) { bmMemmove(b, 3) } +func BenchmarkMemmove4(b *testing.B) { bmMemmove(b, 4) } +func BenchmarkMemmove5(b *testing.B) { bmMemmove(b, 5) } +func BenchmarkMemmove6(b *testing.B) { bmMemmove(b, 6) } +func BenchmarkMemmove7(b *testing.B) { bmMemmove(b, 7) } +func BenchmarkMemmove8(b *testing.B) { bmMemmove(b, 8) } +func BenchmarkMemmove9(b *testing.B) { bmMemmove(b, 9) } +func BenchmarkMemmove10(b *testing.B) { bmMemmove(b, 10) } +func BenchmarkMemmove11(b *testing.B) { bmMemmove(b, 11) } +func BenchmarkMemmove12(b *testing.B) { bmMemmove(b, 12) } +func BenchmarkMemmove13(b *testing.B) { bmMemmove(b, 13) } +func BenchmarkMemmove14(b *testing.B) { bmMemmove(b, 14) } +func BenchmarkMemmove15(b *testing.B) { bmMemmove(b, 15) } +func BenchmarkMemmove16(b *testing.B) { bmMemmove(b, 16) } +func BenchmarkMemmove32(b *testing.B) { bmMemmove(b, 32) } +func BenchmarkMemmove64(b *testing.B) { bmMemmove(b, 64) } +func BenchmarkMemmove128(b *testing.B) { bmMemmove(b, 128) } +func BenchmarkMemmove256(b *testing.B) { bmMemmove(b, 256) } +func BenchmarkMemmove512(b *testing.B) { bmMemmove(b, 512) } +func BenchmarkMemmove1024(b *testing.B) { bmMemmove(b, 1024) } +func BenchmarkMemmove2048(b *testing.B) { bmMemmove(b, 2048) } +func BenchmarkMemmove4096(b *testing.B) { bmMemmove(b, 4096) } + +func TestMemclr(t *testing.T) { + size := 512 + if testing.Short() { + size = 128 + 16 + } + mem := make([]byte, size) + for i := 0; i < size; i++ { + mem[i] = 0xee + } + for n := 0; n < size; n++ { + for x := 0; x <= size-n; x++ { // offset in mem + MemclrBytes(mem[x : x+n]) + for i := 0; i < x; i++ { + if mem[i] != 0xee { + t.Fatalf("overwrite prefix mem[%d] = %d", i, mem[i]) + } + } + for i := x; i < x+n; i++ { + if mem[i] != 0 { + t.Fatalf("failed clear mem[%d] = %d", i, mem[i]) + } + mem[i] = 0xee + } + for i := x + n; i < size; i++ { + if mem[i] != 0xee { + t.Fatalf("overwrite suffix mem[%d] = %d", i, mem[i]) + } + } + } + } +} + +func bmMemclr(b *testing.B, n int) { + x := make([]byte, n) + b.SetBytes(int64(n)) + for i := 0; i < b.N; i++ { + MemclrBytes(x) + } +} +func BenchmarkMemclr5(b *testing.B) { bmMemclr(b, 5) } +func BenchmarkMemclr16(b *testing.B) { bmMemclr(b, 16) } +func BenchmarkMemclr64(b *testing.B) { bmMemclr(b, 64) } +func BenchmarkMemclr256(b *testing.B) { bmMemclr(b, 256) } +func BenchmarkMemclr4096(b *testing.B) { bmMemclr(b, 4096) } +func BenchmarkMemclr65536(b *testing.B) { bmMemclr(b, 65536) } + +func BenchmarkClearFat8(b *testing.B) { + for i := 0; i < b.N; i++ { + var x [8 / 4]uint32 + _ = x + } +} +func BenchmarkClearFat12(b *testing.B) { + for i := 0; i < b.N; i++ { + var x [12 / 4]uint32 + _ = x + } +} +func BenchmarkClearFat16(b *testing.B) { + for i := 0; i < b.N; i++ { + var x [16 / 4]uint32 + _ = x + } +} +func BenchmarkClearFat24(b *testing.B) { + for i := 0; i < b.N; i++ { + var x [24 / 4]uint32 + _ = x + } +} +func BenchmarkClearFat32(b *testing.B) { + for i := 0; i < b.N; i++ { + var x [32 / 4]uint32 + _ = x + } +} +func BenchmarkClearFat64(b *testing.B) { + for i := 0; i < b.N; i++ { + var x [64 / 4]uint32 + _ = x + } +} +func BenchmarkClearFat128(b *testing.B) { + for i := 0; i < b.N; i++ { + var x [128 / 4]uint32 + _ = x + } +} +func BenchmarkClearFat256(b *testing.B) { + for i := 0; i < b.N; i++ { + var x [256 / 4]uint32 + _ = x + } +} +func BenchmarkClearFat512(b *testing.B) { + for i := 0; i < b.N; i++ { + var x [512 / 4]uint32 + _ = x + } +} +func BenchmarkClearFat1024(b *testing.B) { + for i := 0; i < b.N; i++ { + var x [1024 / 4]uint32 + _ = x + 
} +} + +func BenchmarkCopyFat8(b *testing.B) { + var x [8 / 4]uint32 + for i := 0; i < b.N; i++ { + y := x + _ = y + } +} +func BenchmarkCopyFat12(b *testing.B) { + var x [12 / 4]uint32 + for i := 0; i < b.N; i++ { + y := x + _ = y + } +} +func BenchmarkCopyFat16(b *testing.B) { + var x [16 / 4]uint32 + for i := 0; i < b.N; i++ { + y := x + _ = y + } +} +func BenchmarkCopyFat24(b *testing.B) { + var x [24 / 4]uint32 + for i := 0; i < b.N; i++ { + y := x + _ = y + } +} +func BenchmarkCopyFat32(b *testing.B) { + var x [32 / 4]uint32 + for i := 0; i < b.N; i++ { + y := x + _ = y + } +} +func BenchmarkCopyFat64(b *testing.B) { + var x [64 / 4]uint32 + for i := 0; i < b.N; i++ { + y := x + _ = y + } +} +func BenchmarkCopyFat128(b *testing.B) { + var x [128 / 4]uint32 + for i := 0; i < b.N; i++ { + y := x + _ = y + } +} +func BenchmarkCopyFat256(b *testing.B) { + var x [256 / 4]uint32 + for i := 0; i < b.N; i++ { + y := x + _ = y + } +} +func BenchmarkCopyFat512(b *testing.B) { + var x [512 / 4]uint32 + for i := 0; i < b.N; i++ { + y := x + _ = y + } +} +func BenchmarkCopyFat1024(b *testing.B) { + var x [1024 / 4]uint32 + for i := 0; i < b.N; i++ { + y := x + _ = y + } +} diff --git a/src/runtime/mfinal_test.go b/src/runtime/mfinal_test.go new file mode 100644 index 000000000..d2cead287 --- /dev/null +++ b/src/runtime/mfinal_test.go @@ -0,0 +1,246 @@ +// Copyright 2011 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package runtime_test + +import ( + "runtime" + "testing" + "time" + "unsafe" +) + +type Tintptr *int // assignable to *int +type Tint int // *Tint implements Tinter, interface{} + +func (t *Tint) m() {} + +type Tinter interface { + m() +} + +func TestFinalizerType(t *testing.T) { + if runtime.GOARCH != "amd64" { + t.Skipf("Skipping on non-amd64 machine") + } + + ch := make(chan bool, 10) + finalize := func(x *int) { + if *x != 97531 { + t.Errorf("finalizer %d, want %d", *x, 97531) + } + ch <- true + } + + var finalizerTests = []struct { + convert func(*int) interface{} + finalizer interface{} + }{ + {func(x *int) interface{} { return x }, func(v *int) { finalize(v) }}, + {func(x *int) interface{} { return Tintptr(x) }, func(v Tintptr) { finalize(v) }}, + {func(x *int) interface{} { return Tintptr(x) }, func(v *int) { finalize(v) }}, + {func(x *int) interface{} { return (*Tint)(x) }, func(v *Tint) { finalize((*int)(v)) }}, + {func(x *int) interface{} { return (*Tint)(x) }, func(v Tinter) { finalize((*int)(v.(*Tint))) }}, + } + + for i, tt := range finalizerTests { + done := make(chan bool, 1) + go func() { + // allocate struct with pointer to avoid hitting tinyalloc. + // Otherwise we can't be sure when the allocation will + // be freed. 
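The tiny allocator packs small, pointer-free objects into shared 16-byte blocks, and such a block is only reclaimed once every object in it is unreachable, so a finalizer set on a tiny-allocated value may not run when this test expects it to. Giving the type a pointer field keeps it out of the tiny allocator. A hedged sketch of the two allocation shapes (hypothetical helper; assumes the unsafe import already present in this file):

    // tinyVsOwnAlloc contrasts the two shapes described above.
    func tinyVsOwnAlloc() (*int, *int) {
        a := new(int) // small, no pointers: may share a 16-byte tiny block
        type withPtr struct {
            v int
            p unsafe.Pointer
        }
        b := &new(withPtr).v // has a pointer field: always gets its own block
        return a, b
    }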
+ type T struct { + v int + p unsafe.Pointer + } + v := &new(T).v + *v = 97531 + runtime.SetFinalizer(tt.convert(v), tt.finalizer) + v = nil + done <- true + }() + <-done + runtime.GC() + select { + case <-ch: + case <-time.After(time.Second * 4): + t.Errorf("#%d: finalizer for type %T didn't run", i, tt.finalizer) + } + } +} + +type bigValue struct { + fill uint64 + it bool + up string +} + +func TestFinalizerInterfaceBig(t *testing.T) { + if runtime.GOARCH != "amd64" { + t.Skipf("Skipping on non-amd64 machine") + } + ch := make(chan bool) + done := make(chan bool, 1) + go func() { + v := &bigValue{0xDEADBEEFDEADBEEF, true, "It matters not how strait the gate"} + old := *v + runtime.SetFinalizer(v, func(v interface{}) { + i, ok := v.(*bigValue) + if !ok { + t.Errorf("finalizer called with type %T, want *bigValue", v) + } + if *i != old { + t.Errorf("finalizer called with %+v, want %+v", *i, old) + } + close(ch) + }) + v = nil + done <- true + }() + <-done + runtime.GC() + select { + case <-ch: + case <-time.After(4 * time.Second): + t.Errorf("finalizer for type *bigValue didn't run") + } +} + +func fin(v *int) { +} + +// Verify we don't crash at least. golang.org/issue/6857 +func TestFinalizerZeroSizedStruct(t *testing.T) { + type Z struct{} + z := new(Z) + runtime.SetFinalizer(z, func(*Z) {}) +} + +func BenchmarkFinalizer(b *testing.B) { + const Batch = 1000 + b.RunParallel(func(pb *testing.PB) { + var data [Batch]*int + for i := 0; i < Batch; i++ { + data[i] = new(int) + } + for pb.Next() { + for i := 0; i < Batch; i++ { + runtime.SetFinalizer(data[i], fin) + } + for i := 0; i < Batch; i++ { + runtime.SetFinalizer(data[i], nil) + } + } + }) +} + +func BenchmarkFinalizerRun(b *testing.B) { + b.RunParallel(func(pb *testing.PB) { + for pb.Next() { + v := new(int) + runtime.SetFinalizer(v, fin) + } + }) +} + +// One chunk must be exactly one sizeclass in size. +// It should be a sizeclass not used much by others, so we +// have a greater chance of finding adjacent ones. +// size class 19: 320 byte objects, 25 per page, 1 page alloc at a time +const objsize = 320 + +type objtype [objsize]byte + +func adjChunks() (*objtype, *objtype) { + var s []*objtype + + for { + c := new(objtype) + for _, d := range s { + if uintptr(unsafe.Pointer(c))+unsafe.Sizeof(*c) == uintptr(unsafe.Pointer(d)) { + return c, d + } + if uintptr(unsafe.Pointer(d))+unsafe.Sizeof(*c) == uintptr(unsafe.Pointer(c)) { + return d, c + } + } + s = append(s, c) + } +} + +// Make sure an empty slice on the stack doesn't pin the next object in memory. +func TestEmptySlice(t *testing.T) { + if true { // disable until bug 7564 is fixed. + return + } + x, y := adjChunks() + + // the pointer inside xs points to y. + xs := x[objsize:] // change objsize to objsize-1 and the test passes + + fin := make(chan bool, 1) + runtime.SetFinalizer(y, func(z *objtype) { fin <- true }) + runtime.GC() + select { + case <-fin: + case <-time.After(4 * time.Second): + t.Errorf("finalizer of next object in memory didn't run") + } + xsglobal = xs // keep empty slice alive until here +} + +var xsglobal []byte + +func adjStringChunk() (string, *objtype) { + b := make([]byte, objsize) + for { + s := string(b) + t := new(objtype) + p := *(*uintptr)(unsafe.Pointer(&s)) + q := uintptr(unsafe.Pointer(t)) + if p+objsize == q { + return s, t + } + } +} + +// Make sure an empty string on the stack doesn't pin the next object in memory. 
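Both this test and TestEmptySlice above rely on an address coincidence: a zero-length tail such as x[objsize:] carries a data pointer one past the end of x, which for adjacent allocations is exactly the address of the next object y, so a collector that treated that pointer as a reference to y would keep y alive and its finalizer would never fire. A hypothetical check of the coincidence (assumes reflect and unsafe are imported, and objtype from above):

    // tailAliasesNext reports whether the empty tail of x starts exactly
    // where the adjacent object y begins.
    func tailAliasesNext(x, y *objtype) bool {
        tail := x[len(x):] // length 0; data pointer is one past the end of *x
        hdr := (*reflect.SliceHeader)(unsafe.Pointer(&tail))
        return hdr.Data == uintptr(unsafe.Pointer(y))
    }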
+func TestEmptyString(t *testing.T) { + x, y := adjStringChunk() + + ss := x[objsize:] // change objsize to objsize-1 and the test passes + fin := make(chan bool, 1) + // set finalizer on string contents of y + runtime.SetFinalizer(y, func(z *objtype) { fin <- true }) + runtime.GC() + select { + case <-fin: + case <-time.After(4 * time.Second): + t.Errorf("finalizer of next string in memory didn't run") + } + ssglobal = ss // keep 0-length string live until here +} + +var ssglobal string + +// Test for issue 7656. +func TestFinalizerOnGlobal(t *testing.T) { + runtime.SetFinalizer(Foo1, func(p *Object1) {}) + runtime.SetFinalizer(Foo2, func(p *Object2) {}) + runtime.SetFinalizer(Foo1, nil) + runtime.SetFinalizer(Foo2, nil) +} + +type Object1 struct { + Something []byte +} + +type Object2 struct { + Something byte +} + +var ( + Foo2 = &Object2{} + Foo1 = &Object1{} +) diff --git a/src/runtime/mfixalloc.c b/src/runtime/mfixalloc.c new file mode 100644 index 000000000..d670629da --- /dev/null +++ b/src/runtime/mfixalloc.c @@ -0,0 +1,64 @@ +// Copyright 2009 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// Fixed-size object allocator. Returned memory is not zeroed. +// +// See malloc.h for overview. + +#include "runtime.h" +#include "arch_GOARCH.h" +#include "malloc.h" + +// Initialize f to allocate objects of the given size, +// using the allocator to obtain chunks of memory. +void +runtime·FixAlloc_Init(FixAlloc *f, uintptr size, void (*first)(void*, byte*), void *arg, uint64 *stat) +{ + f->size = size; + f->first = first; + f->arg = arg; + f->list = nil; + f->chunk = nil; + f->nchunk = 0; + f->inuse = 0; + f->stat = stat; +} + +void* +runtime·FixAlloc_Alloc(FixAlloc *f) +{ + void *v; + + if(f->size == 0) { + runtime·printf("runtime: use of FixAlloc_Alloc before FixAlloc_Init\n"); + runtime·throw("runtime: internal error"); + } + + if(f->list) { + v = f->list; + f->list = *(void**)f->list; + f->inuse += f->size; + return v; + } + if(f->nchunk < f->size) { + f->chunk = runtime·persistentalloc(FixAllocChunk, 0, f->stat); + f->nchunk = FixAllocChunk; + } + v = f->chunk; + if(f->first) + f->first(f->arg, v); + f->chunk += f->size; + f->nchunk -= f->size; + f->inuse += f->size; + return v; +} + +void +runtime·FixAlloc_Free(FixAlloc *f, void *p) +{ + f->inuse -= f->size; + *(void**)p = f->list; + f->list = p; +} + diff --git a/src/runtime/mgc0.c b/src/runtime/mgc0.c new file mode 100644 index 000000000..7754bad89 --- /dev/null +++ b/src/runtime/mgc0.c @@ -0,0 +1,2010 @@ +// Copyright 2009 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// Garbage collector (GC). +// +// GC is: +// - mark&sweep +// - mostly precise (with the exception of some C-allocated objects, assembly frames/arguments, etc) +// - parallel (up to MaxGcproc threads) +// - partially concurrent (mark is stop-the-world, while sweep is concurrent) +// - non-moving/non-compacting +// - full (non-partial) +// +// GC rate. +// Next GC is after we've allocated an extra amount of memory proportional to +// the amount already in use. The proportion is controlled by GOGC environment variable +// (100 by default). If GOGC=100 and we're using 4M, we'll GC again when we get to 8M +// (this mark is tracked in next_gc variable). This keeps the GC cost in linear +// proportion to the allocation cost. 
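In code form, the trigger is roughly the following (a sketch of the rule as stated; the runtime's own next_gc arithmetic lives in C and handles the corner cases):

    // nextGCSketch: with GOGC=100 and 4 MB live, the next collection
    // triggers at 8 MB.
    func nextGCSketch(liveHeap, gogc uint64) uint64 {
        return liveHeap + liveHeap*gogc/100
    }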
Adjusting GOGC just changes the linear constant +// (and also the amount of extra memory used). +// +// Concurrent sweep. +// The sweep phase proceeds concurrently with normal program execution. +// The heap is swept span-by-span both lazily (when a goroutine needs another span) +// and concurrently in a background goroutine (this helps programs that are not CPU bound). +// However, at the end of the stop-the-world GC phase we don't know the size of the live heap, +// and so next_gc calculation is tricky and happens as follows. +// At the end of the stop-the-world phase next_gc is conservatively set based on total +// heap size; all spans are marked as "needs sweeping". +// Whenever a span is swept, next_gc is decremented by GOGC*newly_freed_memory. +// The background sweeper goroutine simply sweeps spans one-by-one bringing next_gc +// closer to the target value. However, this is not enough to avoid over-allocating memory. +// Consider that a goroutine wants to allocate a new span for a large object and +// there are no free swept spans, but there are small-object unswept spans. +// If the goroutine naively allocates a new span, it can surpass the yet-unknown +// target next_gc value. In order to prevent such cases (1) when a goroutine needs +// to allocate a new small-object span, it sweeps small-object spans for the same +// object size until it frees at least one object; (2) when a goroutine needs to +// allocate large-object span from heap, it sweeps spans until it frees at least +// that many pages into heap. Together these two measures ensure that we don't surpass +// target next_gc value by a large margin. There is an exception: if a goroutine sweeps +// and frees two nonadjacent one-page spans to the heap, it will allocate a new two-page span, +// but there can still be other one-page unswept spans which could be combined into a two-page span. +// It's critical to ensure that no operations proceed on unswept spans (that would corrupt +// mark bits in GC bitmap). During GC all mcaches are flushed into the central cache, +// so they are empty. When a goroutine grabs a new span into mcache, it sweeps it. +// When a goroutine explicitly frees an object or sets a finalizer, it ensures that +// the span is swept (either by sweeping it, or by waiting for the concurrent sweep to finish). +// The finalizer goroutine is kicked off only when all spans are swept. +// When the next GC starts, it sweeps all not-yet-swept spans (if any). + +#include "runtime.h" +#include "arch_GOARCH.h" +#include "malloc.h" +#include "stack.h" +#include "mgc0.h" +#include "chan.h" +#include "race.h" +#include "type.h" +#include "typekind.h" +#include "funcdata.h" +#include "textflag.h" + +enum { + Debug = 0, + DebugPtrs = 0, // if 1, print trace of every pointer load during GC + ConcurrentSweep = 1, + + WorkbufSize = 4*1024, + FinBlockSize = 4*1024, + RootData = 0, + RootBss = 1, + RootFinalizers = 2, + RootSpans = 3, + RootFlushCaches = 4, + RootCount = 5, +}; + +// ptrmask for an allocation containing a single pointer. +static byte oneptr[] = {BitsPointer}; + +// Initialized from $GOGC. GOGC=off means no gc. +extern int32 runtime·gcpercent; + +// Holding worldsema grants an M the right to try to stop the world. +// The procedure is: +// +// runtime·semacquire(&runtime·worldsema); +// m->gcing = 1; +// runtime·stoptheworld(); +// +// ... do stuff ... 
+// +// m->gcing = 0; +// runtime·semrelease(&runtime·worldsema); +// runtime·starttheworld(); +// +uint32 runtime·worldsema = 1; + +typedef struct Workbuf Workbuf; +struct Workbuf +{ + LFNode node; // must be first + uintptr nobj; + byte* obj[(WorkbufSize-sizeof(LFNode)-sizeof(uintptr))/PtrSize]; +}; + +extern byte runtime·data[]; +extern byte runtime·edata[]; +extern byte runtime·bss[]; +extern byte runtime·ebss[]; + +extern byte runtime·gcdata[]; +extern byte runtime·gcbss[]; + +Mutex runtime·finlock; // protects the following variables +G* runtime·fing; // goroutine that runs finalizers +FinBlock* runtime·finq; // list of finalizers that are to be executed +FinBlock* runtime·finc; // cache of free blocks +static byte finptrmask[FinBlockSize/PtrSize/PointersPerByte]; +bool runtime·fingwait; +bool runtime·fingwake; +FinBlock *runtime·allfin; // list of all blocks + +BitVector runtime·gcdatamask; +BitVector runtime·gcbssmask; + +Mutex runtime·gclock; + +static uintptr badblock[1024]; +static int32 nbadblock; + +static Workbuf* getempty(Workbuf*); +static Workbuf* getfull(Workbuf*); +static void putempty(Workbuf*); +static Workbuf* handoff(Workbuf*); +static void gchelperstart(void); +static void flushallmcaches(void); +static bool scanframe(Stkframe *frame, void *unused); +static void scanstack(G *gp); +static BitVector unrollglobgcprog(byte *prog, uintptr size); + +void runtime·bgsweep(void); +static FuncVal bgsweepv = {runtime·bgsweep}; + +typedef struct WorkData WorkData; +struct WorkData { + uint64 full; // lock-free list of full blocks + uint64 empty; // lock-free list of empty blocks + byte pad0[CacheLineSize]; // prevents false-sharing between full/empty and nproc/nwait + uint32 nproc; + int64 tstart; + volatile uint32 nwait; + volatile uint32 ndone; + Note alldone; + ParFor* markfor; + + // Copy of mheap.allspans for marker or sweeper. + MSpan** spans; + uint32 nspan; +}; +WorkData runtime·work; + +// Is _cgo_allocate linked into the binary? +static bool +have_cgo_allocate(void) +{ + extern byte go·weak·runtime·_cgo_allocate_internal[1]; + return go·weak·runtime·_cgo_allocate_internal != nil; +} + +// scanblock scans a block of n bytes starting at pointer b for references +// to other objects, scanning any it finds recursively until there are no +// unscanned objects left. Instead of using an explicit recursion, it keeps +// a work list in the Workbuf* structures and loops in the main function +// body. Keeping an explicit work list is easier on the stack allocator and +// more efficient. +static void +scanblock(byte *b, uintptr n, byte *ptrmask) +{ + byte *obj, *obj0, *p, *arena_start, *arena_used, **wp, *scanbuf[8], *ptrbitp, *bitp; + uintptr i, j, nobj, size, idx, x, off, scanbufpos, bits, xbits, shift; + Workbuf *wbuf; + Iface *iface; + Eface *eface; + Type *typ; + MSpan *s; + pageID k; + bool keepworking; + + // Cache memory arena parameters in local vars. + arena_start = runtime·mheap.arena_start; + arena_used = runtime·mheap.arena_used; + + wbuf = getempty(nil); + nobj = wbuf->nobj; + wp = &wbuf->obj[nobj]; + keepworking = b == nil; + scanbufpos = 0; + for(i = 0; i < nelem(scanbuf); i++) + scanbuf[i] = nil; + + ptrbitp = nil; + + // ptrmask can have 2 possible values: + // 1. nil - obtain pointer mask from GC bitmap. + // 2. pointer to a compact mask (for stacks and data). + if(b != nil) + goto scanobj; + for(;;) { + if(nobj == 0) { + // Out of work in workbuf. + // First, see is there is any work in scanbuf. 
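Stepping back from the buffer juggling, the discipline is: drain a small local buffer, refill it from the shared queue when it runs dry, and push newly discovered pointers instead of recursing. A toy, self-contained Go analogue of that work-list idea (the real code manages Workbuf structures and lock-free lists in C):

    type node struct {
        marked bool
        refs   []*node
    }

    // markAll marks every node reachable from roots using an explicit
    // work list, which keeps stack usage flat regardless of graph depth.
    func markAll(roots []*node) {
        work := append([]*node(nil), roots...)
        for len(work) > 0 {
            n := work[len(work)-1]
            work = work[:len(work)-1]
            if n == nil || n.marked {
                continue
            }
            n.marked = true
            work = append(work, n.refs...) // enqueue children instead of recursing
        }
    }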
+ for(i = 0; i < nelem(scanbuf); i++) { + b = scanbuf[scanbufpos]; + scanbuf[scanbufpos++] = nil; + scanbufpos %= nelem(scanbuf); + if(b != nil) { + n = arena_used - b; // scan until bitBoundary or BitsDead + ptrmask = nil; // use GC bitmap for pointer info + goto scanobj; + } + } + if(!keepworking) { + putempty(wbuf); + return; + } + // Refill workbuf from global queue. + wbuf = getfull(wbuf); + if(wbuf == nil) + return; + nobj = wbuf->nobj; + wp = &wbuf->obj[nobj]; + } + + // If another proc wants a pointer, give it some. + if(runtime·work.nwait > 0 && nobj > 4 && runtime·work.full == 0) { + wbuf->nobj = nobj; + wbuf = handoff(wbuf); + nobj = wbuf->nobj; + wp = &wbuf->obj[nobj]; + } + + wp--; + nobj--; + b = *wp; + n = arena_used - b; // scan until next bitBoundary or BitsDead + ptrmask = nil; // use GC bitmap for pointer info + + scanobj: + if(DebugPtrs) + runtime·printf("scanblock %p +%p %p\n", b, n, ptrmask); + // Find bits of the beginning of the object. + if(ptrmask == nil) { + off = (uintptr*)b - (uintptr*)arena_start; + ptrbitp = arena_start - off/wordsPerBitmapByte - 1; + } + for(i = 0; i < n; i += PtrSize) { + obj = nil; + // Find bits for this word. + if(ptrmask == nil) { + // Check is we have reached end of span. + if((((uintptr)b+i)%PageSize) == 0 && + runtime·mheap.spans[(b-arena_start)>>PageShift] != runtime·mheap.spans[(b+i-arena_start)>>PageShift]) + break; + // Consult GC bitmap. + bits = *ptrbitp; + + if(wordsPerBitmapByte != 2) + runtime·throw("alg doesn't work for wordsPerBitmapByte != 2"); + j = ((uintptr)b+i)/PtrSize & 1; + ptrbitp -= j; + bits >>= gcBits*j; + + if((bits&bitBoundary) != 0 && i != 0) + break; // reached beginning of the next object + bits = (bits>>2)&BitsMask; + if(bits == BitsDead) + break; // reached no-scan part of the object + } else // dense mask (stack or data) + bits = (ptrmask[(i/PtrSize)/4]>>(((i/PtrSize)%4)*BitsPerPointer))&BitsMask; + + if(bits <= BitsScalar) // BitsScalar || BitsDead + continue; + if(bits == BitsPointer) { + obj = *(byte**)(b+i); + obj0 = obj; + goto markobj; + } + + // With those three out of the way, must be multi-word. + if(Debug && bits != BitsMultiWord) + runtime·throw("unexpected garbage collection bits"); + // Find the next pair of bits. + if(ptrmask == nil) { + bits = *ptrbitp; + j = ((uintptr)b+i+PtrSize)/PtrSize & 1; + ptrbitp -= j; + bits >>= gcBits*j; + bits = (bits>>2)&BitsMask; + } else + bits = (ptrmask[((i+PtrSize)/PtrSize)/4]>>((((i+PtrSize)/PtrSize)%4)*BitsPerPointer))&BitsMask; + + if(Debug && bits != BitsIface && bits != BitsEface) + runtime·throw("unexpected garbage collection bits"); + + if(bits == BitsIface) { + iface = (Iface*)(b+i); + if(iface->tab != nil) { + typ = iface->tab->type; + if(!(typ->kind&KindDirectIface) || !(typ->kind&KindNoPointers)) + obj = iface->data; + } + } else { + eface = (Eface*)(b+i); + typ = eface->type; + if(typ != nil) { + if(!(typ->kind&KindDirectIface) || !(typ->kind&KindNoPointers)) + obj = eface->data; + } + } + + i += PtrSize; + + obj0 = obj; + markobj: + // At this point we have extracted the next potential pointer. + // Check if it points into heap. + if(obj == nil) + continue; + if(obj < arena_start || obj >= arena_used) { + if((uintptr)obj < PhysPageSize && runtime·invalidptr) { + s = nil; + goto badobj; + } + continue; + } + // Mark the object. 
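For stacks and data, the dense ptrmask consulted above packs one BitsPerPointer-wide (2-bit) descriptor per word, four words per byte. A small illustrative sketch of that indexing; the constant values mirror mgc0.h, everything else is made up:

package main

import "fmt"

const (
	bitsPerPointer  = 2
	bitsMask        = 1<<bitsPerPointer - 1
	pointersPerByte = 8 / bitsPerPointer
	bitsDead        = 0
	bitsScalar      = 1
	bitsPointer     = 2
)

// maskBits returns the 2-bit descriptor for word number w of a dense mask.
func maskBits(mask []byte, w int) byte {
	return mask[w/pointersPerByte] >> (uint(w%pointersPerByte) * bitsPerPointer) & bitsMask
}

func main() {
	// One byte describing four words: scalar, pointer, scalar, dead.
	mask := []byte{bitsScalar | bitsPointer<<2 | bitsScalar<<4 | bitsDead<<6}
	for w := 0; w < 4; w++ {
		fmt.Println(w, maskBits(mask, w))
	}
	// prints 0 1, 1 2, 2 1, 3 0 (scalar, pointer, scalar, dead)
}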
+ obj = (byte*)((uintptr)obj & ~(PtrSize-1)); + off = (uintptr*)obj - (uintptr*)arena_start; + bitp = arena_start - off/wordsPerBitmapByte - 1; + shift = (off % wordsPerBitmapByte) * gcBits; + xbits = *bitp; + bits = (xbits >> shift) & bitMask; + if((bits&bitBoundary) == 0) { + // Not a beginning of a block, consult span table to find the block beginning. + k = (uintptr)obj>>PageShift; + x = k; + x -= (uintptr)arena_start>>PageShift; + s = runtime·mheap.spans[x]; + if(s == nil || k < s->start || obj >= s->limit || s->state != MSpanInUse) { + // Stack pointers lie within the arena bounds but are not part of the GC heap. + // Ignore them. + if(s != nil && s->state == MSpanStack) + continue; + + badobj: + // If cgo_allocate is linked into the binary, it can allocate + // memory as []unsafe.Pointer that may not contain actual + // pointers and must be scanned conservatively. + // In this case alone, allow the bad pointer. + if(have_cgo_allocate() && ptrmask == nil) + continue; + + // Anything else indicates a bug somewhere. + // If we're in the middle of chasing down a different bad pointer, + // don't confuse the trace by printing about this one. + if(nbadblock > 0) + continue; + + runtime·printf("runtime: garbage collector found invalid heap pointer *(%p+%p)=%p", b, i, obj); + if(s == nil) + runtime·printf(" s=nil\n"); + else + runtime·printf(" span=%p-%p-%p state=%d\n", (uintptr)s->start<<PageShift, s->limit, (uintptr)(s->start+s->npages)<<PageShift, s->state); + if(ptrmask != nil) + runtime·throw("invalid heap pointer"); + // Add to badblock list, which will cause the garbage collection + // to keep repeating until it has traced the chain of pointers + // leading to obj all the way back to a root. + if(nbadblock == 0) + badblock[nbadblock++] = (uintptr)b; + continue; + } + p = (byte*)((uintptr)s->start<<PageShift); + if(s->sizeclass != 0) { + size = s->elemsize; + idx = ((byte*)obj - p)/size; + p = p+idx*size; + } + if(p == obj) { + runtime·printf("runtime: failed to find block beginning for %p s=%p s->limit=%p\n", + p, s->start*PageSize, s->limit); + runtime·throw("failed to find block beginning"); + } + obj = p; + goto markobj; + } + if(DebugPtrs) + runtime·printf("scan *%p = %p => base %p\n", b+i, obj0, obj); + + if(nbadblock > 0 && (uintptr)obj == badblock[nbadblock-1]) { + // Running garbage collection again because + // we want to find the path from a root to a bad pointer. + // Found possible next step; extend or finish path. + for(j=0; j<nbadblock; j++) + if(badblock[j] == (uintptr)b) + goto AlreadyBad; + runtime·printf("runtime: found *(%p+%p) = %p+%p\n", b, i, obj0, (uintptr)(obj-obj0)); + if(ptrmask != nil) + runtime·throw("bad pointer"); + if(nbadblock >= nelem(badblock)) + runtime·throw("badblock trace too long"); + badblock[nbadblock++] = (uintptr)b; + AlreadyBad:; + } + + // Now we have bits, bitp, and shift correct for + // obj pointing at the base of the object. + // Only care about not marked objects. + if((bits&bitMarked) != 0) + continue; + // If obj size is greater than 8, then each byte of GC bitmap + // contains info for at most one object. In such case we use + // non-atomic byte store to mark the object. This can lead + // to double enqueue of the object for scanning, but scanning + // is an idempotent operation, so it is OK. This cannot lead + // to bitmap corruption because the single marked bit is the + // only thing that can change in the byte. + // For 8-byte objects we use non-atomic store, if the other + // quadruple is already marked. 
Otherwise we resort to CAS + // loop for marking. + if((xbits&(bitMask|(bitMask<<gcBits))) != (bitBoundary|(bitBoundary<<gcBits)) || + runtime·work.nproc == 1) + *bitp = xbits | (bitMarked<<shift); + else + runtime·atomicor8(bitp, bitMarked<<shift); + + if(((xbits>>(shift+2))&BitsMask) == BitsDead) + continue; // noscan object + + // Queue the obj for scanning. + PREFETCH(obj); + p = scanbuf[scanbufpos]; + scanbuf[scanbufpos++] = obj; + scanbufpos %= nelem(scanbuf); + if(p == nil) + continue; + + // If workbuf is full, obtain an empty one. + if(nobj >= nelem(wbuf->obj)) { + wbuf->nobj = nobj; + wbuf = getempty(wbuf); + nobj = wbuf->nobj; + wp = &wbuf->obj[nobj]; + } + *wp = p; + wp++; + nobj++; + } + if(DebugPtrs) + runtime·printf("end scanblock %p +%p %p\n", b, n, ptrmask); + + if(Debug && ptrmask == nil) { + // For heap objects ensure that we did not overscan. + n = 0; + p = nil; + if(!runtime·mlookup(b, &p, &n, nil) || b != p || i > n) { + runtime·printf("runtime: scanned (%p,%p), heap object (%p,%p)\n", b, i, p, n); + runtime·throw("scanblock: scanned invalid object"); + } + } + } +} + +static void +markroot(ParFor *desc, uint32 i) +{ + FinBlock *fb; + MSpan *s; + uint32 spanidx, sg; + G *gp; + void *p; + uint32 status; + bool restart; + + USED(&desc); + // Note: if you add a case here, please also update heapdump.c:dumproots. + switch(i) { + case RootData: + scanblock(runtime·data, runtime·edata - runtime·data, runtime·gcdatamask.bytedata); + break; + + case RootBss: + scanblock(runtime·bss, runtime·ebss - runtime·bss, runtime·gcbssmask.bytedata); + break; + + case RootFinalizers: + for(fb=runtime·allfin; fb; fb=fb->alllink) + scanblock((byte*)fb->fin, fb->cnt*sizeof(fb->fin[0]), finptrmask); + break; + + case RootSpans: + // mark MSpan.specials + sg = runtime·mheap.sweepgen; + for(spanidx=0; spanidx<runtime·work.nspan; spanidx++) { + Special *sp; + SpecialFinalizer *spf; + + s = runtime·work.spans[spanidx]; + if(s->state != MSpanInUse) + continue; + if(s->sweepgen != sg) { + runtime·printf("sweep %d %d\n", s->sweepgen, sg); + runtime·throw("gc: unswept span"); + } + for(sp = s->specials; sp != nil; sp = sp->next) { + if(sp->kind != KindSpecialFinalizer) + continue; + // don't mark finalized object, but scan it so we + // retain everything it points to. + spf = (SpecialFinalizer*)sp; + // A finalizer can be set for an inner byte of an object, find object beginning. + p = (void*)((s->start << PageShift) + spf->special.offset/s->elemsize*s->elemsize); + scanblock(p, s->elemsize, nil); + scanblock((void*)&spf->fn, PtrSize, oneptr); + } + } + break; + + case RootFlushCaches: + flushallmcaches(); + break; + + default: + // the rest is scanning goroutine stacks + if(i - RootCount >= runtime·allglen) + runtime·throw("markroot: bad index"); + gp = runtime·allg[i - RootCount]; + // remember when we've first observed the G blocked + // needed only to output in traceback + status = runtime·readgstatus(gp); + if((status == Gwaiting || status == Gsyscall) && gp->waitsince == 0) + gp->waitsince = runtime·work.tstart; + // Shrink a stack if not much of it is being used. + runtime·shrinkstack(gp); + if(runtime·readgstatus(gp) == Gdead) + gp->gcworkdone = true; + else + gp->gcworkdone = false; + restart = runtime·stopg(gp); + scanstack(gp); + if(restart) + runtime·restartg(gp); + break; + } +} + +// Get an empty work buffer off the work.empty list, +// allocating new buffers as needed. 
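getempty and getfull below shuttle fixed-size Workbufs between a per-M cached buffer and the global work.empty/work.full lists. A single-goroutine model of that flow, with plain slices standing in for the lock-free stacks and the mcache slot omitted:

package main

import "fmt"

const workbufCap = 4 // the real buffers hold a few hundred pointers; tiny here

type workbuf struct {
	obj []uintptr
}

// Stand-ins for work.empty and work.full (the runtime uses lock-free
// stacks; plain slices are enough for a single-goroutine sketch).
var empty, full []*workbuf

// getempty retires a just-filled buffer onto the full list and hands back an empty one.
func getempty(b *workbuf) *workbuf {
	if b != nil {
		full = append(full, b)
	}
	if n := len(empty); n > 0 {
		b = empty[n-1]
		empty = empty[:n-1]
		return b
	}
	return &workbuf{obj: make([]uintptr, 0, workbufCap)}
}

// getfull recycles a drained buffer onto the empty list and pops a full one, if any.
func getfull(b *workbuf) *workbuf {
	if b != nil {
		b.obj = b.obj[:0]
		empty = append(empty, b)
	}
	if n := len(full); n > 0 {
		b = full[n-1]
		full = full[:n-1]
		return b
	}
	return nil // out of work
}

func main() {
	b := getempty(nil)
	b.obj = append(b.obj, 0x1000, 0x1008)
	b = getempty(b) // the filled buffer moves to the full list
	fmt.Println(len(full), len(empty), len(b.obj)) // 1 0 0
	fmt.Println(getfull(b).obj)                    // [4096 4104]
}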
+static Workbuf* +getempty(Workbuf *b) +{ + MCache *c; + + if(b != nil) + runtime·lfstackpush(&runtime·work.full, &b->node); + b = nil; + c = g->m->mcache; + if(c->gcworkbuf != nil) { + b = c->gcworkbuf; + c->gcworkbuf = nil; + } + if(b == nil) + b = (Workbuf*)runtime·lfstackpop(&runtime·work.empty); + if(b == nil) + b = runtime·persistentalloc(sizeof(*b), CacheLineSize, &mstats.gc_sys); + b->nobj = 0; + return b; +} + +static void +putempty(Workbuf *b) +{ + MCache *c; + + c = g->m->mcache; + if(c->gcworkbuf == nil) { + c->gcworkbuf = b; + return; + } + runtime·lfstackpush(&runtime·work.empty, &b->node); +} + +void +runtime·gcworkbuffree(void *b) +{ + if(b != nil) + putempty(b); +} + +// Get a full work buffer off the work.full list, or return nil. +static Workbuf* +getfull(Workbuf *b) +{ + int32 i; + + if(b != nil) + runtime·lfstackpush(&runtime·work.empty, &b->node); + b = (Workbuf*)runtime·lfstackpop(&runtime·work.full); + if(b != nil || runtime·work.nproc == 1) + return b; + + runtime·xadd(&runtime·work.nwait, +1); + for(i=0;; i++) { + if(runtime·work.full != 0) { + runtime·xadd(&runtime·work.nwait, -1); + b = (Workbuf*)runtime·lfstackpop(&runtime·work.full); + if(b != nil) + return b; + runtime·xadd(&runtime·work.nwait, +1); + } + if(runtime·work.nwait == runtime·work.nproc) + return nil; + if(i < 10) { + g->m->gcstats.nprocyield++; + runtime·procyield(20); + } else if(i < 20) { + g->m->gcstats.nosyield++; + runtime·osyield(); + } else { + g->m->gcstats.nsleep++; + runtime·usleep(100); + } + } +} + +static Workbuf* +handoff(Workbuf *b) +{ + int32 n; + Workbuf *b1; + + // Make new buffer with half of b's pointers. + b1 = getempty(nil); + n = b->nobj/2; + b->nobj -= n; + b1->nobj = n; + runtime·memmove(b1->obj, b->obj+b->nobj, n*sizeof b1->obj[0]); + g->m->gcstats.nhandoff++; + g->m->gcstats.nhandoffcnt += n; + + // Put b on full list - let first half of b get stolen. + runtime·lfstackpush(&runtime·work.full, &b->node); + return b1; +} + +BitVector +runtime·stackmapdata(StackMap *stackmap, int32 n) +{ + if(n < 0 || n >= stackmap->n) + runtime·throw("stackmapdata: index out of range"); + return (BitVector){stackmap->nbit, stackmap->bytedata + n*((stackmap->nbit+31)/32*4)}; +} + +// Scan a stack frame: local variables and function arguments/results. +static bool +scanframe(Stkframe *frame, void *unused) +{ + Func *f; + StackMap *stackmap; + BitVector bv; + uintptr size, minsize; + uintptr targetpc; + int32 pcdata; + + USED(unused); + f = frame->fn; + targetpc = frame->continpc; + if(targetpc == 0) { + // Frame is dead. + return true; + } + if(Debug > 1) + runtime·printf("scanframe %s\n", runtime·funcname(f)); + if(targetpc != f->entry) + targetpc--; + pcdata = runtime·pcdatavalue(f, PCDATA_StackMapIndex, targetpc); + if(pcdata == -1) { + // We do not have a valid pcdata value but there might be a + // stackmap for this function. It is likely that we are looking + // at the function prologue, assume so and hope for the best. + pcdata = 0; + } + + // Scan local variables if stack frame has been allocated. + size = frame->varp - frame->sp; + if(thechar != '6' && thechar != '8') + minsize = sizeof(uintptr); + else + minsize = 0; + if(size > minsize) { + stackmap = runtime·funcdata(f, FUNCDATA_LocalsPointerMaps); + if(stackmap == nil || stackmap->n <= 0) { + runtime·printf("runtime: frame %s untyped locals %p+%p\n", runtime·funcname(f), (byte*)(frame->varp-size), size); + runtime·throw("missing stackmap"); + } + + // Locals bitmap information, scan just the pointers in locals. 
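runtime·stackmapdata above selects one entry of a stack map: each entry holds nbit bits, padded to whole 32-bit words, and the pcdata value picked in scanframe chooses which entry describes the frame's locals at the current PC. A sketch of just the entry addressing, with illustrative types:

package main

import "fmt"

type bitVector struct {
	n        int32 // number of bits
	bytedata []byte
}

type stackMap struct {
	n        int32  // number of entries (one per PCDATA value)
	nbit     int32  // bits per entry
	bytedata []byte // n entries, each (nbit+31)/32*4 bytes
}

// stackmapdata returns entry i, mirroring runtime·stackmapdata's addressing.
func stackmapdata(m *stackMap, i int32) bitVector {
	entryBytes := (m.nbit + 31) / 32 * 4
	return bitVector{m.nbit, m.bytedata[i*entryBytes : (i+1)*entryBytes]}
}

func main() {
	// Hypothetical map: 3 entries, 10 bits each -> 4 bytes per entry.
	m := &stackMap{n: 3, nbit: 10, bytedata: make([]byte, 12)}
	m.bytedata[4] = 0xAA // first byte of entry 1
	bv := stackmapdata(m, 1)
	fmt.Println(len(bv.bytedata), bv.bytedata[0]) // 4 170
}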
+ if(pcdata < 0 || pcdata >= stackmap->n) { + // don't know where we are + runtime·printf("runtime: pcdata is %d and %d locals stack map entries for %s (targetpc=%p)\n", + pcdata, stackmap->n, runtime·funcname(f), targetpc); + runtime·throw("scanframe: bad symbol table"); + } + bv = runtime·stackmapdata(stackmap, pcdata); + size = (bv.n * PtrSize) / BitsPerPointer; + scanblock((byte*)(frame->varp - size), bv.n/BitsPerPointer*PtrSize, bv.bytedata); + } + + // Scan arguments. + if(frame->arglen > 0) { + if(frame->argmap != nil) + bv = *frame->argmap; + else { + stackmap = runtime·funcdata(f, FUNCDATA_ArgsPointerMaps); + if(stackmap == nil || stackmap->n <= 0) { + runtime·printf("runtime: frame %s untyped args %p+%p\n", runtime·funcname(f), frame->argp, (uintptr)frame->arglen); + runtime·throw("missing stackmap"); + } + if(pcdata < 0 || pcdata >= stackmap->n) { + // don't know where we are + runtime·printf("runtime: pcdata is %d and %d args stack map entries for %s (targetpc=%p)\n", + pcdata, stackmap->n, runtime·funcname(f), targetpc); + runtime·throw("scanframe: bad symbol table"); + } + bv = runtime·stackmapdata(stackmap, pcdata); + } + scanblock((byte*)frame->argp, bv.n/BitsPerPointer*PtrSize, bv.bytedata); + } + return true; +} + +static void +scanstack(G *gp) +{ + M *mp; + bool (*fn)(Stkframe*, void*); + + if(runtime·readgstatus(gp)&Gscan == 0) { + runtime·printf("runtime: gp=%p, goid=%D, gp->atomicstatus=%d\n", gp, gp->goid, runtime·readgstatus(gp)); + runtime·throw("mark - bad status"); + } + + switch(runtime·readgstatus(gp)&~Gscan) { + default: + runtime·printf("runtime: gp=%p, goid=%D, gp->atomicstatus=%d\n", gp, gp->goid, runtime·readgstatus(gp)); + runtime·throw("mark - bad status"); + case Gdead: + return; + case Grunning: + runtime·printf("runtime: gp=%p, goid=%D, gp->atomicstatus=%d\n", gp, gp->goid, runtime·readgstatus(gp)); + runtime·throw("mark - world not stopped"); + case Grunnable: + case Gsyscall: + case Gwaiting: + break; + } + + if(gp == g) + runtime·throw("can't scan our own stack"); + if((mp = gp->m) != nil && mp->helpgc) + runtime·throw("can't scan gchelper stack"); + + fn = scanframe; + runtime·gentraceback(~(uintptr)0, ~(uintptr)0, 0, gp, 0, nil, 0x7fffffff, &fn, nil, 0); + runtime·tracebackdefers(gp, &fn, nil); +} + +// The gp has been moved to a gc safepoint. If there is gcphase specific +// work it is done here. +void +runtime·gcphasework(G *gp) +{ + switch(runtime·gcphase) { + default: + runtime·throw("gcphasework in bad gcphase"); + case GCoff: + case GCquiesce: + case GCstw: + case GCsweep: + // No work for now. + break; + case GCmark: + // Disabled until concurrent GC is implemented + // but indicate the scan has been done. + // scanstack(gp); + break; + } + gp->gcworkdone = true; +} + +#pragma dataflag NOPTR +static byte finalizer1[] = { + // Each Finalizer is 5 words, ptr ptr uintptr ptr ptr. + // Each byte describes 4 words. + // Need 4 Finalizers described by 5 bytes before pattern repeats: + // ptr ptr uintptr ptr ptr + // ptr ptr uintptr ptr ptr + // ptr ptr uintptr ptr ptr + // ptr ptr uintptr ptr ptr + // aka + // ptr ptr uintptr ptr + // ptr ptr ptr uintptr + // ptr ptr ptr ptr + // uintptr ptr ptr ptr + // ptr uintptr ptr ptr + // Assumptions about Finalizer layout checked below. 
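The five bytes that follow can be reproduced mechanically: lay out the 2-bit descriptors for four consecutive 5-word Finalizers (20 words, which is exactly 5 mask bytes, after which the pattern repeats) and pack them four words per byte. A short Go check of that claim:

package main

import "fmt"

const (
	bitsScalar  = 1
	bitsPointer = 2
)

func main() {
	// One Finalizer is five words (fn, arg, nret, fint, ot): ptr ptr scalar ptr ptr.
	finalizer := []byte{bitsPointer, bitsPointer, bitsScalar, bitsPointer, bitsPointer}

	// Four Finalizers = 20 words; at 4 words per mask byte that is exactly
	// 5 bytes, after which the pattern repeats (lcm of 5 and 4 is 20).
	var words []byte
	for i := 0; i < 4; i++ {
		words = append(words, finalizer...)
	}
	var mask []byte
	for i := 0; i < len(words); i += 4 {
		mask = append(mask, words[i]|words[i+1]<<2|words[i+2]<<4|words[i+3]<<6)
	}
	fmt.Printf("% x\n", mask) // 9a 6a aa a9 a6 -- the five finalizer1 bytes
}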
+ BitsPointer | BitsPointer<<2 | BitsScalar<<4 | BitsPointer<<6, + BitsPointer | BitsPointer<<2 | BitsPointer<<4 | BitsScalar<<6, + BitsPointer | BitsPointer<<2 | BitsPointer<<4 | BitsPointer<<6, + BitsScalar | BitsPointer<<2 | BitsPointer<<4 | BitsPointer<<6, + BitsPointer | BitsScalar<<2 | BitsPointer<<4 | BitsPointer<<6, +}; + +void +runtime·queuefinalizer(byte *p, FuncVal *fn, uintptr nret, Type *fint, PtrType *ot) +{ + FinBlock *block; + Finalizer *f; + int32 i; + + runtime·lock(&runtime·finlock); + if(runtime·finq == nil || runtime·finq->cnt == runtime·finq->cap) { + if(runtime·finc == nil) { + runtime·finc = runtime·persistentalloc(FinBlockSize, 0, &mstats.gc_sys); + runtime·finc->cap = (FinBlockSize - sizeof(FinBlock)) / sizeof(Finalizer) + 1; + runtime·finc->alllink = runtime·allfin; + runtime·allfin = runtime·finc; + if(finptrmask[0] == 0) { + // Build pointer mask for Finalizer array in block. + // Check assumptions made in finalizer1 array above. + if(sizeof(Finalizer) != 5*PtrSize || + offsetof(Finalizer, fn) != 0 || + offsetof(Finalizer, arg) != PtrSize || + offsetof(Finalizer, nret) != 2*PtrSize || + offsetof(Finalizer, fint) != 3*PtrSize || + offsetof(Finalizer, ot) != 4*PtrSize || + BitsPerPointer != 2) { + runtime·throw("finalizer out of sync"); + } + for(i=0; i<nelem(finptrmask); i++) + finptrmask[i] = finalizer1[i%nelem(finalizer1)]; + } + } + block = runtime·finc; + runtime·finc = block->next; + block->next = runtime·finq; + runtime·finq = block; + } + f = &runtime·finq->fin[runtime·finq->cnt]; + runtime·finq->cnt++; + f->fn = fn; + f->nret = nret; + f->fint = fint; + f->ot = ot; + f->arg = p; + runtime·fingwake = true; + runtime·unlock(&runtime·finlock); +} + +void +runtime·iterate_finq(void (*callback)(FuncVal*, byte*, uintptr, Type*, PtrType*)) +{ + FinBlock *fb; + Finalizer *f; + uintptr i; + + for(fb = runtime·allfin; fb; fb = fb->alllink) { + for(i = 0; i < fb->cnt; i++) { + f = &fb->fin[i]; + callback(f->fn, f->arg, f->nret, f->fint, f->ot); + } + } +} + +void +runtime·MSpan_EnsureSwept(MSpan *s) +{ + uint32 sg; + + // Caller must disable preemption. + // Otherwise when this function returns the span can become unswept again + // (if GC is triggered on another goroutine). + if(g->m->locks == 0 && g->m->mallocing == 0 && g != g->m->g0) + runtime·throw("MSpan_EnsureSwept: m is not locked"); + + sg = runtime·mheap.sweepgen; + if(runtime·atomicload(&s->sweepgen) == sg) + return; + if(runtime·cas(&s->sweepgen, sg-2, sg-1)) { + runtime·MSpan_Sweep(s, false); + return; + } + // unfortunate condition, and we don't have efficient means to wait + while(runtime·atomicload(&s->sweepgen) != sg) + runtime·osyield(); +} + +// Sweep frees or collects finalizers for blocks not marked in the mark phase. +// It clears the mark bits in preparation for the next GC round. +// Returns true if the span was returned to heap. +// If preserve=true, don't return it to heap nor relink in MCentral lists; +// caller takes care of it. +bool +runtime·MSpan_Sweep(MSpan *s, bool preserve) +{ + int32 cl, n, npages, nfree; + uintptr size, off, step; + uint32 sweepgen; + byte *p, *bitp, shift, xbits, bits; + MCache *c; + byte *arena_start; + MLink head, *end, *link; + Special *special, **specialp, *y; + bool res, sweepgenset; + + // It's critical that we enter this function with preemption disabled, + // GC must not start while we are in the middle of this function. 
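MSpan_EnsureSwept above, like runtime·sweepone further down, relies on the span's sweepgen relative to mheap.sweepgen (which advances by 2 per GC): sg-2 means not yet swept, sg-1 means being swept, sg means swept. A compressed model of claiming a span, using sync/atomic in place of runtime·cas; in the real code MSpan_Sweep updates sweepgen itself:

package main

import (
	"fmt"
	"sync/atomic"
)

type span struct{ sweepgen uint32 }

// ensureSwept claims and sweeps the span if nobody else has,
// following the sg-2 / sg-1 / sg convention.
func ensureSwept(s *span, sg uint32, sweep func(*span)) {
	if atomic.LoadUint32(&s.sweepgen) == sg {
		return // already swept for this cycle
	}
	if atomic.CompareAndSwapUint32(&s.sweepgen, sg-2, sg-1) {
		sweep(s)
		atomic.StoreUint32(&s.sweepgen, sg) // mark swept (done inside MSpan_Sweep in the runtime)
		return
	}
	// Another sweeper holds the span; the runtime spins with osyield
	// until sweepgen reaches sg.
	for atomic.LoadUint32(&s.sweepgen) != sg {
	}
}

func main() {
	const sg = 4 // heap sweepgen after two GC cycles
	s := &span{sweepgen: sg - 2}
	ensureSwept(s, sg, func(s *span) { fmt.Println("sweeping") })
	fmt.Println(s.sweepgen == sg) // true
}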
+ if(g->m->locks == 0 && g->m->mallocing == 0 && g != g->m->g0) + runtime·throw("MSpan_Sweep: m is not locked"); + sweepgen = runtime·mheap.sweepgen; + if(s->state != MSpanInUse || s->sweepgen != sweepgen-1) { + runtime·printf("MSpan_Sweep: state=%d sweepgen=%d mheap.sweepgen=%d\n", + s->state, s->sweepgen, sweepgen); + runtime·throw("MSpan_Sweep: bad span state"); + } + arena_start = runtime·mheap.arena_start; + cl = s->sizeclass; + size = s->elemsize; + if(cl == 0) { + n = 1; + } else { + // Chunk full of small blocks. + npages = runtime·class_to_allocnpages[cl]; + n = (npages << PageShift) / size; + } + res = false; + nfree = 0; + end = &head; + c = g->m->mcache; + sweepgenset = false; + + // Mark any free objects in this span so we don't collect them. + for(link = s->freelist; link != nil; link = link->next) { + off = (uintptr*)link - (uintptr*)arena_start; + bitp = arena_start - off/wordsPerBitmapByte - 1; + shift = (off % wordsPerBitmapByte) * gcBits; + *bitp |= bitMarked<<shift; + } + + // Unlink & free special records for any objects we're about to free. + specialp = &s->specials; + special = *specialp; + while(special != nil) { + // A finalizer can be set for an inner byte of an object, find object beginning. + p = (byte*)(s->start << PageShift) + special->offset/size*size; + off = (uintptr*)p - (uintptr*)arena_start; + bitp = arena_start - off/wordsPerBitmapByte - 1; + shift = (off % wordsPerBitmapByte) * gcBits; + bits = (*bitp>>shift) & bitMask; + if((bits&bitMarked) == 0) { + // Find the exact byte for which the special was setup + // (as opposed to object beginning). + p = (byte*)(s->start << PageShift) + special->offset; + // about to free object: splice out special record + y = special; + special = special->next; + *specialp = special; + if(!runtime·freespecial(y, p, size, false)) { + // stop freeing of object if it has a finalizer + *bitp |= bitMarked << shift; + } + } else { + // object is still live: keep special record + specialp = &special->next; + special = *specialp; + } + } + + // Sweep through n objects of given size starting at p. + // This thread owns the span now, so it can manipulate + // the block bitmap without atomic operations. + p = (byte*)(s->start << PageShift); + // Find bits for the beginning of the span. + off = (uintptr*)p - (uintptr*)arena_start; + bitp = arena_start - off/wordsPerBitmapByte - 1; + shift = 0; + step = size/(PtrSize*wordsPerBitmapByte); + // Rewind to the previous quadruple as we move to the next + // in the beginning of the loop. + bitp += step; + if(step == 0) { + // 8-byte objects. + bitp++; + shift = gcBits; + } + for(; n > 0; n--, p += size) { + bitp -= step; + if(step == 0) { + if(shift != 0) + bitp--; + shift = gcBits - shift; + } + + xbits = *bitp; + bits = (xbits>>shift) & bitMask; + + // Allocated and marked object, reset bits to allocated. + if((bits&bitMarked) != 0) { + *bitp &= ~(bitMarked<<shift); + continue; + } + // At this point we know that we are looking at garbage object + // that needs to be collected. + if(runtime·debug.allocfreetrace) + runtime·tracefree(p, size); + // Reset to allocated+noscan. + *bitp = (xbits & ~((bitMarked|(BitsMask<<2))<<shift)) | ((uintptr)BitsDead<<(shift+2)); + if(cl == 0) { + // Free large span. 
+ if(preserve) + runtime·throw("can't preserve large span"); + runtime·unmarkspan(p, s->npages<<PageShift); + s->needzero = 1; + // important to set sweepgen before returning it to heap + runtime·atomicstore(&s->sweepgen, sweepgen); + sweepgenset = true; + // NOTE(rsc,dvyukov): The original implementation of efence + // in CL 22060046 used SysFree instead of SysFault, so that + // the operating system would eventually give the memory + // back to us again, so that an efence program could run + // longer without running out of memory. Unfortunately, + // calling SysFree here without any kind of adjustment of the + // heap data structures means that when the memory does + // come back to us, we have the wrong metadata for it, either in + // the MSpan structures or in the garbage collection bitmap. + // Using SysFault here means that the program will run out of + // memory fairly quickly in efence mode, but at least it won't + // have mysterious crashes due to confused memory reuse. + // It should be possible to switch back to SysFree if we also + // implement and then call some kind of MHeap_DeleteSpan. + if(runtime·debug.efence) { + s->limit = nil; // prevent mlookup from finding this span + runtime·SysFault(p, size); + } else + runtime·MHeap_Free(&runtime·mheap, s, 1); + c->local_nlargefree++; + c->local_largefree += size; + runtime·xadd64(&mstats.next_gc, -(uint64)(size * (runtime·gcpercent + 100)/100)); + res = true; + } else { + // Free small object. + if(size > 2*sizeof(uintptr)) + ((uintptr*)p)[1] = (uintptr)0xdeaddeaddeaddeadll; // mark as "needs to be zeroed" + else if(size > sizeof(uintptr)) + ((uintptr*)p)[1] = 0; + + end->next = (MLink*)p; + end = (MLink*)p; + nfree++; + } + } + + // We need to set s->sweepgen = h->sweepgen only when all blocks are swept, + // because of the potential for a concurrent free/SetFinalizer. + // But we need to set it before we make the span available for allocation + // (return it to heap or mcentral), because allocation code assumes that a + // span is already swept if available for allocation. + + if(!sweepgenset && nfree == 0) { + // The span must be in our exclusive ownership until we update sweepgen, + // check for potential races. + if(s->state != MSpanInUse || s->sweepgen != sweepgen-1) { + runtime·printf("MSpan_Sweep: state=%d sweepgen=%d mheap.sweepgen=%d\n", + s->state, s->sweepgen, sweepgen); + runtime·throw("MSpan_Sweep: bad span state after sweep"); + } + runtime·atomicstore(&s->sweepgen, sweepgen); + } + if(nfree > 0) { + c->local_nsmallfree[cl] += nfree; + c->local_cachealloc -= nfree * size; + runtime·xadd64(&mstats.next_gc, -(uint64)(nfree * size * (runtime·gcpercent + 100)/100)); + res = runtime·MCentral_FreeSpan(&runtime·mheap.central[cl].mcentral, s, nfree, head.next, end, preserve); + // MCentral_FreeSpan updates sweepgen + } + return res; +} + +// State of background runtime·sweep. +// Protected by runtime·gclock. 
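The two xadd64 calls just above are where the "next_gc is decremented by GOGC*newly_freed_memory" accounting from the top-of-file comment happens: each freed object lowers next_gc by size*(GOGC+100)/100. A tiny worked example with GOGC=100 and invented sizes:

package main

import "fmt"

func main() {
	const gcpercent = 100 // GOGC=100

	// At the end of a GC the live heap size is unknown, so next_gc is set
	// conservatively from the allocated heap (see gc() later in this file):
	heapAlloc := uint64(64 << 20) // 64 MB allocated, liveness unknown
	nextGC := heapAlloc + heapAlloc*gcpercent/100
	fmt.Println(nextGC >> 20) // 128 (MB)

	// Sweeping then discovers garbage. Freeing a 1 MB object pulls the
	// trigger point down by (gcpercent+100)/100 times its size:
	freed := uint64(1 << 20)
	nextGC -= freed * (gcpercent + 100) / 100
	fmt.Println(nextGC >> 20) // 126 (MB)
}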
+typedef struct SweepData SweepData; +struct SweepData +{ + G* g; + bool parked; + + uint32 spanidx; // background sweeper position + + uint32 nbgsweep; + uint32 npausesweep; +}; +SweepData runtime·sweep; + +// sweeps one span +// returns number of pages returned to heap, or -1 if there is nothing to sweep +uintptr +runtime·sweepone(void) +{ + MSpan *s; + uint32 idx, sg; + uintptr npages; + + // increment locks to ensure that the goroutine is not preempted + // in the middle of sweep thus leaving the span in an inconsistent state for next GC + g->m->locks++; + sg = runtime·mheap.sweepgen; + for(;;) { + idx = runtime·xadd(&runtime·sweep.spanidx, 1) - 1; + if(idx >= runtime·work.nspan) { + runtime·mheap.sweepdone = true; + g->m->locks--; + return -1; + } + s = runtime·work.spans[idx]; + if(s->state != MSpanInUse) { + s->sweepgen = sg; + continue; + } + if(s->sweepgen != sg-2 || !runtime·cas(&s->sweepgen, sg-2, sg-1)) + continue; + npages = s->npages; + if(!runtime·MSpan_Sweep(s, false)) + npages = 0; + g->m->locks--; + return npages; + } +} + +static void +sweepone_m(void) +{ + g->m->scalararg[0] = runtime·sweepone(); +} + +#pragma textflag NOSPLIT +uintptr +runtime·gosweepone(void) +{ + void (*fn)(void); + + fn = sweepone_m; + runtime·onM(&fn); + return g->m->scalararg[0]; +} + +#pragma textflag NOSPLIT +bool +runtime·gosweepdone(void) +{ + return runtime·mheap.sweepdone; +} + +void +runtime·gchelper(void) +{ + uint32 nproc; + + g->m->traceback = 2; + gchelperstart(); + + // parallel mark for over gc roots + runtime·parfordo(runtime·work.markfor); + + // help other threads scan secondary blocks + scanblock(nil, 0, nil); + + nproc = runtime·work.nproc; // runtime·work.nproc can change right after we increment runtime·work.ndone + if(runtime·xadd(&runtime·work.ndone, +1) == nproc-1) + runtime·notewakeup(&runtime·work.alldone); + g->m->traceback = 0; +} + +static void +cachestats(void) +{ + MCache *c; + P *p, **pp; + + for(pp=runtime·allp; p=*pp; pp++) { + c = p->mcache; + if(c==nil) + continue; + runtime·purgecachedstats(c); + } +} + +static void +flushallmcaches(void) +{ + P *p, **pp; + MCache *c; + + // Flush MCache's to MCentral. + for(pp=runtime·allp; p=*pp; pp++) { + c = p->mcache; + if(c==nil) + continue; + runtime·MCache_ReleaseAll(c); + runtime·stackcache_clear(c); + } +} + +static void +flushallmcaches_m(G *gp) +{ + flushallmcaches(); + runtime·gogo(&gp->sched); +} + +void +runtime·updatememstats(GCStats *stats) +{ + M *mp; + MSpan *s; + int32 i; + uint64 smallfree; + uint64 *src, *dst; + void (*fn)(G*); + + if(stats) + runtime·memclr((byte*)stats, sizeof(*stats)); + for(mp=runtime·allm; mp; mp=mp->alllink) { + if(stats) { + src = (uint64*)&mp->gcstats; + dst = (uint64*)stats; + for(i=0; i<sizeof(*stats)/sizeof(uint64); i++) + dst[i] += src[i]; + runtime·memclr((byte*)&mp->gcstats, sizeof(mp->gcstats)); + } + } + mstats.mcache_inuse = runtime·mheap.cachealloc.inuse; + mstats.mspan_inuse = runtime·mheap.spanalloc.inuse; + mstats.sys = mstats.heap_sys + mstats.stacks_sys + mstats.mspan_sys + + mstats.mcache_sys + mstats.buckhash_sys + mstats.gc_sys + mstats.other_sys; + + // Calculate memory allocator stats. + // During program execution we only count number of frees and amount of freed memory. + // Current number of alive object in the heap and amount of alive heap memory + // are calculated by scanning all spans. + // Total number of mallocs is calculated as number of frees plus number of alive objects. 
+ // Similarly, total amount of allocated memory is calculated as amount of freed memory + // plus amount of alive heap memory. + mstats.alloc = 0; + mstats.total_alloc = 0; + mstats.nmalloc = 0; + mstats.nfree = 0; + for(i = 0; i < nelem(mstats.by_size); i++) { + mstats.by_size[i].nmalloc = 0; + mstats.by_size[i].nfree = 0; + } + + // Flush MCache's to MCentral. + if(g == g->m->g0) + flushallmcaches(); + else { + fn = flushallmcaches_m; + runtime·mcall(&fn); + } + + // Aggregate local stats. + cachestats(); + + // Scan all spans and count number of alive objects. + runtime·lock(&runtime·mheap.lock); + for(i = 0; i < runtime·mheap.nspan; i++) { + s = runtime·mheap.allspans[i]; + if(s->state != MSpanInUse) + continue; + if(s->sizeclass == 0) { + mstats.nmalloc++; + mstats.alloc += s->elemsize; + } else { + mstats.nmalloc += s->ref; + mstats.by_size[s->sizeclass].nmalloc += s->ref; + mstats.alloc += s->ref*s->elemsize; + } + } + runtime·unlock(&runtime·mheap.lock); + + // Aggregate by size class. + smallfree = 0; + mstats.nfree = runtime·mheap.nlargefree; + for(i = 0; i < nelem(mstats.by_size); i++) { + mstats.nfree += runtime·mheap.nsmallfree[i]; + mstats.by_size[i].nfree = runtime·mheap.nsmallfree[i]; + mstats.by_size[i].nmalloc += runtime·mheap.nsmallfree[i]; + smallfree += runtime·mheap.nsmallfree[i] * runtime·class_to_size[i]; + } + mstats.nfree += mstats.tinyallocs; + mstats.nmalloc += mstats.nfree; + + // Calculate derived stats. + mstats.total_alloc = mstats.alloc + runtime·mheap.largefree + smallfree; + mstats.heap_alloc = mstats.alloc; + mstats.heap_objects = mstats.nmalloc - mstats.nfree; +} + +// Structure of arguments passed to function gc(). +// This allows the arguments to be passed via runtime·mcall. +struct gc_args +{ + int64 start_time; // start time of GC in ns (just before stoptheworld) + bool eagersweep; +}; + +static void gc(struct gc_args *args); + +int32 +runtime·readgogc(void) +{ + byte *p; + + p = runtime·getenv("GOGC"); + if(p == nil || p[0] == '\0') + return 100; + if(runtime·strcmp(p, (byte*)"off") == 0) + return -1; + return runtime·atoi(p); +} + +void +runtime·gcinit(void) +{ + if(sizeof(Workbuf) != WorkbufSize) + runtime·throw("runtime: size of Workbuf is suboptimal"); + + runtime·work.markfor = runtime·parforalloc(MaxGcproc); + runtime·gcpercent = runtime·readgogc(); + runtime·gcdatamask = unrollglobgcprog(runtime·gcdata, runtime·edata - runtime·data); + runtime·gcbssmask = unrollglobgcprog(runtime·gcbss, runtime·ebss - runtime·bss); +} + +void +runtime·gc_m(void) +{ + struct gc_args a; + G *gp; + + gp = g->m->curg; + runtime·casgstatus(gp, Grunning, Gwaiting); + gp->waitreason = runtime·gostringnocopy((byte*)"garbage collection"); + + a.start_time = (uint64)(g->m->scalararg[0]) | ((uint64)(g->m->scalararg[1]) << 32); + a.eagersweep = g->m->scalararg[2]; + gc(&a); + + if(nbadblock > 0) { + // Work out path from root to bad block. + for(;;) { + gc(&a); + if(nbadblock >= nelem(badblock)) + runtime·throw("cannot find path to bad pointer"); + } + } + + runtime·casgstatus(gp, Gwaiting, Grunning); +} + +static void +gc(struct gc_args *args) +{ + int64 t0, t1, t2, t3, t4; + uint64 heap0, heap1, obj; + GCStats stats; + + if(DebugPtrs) + runtime·printf("GC start\n"); + + if(runtime·debug.allocfreetrace) + runtime·tracegc(); + + g->m->traceback = 2; + t0 = args->start_time; + runtime·work.tstart = args->start_time; + + t1 = 0; + if(runtime·debug.gctrace) + t1 = runtime·nanotime(); + + // Sweep what is not sweeped by bgsweep. 
+ while(runtime·sweepone() != -1) + runtime·sweep.npausesweep++; + + // Cache runtime.mheap.allspans in work.spans to avoid conflicts with + // resizing/freeing allspans. + // New spans can be created while GC progresses, but they are not garbage for + // this round: + // - new stack spans can be created even while the world is stopped. + // - new malloc spans can be created during the concurrent sweep + + // Even if this is stop-the-world, a concurrent exitsyscall can allocate a stack from heap. + runtime·lock(&runtime·mheap.lock); + // Free the old cached sweep array if necessary. + if(runtime·work.spans != nil && runtime·work.spans != runtime·mheap.allspans) + runtime·SysFree(runtime·work.spans, runtime·work.nspan*sizeof(runtime·work.spans[0]), &mstats.other_sys); + // Cache the current array for marking. + runtime·mheap.gcspans = runtime·mheap.allspans; + runtime·work.spans = runtime·mheap.allspans; + runtime·work.nspan = runtime·mheap.nspan; + runtime·unlock(&runtime·mheap.lock); + + runtime·work.nwait = 0; + runtime·work.ndone = 0; + runtime·work.nproc = runtime·gcprocs(); + runtime·parforsetup(runtime·work.markfor, runtime·work.nproc, RootCount + runtime·allglen, nil, false, markroot); + if(runtime·work.nproc > 1) { + runtime·noteclear(&runtime·work.alldone); + runtime·helpgc(runtime·work.nproc); + } + + t2 = 0; + if(runtime·debug.gctrace) + t2 = runtime·nanotime(); + + gchelperstart(); + runtime·parfordo(runtime·work.markfor); + scanblock(nil, 0, nil); + + t3 = 0; + if(runtime·debug.gctrace) + t3 = runtime·nanotime(); + + if(runtime·work.nproc > 1) + runtime·notesleep(&runtime·work.alldone); + + runtime·shrinkfinish(); + + cachestats(); + // next_gc calculation is tricky with concurrent sweep since we don't know size of live heap + // estimate what was live heap size after previous GC (for tracing only) + heap0 = mstats.next_gc*100/(runtime·gcpercent+100); + // conservatively set next_gc to high value assuming that everything is live + // concurrent/lazy sweep will reduce this number while discovering new garbage + mstats.next_gc = mstats.heap_alloc+mstats.heap_alloc*runtime·gcpercent/100; + + t4 = runtime·nanotime(); + runtime·atomicstore64(&mstats.last_gc, runtime·unixnanotime()); // must be Unix time to make sense to user + mstats.pause_ns[mstats.numgc%nelem(mstats.pause_ns)] = t4 - t0; + mstats.pause_end[mstats.numgc%nelem(mstats.pause_end)] = t4; + mstats.pause_total_ns += t4 - t0; + mstats.numgc++; + if(mstats.debuggc) + runtime·printf("pause %D\n", t4-t0); + + if(runtime·debug.gctrace) { + heap1 = mstats.heap_alloc; + runtime·updatememstats(&stats); + if(heap1 != mstats.heap_alloc) { + runtime·printf("runtime: mstats skew: heap=%D/%D\n", heap1, mstats.heap_alloc); + runtime·throw("mstats skew"); + } + obj = mstats.nmalloc - mstats.nfree; + + stats.nprocyield += runtime·work.markfor->nprocyield; + stats.nosyield += runtime·work.markfor->nosyield; + stats.nsleep += runtime·work.markfor->nsleep; + + runtime·printf("gc%d(%d): %D+%D+%D+%D us, %D -> %D MB, %D (%D-%D) objects," + " %d goroutines," + " %d/%d/%d sweeps," + " %D(%D) handoff, %D(%D) steal, %D/%D/%D yields\n", + mstats.numgc, runtime·work.nproc, (t1-t0)/1000, (t2-t1)/1000, (t3-t2)/1000, (t4-t3)/1000, + heap0>>20, heap1>>20, obj, + mstats.nmalloc, mstats.nfree, + runtime·gcount(), + runtime·work.nspan, runtime·sweep.nbgsweep, runtime·sweep.npausesweep, + stats.nhandoff, stats.nhandoffcnt, + runtime·work.markfor->nsteal, runtime·work.markfor->nstealcnt, + stats.nprocyield, stats.nosyield, stats.nsleep); + 
runtime·sweep.nbgsweep = runtime·sweep.npausesweep = 0; + } + + // See the comment in the beginning of this function as to why we need the following. + // Even if this is still stop-the-world, a concurrent exitsyscall can allocate a stack from heap. + runtime·lock(&runtime·mheap.lock); + // Free the old cached mark array if necessary. + if(runtime·work.spans != nil && runtime·work.spans != runtime·mheap.allspans) + runtime·SysFree(runtime·work.spans, runtime·work.nspan*sizeof(runtime·work.spans[0]), &mstats.other_sys); + // Cache the current array for sweeping. + runtime·mheap.gcspans = runtime·mheap.allspans; + runtime·mheap.sweepgen += 2; + runtime·mheap.sweepdone = false; + runtime·work.spans = runtime·mheap.allspans; + runtime·work.nspan = runtime·mheap.nspan; + runtime·sweep.spanidx = 0; + runtime·unlock(&runtime·mheap.lock); + + if(ConcurrentSweep && !args->eagersweep) { + runtime·lock(&runtime·gclock); + if(runtime·sweep.g == nil) + runtime·sweep.g = runtime·newproc1(&bgsweepv, nil, 0, 0, gc); + else if(runtime·sweep.parked) { + runtime·sweep.parked = false; + runtime·ready(runtime·sweep.g); + } + runtime·unlock(&runtime·gclock); + } else { + // Sweep all spans eagerly. + while(runtime·sweepone() != -1) + runtime·sweep.npausesweep++; + // Do an additional mProf_GC, because all 'free' events are now real as well. + runtime·mProf_GC(); + } + + runtime·mProf_GC(); + g->m->traceback = 0; + + if(DebugPtrs) + runtime·printf("GC end\n"); +} + +extern uintptr runtime·sizeof_C_MStats; + +static void readmemstats_m(void); + +void +runtime·readmemstats_m(void) +{ + MStats *stats; + + stats = g->m->ptrarg[0]; + g->m->ptrarg[0] = nil; + + runtime·updatememstats(nil); + // Size of the trailing by_size array differs between Go and C, + // NumSizeClasses was changed, but we can not change Go struct because of backward compatibility. + runtime·memmove(stats, &mstats, runtime·sizeof_C_MStats); + + // Stack numbers are part of the heap numbers, separate those out for user consumption + stats->stacks_sys = stats->stacks_inuse; + stats->heap_inuse -= stats->stacks_inuse; + stats->heap_sys -= stats->stacks_inuse; +} + +static void readgcstats_m(void); + +#pragma textflag NOSPLIT +void +runtime∕debug·readGCStats(Slice *pauses) +{ + void (*fn)(void); + + g->m->ptrarg[0] = pauses; + fn = readgcstats_m; + runtime·onM(&fn); +} + +static void +readgcstats_m(void) +{ + Slice *pauses; + uint64 *p; + uint32 i, j, n; + + pauses = g->m->ptrarg[0]; + g->m->ptrarg[0] = nil; + + // Calling code in runtime/debug should make the slice large enough. + if(pauses->cap < nelem(mstats.pause_ns)+3) + runtime·throw("runtime: short slice passed to readGCStats"); + + // Pass back: pauses, pause ends, last gc (absolute time), number of gc, total pause ns. + p = (uint64*)pauses->array; + runtime·lock(&runtime·mheap.lock); + + n = mstats.numgc; + if(n > nelem(mstats.pause_ns)) + n = nelem(mstats.pause_ns); + + // The pause buffer is circular. The most recent pause is at + // pause_ns[(numgc-1)%nelem(pause_ns)], and then backward + // from there to go back farther in time. We deliver the times + // most recent first (in p[0]). 
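A sketch of the circular-buffer read that follows: the newest pause sits at (numgc-1)%nelem(pause_ns) and the loop walks backwards so p[0] is the most recent entry. The ring is shrunk to four entries and the pause times are invented:

package main

import "fmt"

func main() {
	// Pretend the runtime recorded 6 GCs into a 4-entry ring.
	var pauseNS [4]uint64
	numgc := uint32(6)
	for g := uint32(1); g <= numgc; g++ {
		pauseNS[(g-1)%uint32(len(pauseNS))] = uint64(g * 100) // g-th pause took g*100 ns
	}

	// Deliver the pauses most recent first, as readgcstats_m does.
	n := numgc
	if n > uint32(len(pauseNS)) {
		n = uint32(len(pauseNS))
	}
	out := make([]uint64, n)
	for i := uint32(0); i < n; i++ {
		j := (numgc - 1 - i) % uint32(len(pauseNS))
		out[i] = pauseNS[j]
	}
	fmt.Println(out) // [600 500 400 300]
}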
+ for(i=0; i<n; i++) { + j = (mstats.numgc-1-i)%nelem(mstats.pause_ns); + p[i] = mstats.pause_ns[j]; + p[n+i] = mstats.pause_end[j]; + } + + p[n+n] = mstats.last_gc; + p[n+n+1] = mstats.numgc; + p[n+n+2] = mstats.pause_total_ns; + runtime·unlock(&runtime·mheap.lock); + pauses->len = n+n+3; +} + +void +runtime·setgcpercent_m(void) +{ + int32 in; + int32 out; + + in = (int32)(intptr)g->m->scalararg[0]; + + runtime·lock(&runtime·mheap.lock); + out = runtime·gcpercent; + if(in < 0) + in = -1; + runtime·gcpercent = in; + runtime·unlock(&runtime·mheap.lock); + + g->m->scalararg[0] = (uintptr)(intptr)out; +} + +static void +gchelperstart(void) +{ + if(g->m->helpgc < 0 || g->m->helpgc >= MaxGcproc) + runtime·throw("gchelperstart: bad m->helpgc"); + if(g != g->m->g0) + runtime·throw("gchelper not running on g0 stack"); +} + +G* +runtime·wakefing(void) +{ + G *res; + + res = nil; + runtime·lock(&runtime·finlock); + if(runtime·fingwait && runtime·fingwake) { + runtime·fingwait = false; + runtime·fingwake = false; + res = runtime·fing; + } + runtime·unlock(&runtime·finlock); + return res; +} + +// Recursively unrolls GC program in prog. +// mask is where to store the result. +// ppos is a pointer to position in mask, in bits. +// sparse says to generate 4-bits per word mask for heap (2-bits for data/bss otherwise). +static byte* +unrollgcprog1(byte *mask, byte *prog, uintptr *ppos, bool inplace, bool sparse) +{ + uintptr pos, siz, i, off; + byte *arena_start, *prog1, v, *bitp, shift; + + arena_start = runtime·mheap.arena_start; + pos = *ppos; + for(;;) { + switch(prog[0]) { + case insData: + prog++; + siz = prog[0]; + prog++; + for(i = 0; i < siz; i++) { + v = prog[i/PointersPerByte]; + v >>= (i%PointersPerByte)*BitsPerPointer; + v &= BitsMask; + if(inplace) { + // Store directly into GC bitmap. + off = (uintptr*)(mask+pos) - (uintptr*)arena_start; + bitp = arena_start - off/wordsPerBitmapByte - 1; + shift = (off % wordsPerBitmapByte) * gcBits; + if(shift==0) + *bitp = 0; + *bitp |= v<<(shift+2); + pos += PtrSize; + } else if(sparse) { + // 4-bits per word + v <<= (pos%8)+2; + mask[pos/8] |= v; + pos += gcBits; + } else { + // 2-bits per word + v <<= pos%8; + mask[pos/8] |= v; + pos += BitsPerPointer; + } + } + prog += ROUND(siz*BitsPerPointer, 8)/8; + break; + case insArray: + prog++; + siz = 0; + for(i = 0; i < PtrSize; i++) + siz = (siz<<8) + prog[PtrSize-i-1]; + prog += PtrSize; + prog1 = nil; + for(i = 0; i < siz; i++) + prog1 = unrollgcprog1(mask, prog, &pos, inplace, sparse); + if(prog1[0] != insArrayEnd) + runtime·throw("unrollgcprog: array does not end with insArrayEnd"); + prog = prog1+1; + break; + case insArrayEnd: + case insEnd: + *ppos = pos; + return prog; + default: + runtime·throw("unrollgcprog: unknown instruction"); + } + } +} + +// Unrolls GC program prog for data/bss, returns dense GC mask. 
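unrollgcprog1 above walks the byte-coded program; in the dense data/bss case an insData block simply contributes DataSize packed 2-bit descriptors to the output mask. A stripped-down Go version that handles only insData and insEnd (no insArray, no sparse or in-place modes) and emits one descriptor value per word instead of re-packing them:

package main

import "fmt"

const (
	insData = 1
	insEnd  = 4 // insArray/insArrayEnd omitted in this sketch

	bitsPerPointer  = 2
	bitsMask        = 1<<bitsPerPointer - 1
	pointersPerByte = 8 / bitsPerPointer
)

// unroll expands a GC program into one descriptor value per word.
func unroll(prog []byte) []byte {
	var mask []byte
	for {
		switch prog[0] {
		case insData:
			siz := int(prog[1])
			data := prog[2:]
			for i := 0; i < siz; i++ {
				v := data[i/pointersPerByte] >> (uint(i%pointersPerByte) * bitsPerPointer) & bitsMask
				mask = append(mask, v)
			}
			prog = data[(siz*bitsPerPointer+7)/8:] // skip the packed descriptor bytes
		case insEnd:
			return mask
		default:
			panic("unknown instruction")
		}
	}
}

func main() {
	// insData 3, one packed byte of descriptors (pointer, scalar, pointer), insEnd.
	prog := []byte{insData, 3, 2 | 1<<2 | 2<<4, insEnd}
	fmt.Println(unroll(prog)) // [2 1 2]
}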
+static BitVector +unrollglobgcprog(byte *prog, uintptr size) +{ + byte *mask; + uintptr pos, masksize; + + masksize = ROUND(ROUND(size, PtrSize)/PtrSize*BitsPerPointer, 8)/8; + mask = runtime·persistentalloc(masksize+1, 0, &mstats.gc_sys); + mask[masksize] = 0xa1; + pos = 0; + prog = unrollgcprog1(mask, prog, &pos, false, false); + if(pos != size/PtrSize*BitsPerPointer) { + runtime·printf("unrollglobgcprog: bad program size, got %D, expect %D\n", + (uint64)pos, (uint64)size/PtrSize*BitsPerPointer); + runtime·throw("unrollglobgcprog: bad program size"); + } + if(prog[0] != insEnd) + runtime·throw("unrollglobgcprog: program does not end with insEnd"); + if(mask[masksize] != 0xa1) + runtime·throw("unrollglobgcprog: overflow"); + return (BitVector){masksize*8, mask}; +} + +void +runtime·unrollgcproginplace_m(void) +{ + uintptr size, size0, pos, off; + byte *arena_start, *prog, *bitp, shift; + Type *typ; + void *v; + + v = g->m->ptrarg[0]; + typ = g->m->ptrarg[1]; + size = g->m->scalararg[0]; + size0 = g->m->scalararg[1]; + g->m->ptrarg[0] = nil; + g->m->ptrarg[1] = nil; + + pos = 0; + prog = (byte*)typ->gc[1]; + while(pos != size0) + unrollgcprog1(v, prog, &pos, true, true); + // Mark first word as bitAllocated. + arena_start = runtime·mheap.arena_start; + off = (uintptr*)v - (uintptr*)arena_start; + bitp = arena_start - off/wordsPerBitmapByte - 1; + shift = (off % wordsPerBitmapByte) * gcBits; + *bitp |= bitBoundary<<shift; + // Mark word after last as BitsDead. + if(size0 < size) { + off = (uintptr*)((byte*)v + size0) - (uintptr*)arena_start; + bitp = arena_start - off/wordsPerBitmapByte - 1; + shift = (off % wordsPerBitmapByte) * gcBits; + *bitp &= ~(bitPtrMask<<shift) | ((uintptr)BitsDead<<(shift+2)); + } +} + +// Unrolls GC program in typ->gc[1] into typ->gc[0] +void +runtime·unrollgcprog_m(void) +{ + static Mutex lock; + Type *typ; + byte *mask, *prog; + uintptr pos; + uint32 x; + + typ = g->m->ptrarg[0]; + g->m->ptrarg[0] = nil; + + runtime·lock(&lock); + mask = (byte*)typ->gc[0]; + if(mask[0] == 0) { + pos = 8; // skip the unroll flag + prog = (byte*)typ->gc[1]; + prog = unrollgcprog1(mask, prog, &pos, false, true); + if(prog[0] != insEnd) + runtime·throw("unrollgcprog: program does not end with insEnd"); + if(((typ->size/PtrSize)%2) != 0) { + // repeat the program twice + prog = (byte*)typ->gc[1]; + unrollgcprog1(mask, prog, &pos, false, true); + } + // atomic way to say mask[0] = 1 + x = ((uint32*)mask)[0]; + runtime·atomicstore((uint32*)mask, x|1); + } + runtime·unlock(&lock); +} + +// mark the span of memory at v as having n blocks of the given size. +// if leftover is true, there is left over space at the end of the span. +void +runtime·markspan(void *v, uintptr size, uintptr n, bool leftover) +{ + uintptr i, off, step; + byte *b; + + if((byte*)v+size*n > (byte*)runtime·mheap.arena_used || (byte*)v < runtime·mheap.arena_start) + runtime·throw("markspan: bad pointer"); + + // Find bits of the beginning of the span. + off = (uintptr*)v - (uintptr*)runtime·mheap.arena_start; // word offset + b = runtime·mheap.arena_start - off/wordsPerBitmapByte - 1; + if((off%wordsPerBitmapByte) != 0) + runtime·throw("markspan: unaligned length"); + + // Okay to use non-atomic ops here, because we control + // the entire span, and each bitmap byte has bits for only + // one span, so no other goroutines are changing these bitmap words. + + if(size == PtrSize) { + // Possible only on 64-bits (minimal size class is 8 bytes). + // Poor man's memset(0x11). 
+ if(0x11 != ((bitBoundary+BitsDead)<<gcBits) + (bitBoundary+BitsDead)) + runtime·throw("markspan: bad bits"); + if((n%(wordsPerBitmapByte*PtrSize)) != 0) + runtime·throw("markspan: unaligned length"); + b = b - n/wordsPerBitmapByte + 1; // find first byte + if(((uintptr)b%PtrSize) != 0) + runtime·throw("markspan: unaligned pointer"); + for(i = 0; i != n; i += wordsPerBitmapByte*PtrSize, b += PtrSize) + *(uintptr*)b = (uintptr)0x1111111111111111ULL; // bitBoundary+BitsDead + return; + } + + if(leftover) + n++; // mark a boundary just past end of last block too + step = size/(PtrSize*wordsPerBitmapByte); + for(i = 0; i != n; i++, b -= step) + *b = bitBoundary|(BitsDead<<2); +} + +// unmark the span of memory at v of length n bytes. +void +runtime·unmarkspan(void *v, uintptr n) +{ + uintptr off; + byte *b; + + if((byte*)v+n > (byte*)runtime·mheap.arena_used || (byte*)v < runtime·mheap.arena_start) + runtime·throw("markspan: bad pointer"); + + off = (uintptr*)v - (uintptr*)runtime·mheap.arena_start; // word offset + if((off % (PtrSize*wordsPerBitmapByte)) != 0) + runtime·throw("markspan: unaligned pointer"); + b = runtime·mheap.arena_start - off/wordsPerBitmapByte - 1; + n /= PtrSize; + if(n%(PtrSize*wordsPerBitmapByte) != 0) + runtime·throw("unmarkspan: unaligned length"); + // Okay to use non-atomic ops here, because we control + // the entire span, and each bitmap word has bits for only + // one span, so no other goroutines are changing these + // bitmap words. + n /= wordsPerBitmapByte; + runtime·memclr(b - n + 1, n); +} + +void +runtime·MHeap_MapBits(MHeap *h) +{ + // Caller has added extra mappings to the arena. + // Add extra mappings of bitmap words as needed. + // We allocate extra bitmap pieces in chunks of bitmapChunk. + enum { + bitmapChunk = 8192 + }; + uintptr n; + + n = (h->arena_used - h->arena_start) / (PtrSize*wordsPerBitmapByte); + n = ROUND(n, bitmapChunk); + n = ROUND(n, PhysPageSize); + if(h->bitmap_mapped >= n) + return; + + runtime·SysMap(h->arena_start - n, n - h->bitmap_mapped, h->arena_reserved, &mstats.gc_sys); + h->bitmap_mapped = n; +} + +static bool +getgcmaskcb(Stkframe *frame, void *ctxt) +{ + Stkframe *frame0; + + frame0 = ctxt; + if(frame->sp <= frame0->sp && frame0->sp < frame->varp) { + *frame0 = *frame; + return false; + } + return true; +} + +// Returns GC type info for object p for testing. 
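getgcmask below finds a heap word's bitmap nibble with the same arithmetic used in scanblock and MSpan_Sweep: take the word offset from arena_start, divide by the two words per bitmap byte, and index backwards from arena_start. A sketch of just that address arithmetic for a 64-bit layout with invented addresses:

package main

import "fmt"

const (
	ptrSize            = 8 // 64-bit
	gcBits             = 4
	wordsPerBitmapByte = 8 / gcBits // 2
)

// bitmapAddr returns how many bytes *before* arena_start the bitmap byte
// for pointer p lives (bitp = arena_start - backOff), and the bit shift of
// p's 4-bit nibble within that byte.
func bitmapAddr(p, arenaStart uintptr) (backOff uintptr, shift uint) {
	off := (p - arenaStart) / ptrSize    // word index in the arena
	backOff = off/wordsPerBitmapByte + 1 // bitp = arena_start - off/2 - 1
	shift = uint(off%wordsPerBitmapByte) * gcBits
	return
}

func main() {
	arenaStart := uintptr(0xc000000000)
	for _, p := range []uintptr{arenaStart, arenaStart + 8, arenaStart + 16} {
		back, shift := bitmapAddr(p, arenaStart)
		fmt.Printf("word %#x -> bitmap byte arena_start-%d, shift %d\n", p, back, shift)
	}
	// Words 0 and 1 share the byte just below arena_start (shifts 0 and 4);
	// word 2 moves to the next byte down.
}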
+void +runtime·getgcmask(byte *p, Type *t, byte **mask, uintptr *len) +{ + Stkframe frame; + uintptr i, n, off; + byte *base, bits, shift, *b; + bool (*cb)(Stkframe*, void*); + + *mask = nil; + *len = 0; + + // data + if(p >= runtime·data && p < runtime·edata) { + n = ((PtrType*)t)->elem->size; + *len = n/PtrSize; + *mask = runtime·mallocgc(*len, nil, FlagNoScan); + for(i = 0; i < n; i += PtrSize) { + off = (p+i-runtime·data)/PtrSize; + bits = (runtime·gcdatamask.bytedata[off/PointersPerByte] >> ((off%PointersPerByte)*BitsPerPointer))&BitsMask; + (*mask)[i/PtrSize] = bits; + } + return; + } + // bss + if(p >= runtime·bss && p < runtime·ebss) { + n = ((PtrType*)t)->elem->size; + *len = n/PtrSize; + *mask = runtime·mallocgc(*len, nil, FlagNoScan); + for(i = 0; i < n; i += PtrSize) { + off = (p+i-runtime·bss)/PtrSize; + bits = (runtime·gcbssmask.bytedata[off/PointersPerByte] >> ((off%PointersPerByte)*BitsPerPointer))&BitsMask; + (*mask)[i/PtrSize] = bits; + } + return; + } + // heap + if(runtime·mlookup(p, &base, &n, nil)) { + *len = n/PtrSize; + *mask = runtime·mallocgc(*len, nil, FlagNoScan); + for(i = 0; i < n; i += PtrSize) { + off = (uintptr*)(base+i) - (uintptr*)runtime·mheap.arena_start; + b = runtime·mheap.arena_start - off/wordsPerBitmapByte - 1; + shift = (off % wordsPerBitmapByte) * gcBits; + bits = (*b >> (shift+2))&BitsMask; + (*mask)[i/PtrSize] = bits; + } + return; + } + // stack + frame.fn = nil; + frame.sp = (uintptr)p; + cb = getgcmaskcb; + runtime·gentraceback(g->m->curg->sched.pc, g->m->curg->sched.sp, 0, g->m->curg, 0, nil, 1000, &cb, &frame, 0); + if(frame.fn != nil) { + Func *f; + StackMap *stackmap; + BitVector bv; + uintptr size; + uintptr targetpc; + int32 pcdata; + + f = frame.fn; + targetpc = frame.continpc; + if(targetpc == 0) + return; + if(targetpc != f->entry) + targetpc--; + pcdata = runtime·pcdatavalue(f, PCDATA_StackMapIndex, targetpc); + if(pcdata == -1) + return; + stackmap = runtime·funcdata(f, FUNCDATA_LocalsPointerMaps); + if(stackmap == nil || stackmap->n <= 0) + return; + bv = runtime·stackmapdata(stackmap, pcdata); + size = bv.n/BitsPerPointer*PtrSize; + n = ((PtrType*)t)->elem->size; + *len = n/PtrSize; + *mask = runtime·mallocgc(*len, nil, FlagNoScan); + for(i = 0; i < n; i += PtrSize) { + off = (p+i-(byte*)frame.varp+size)/PtrSize; + bits = (bv.bytedata[off*BitsPerPointer/8] >> ((off*BitsPerPointer)%8))&BitsMask; + (*mask)[i/PtrSize] = bits; + } + } +} + +void runtime·gc_unixnanotime(int64 *now); + +int64 +runtime·unixnanotime(void) +{ + int64 now; + + runtime·gc_unixnanotime(&now); + return now; +} diff --git a/src/runtime/mgc0.go b/src/runtime/mgc0.go new file mode 100644 index 000000000..cbf5e9cfd --- /dev/null +++ b/src/runtime/mgc0.go @@ -0,0 +1,152 @@ +// Copyright 2012 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package runtime + +import "unsafe" + +// Called from C. Returns the Go type *m. +func gc_m_ptr(ret *interface{}) { + *ret = (*m)(nil) +} + +// Called from C. Returns the Go type *g. +func gc_g_ptr(ret *interface{}) { + *ret = (*g)(nil) +} + +// Called from C. Returns the Go type *itab. 
+func gc_itab_ptr(ret *interface{}) { + *ret = (*itab)(nil) +} + +func gc_unixnanotime(now *int64) { + sec, nsec := timenow() + *now = sec*1e9 + int64(nsec) +} + +func freeOSMemory() { + gogc(2) // force GC and do eager sweep + onM(scavenge_m) +} + +var poolcleanup func() + +func registerPoolCleanup(f func()) { + poolcleanup = f +} + +func clearpools() { + // clear sync.Pools + if poolcleanup != nil { + poolcleanup() + } + + for _, p := range &allp { + if p == nil { + break + } + // clear tinyalloc pool + if c := p.mcache; c != nil { + c.tiny = nil + c.tinysize = 0 + + // disconnect cached list before dropping it on the floor, + // so that a dangling ref to one entry does not pin all of them. + var sg, sgnext *sudog + for sg = c.sudogcache; sg != nil; sg = sgnext { + sgnext = sg.next + sg.next = nil + } + c.sudogcache = nil + } + + // clear defer pools + for i := range p.deferpool { + // disconnect cached list before dropping it on the floor, + // so that a dangling ref to one entry does not pin all of them. + var d, dlink *_defer + for d = p.deferpool[i]; d != nil; d = dlink { + dlink = d.link + d.link = nil + } + p.deferpool[i] = nil + } + } +} + +func gosweepone() uintptr +func gosweepdone() bool + +func bgsweep() { + getg().issystem = true + for { + for gosweepone() != ^uintptr(0) { + sweep.nbgsweep++ + Gosched() + } + lock(&gclock) + if !gosweepdone() { + // This can happen if a GC runs between + // gosweepone returning ^0 above + // and the lock being acquired. + unlock(&gclock) + continue + } + sweep.parked = true + goparkunlock(&gclock, "GC sweep wait") + } +} + +// NOTE: Really dst *unsafe.Pointer, src unsafe.Pointer, +// but if we do that, Go inserts a write barrier on *dst = src. +//go:nosplit +func writebarrierptr(dst *uintptr, src uintptr) { + *dst = src +} + +//go:nosplit +func writebarrierstring(dst *[2]uintptr, src [2]uintptr) { + dst[0] = src[0] + dst[1] = src[1] +} + +//go:nosplit +func writebarrierslice(dst *[3]uintptr, src [3]uintptr) { + dst[0] = src[0] + dst[1] = src[1] + dst[2] = src[2] +} + +//go:nosplit +func writebarrieriface(dst *[2]uintptr, src [2]uintptr) { + dst[0] = src[0] + dst[1] = src[1] +} + +//go:nosplit +func writebarrierfat2(dst *[2]uintptr, _ *byte, src [2]uintptr) { + dst[0] = src[0] + dst[1] = src[1] +} + +//go:nosplit +func writebarrierfat3(dst *[3]uintptr, _ *byte, src [3]uintptr) { + dst[0] = src[0] + dst[1] = src[1] + dst[2] = src[2] +} + +//go:nosplit +func writebarrierfat4(dst *[4]uintptr, _ *byte, src [4]uintptr) { + dst[0] = src[0] + dst[1] = src[1] + dst[2] = src[2] + dst[3] = src[3] +} + +//go:nosplit +func writebarrierfat(typ *_type, dst, src unsafe.Pointer) { + memmove(dst, src, typ.size) +} diff --git a/src/runtime/mgc0.h b/src/runtime/mgc0.h new file mode 100644 index 000000000..64f818914 --- /dev/null +++ b/src/runtime/mgc0.h @@ -0,0 +1,78 @@ +// Copyright 2012 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// Garbage collector (GC) + +enum { + // Four bits per word (see #defines below). + gcBits = 4, + wordsPerBitmapByte = 8/gcBits, + + // GC type info programs. + // The programs allow to store type info required for GC in a compact form. + // Most importantly arrays take O(1) space instead of O(n). 
+ // The program grammar is: + // + // Program = {Block} "insEnd" + // Block = Data | Array + // Data = "insData" DataSize DataBlock + // DataSize = int // size of the DataBlock in bit pairs, 1 byte + // DataBlock = binary // dense GC mask (2 bits per word) of size ]DataSize/4[ bytes + // Array = "insArray" ArrayLen Block "insArrayEnd" + // ArrayLen = int // length of the array, 8 bytes (4 bytes for 32-bit arch) + // + // Each instruction (insData, insArray, etc) is 1 byte. + // For example, for type struct { x []byte; y [20]struct{ z int; w *byte }; } + // the program looks as: + // + // insData 3 (BitsMultiWord BitsSlice BitsScalar) + // insArray 20 insData 2 (BitsScalar BitsPointer) insArrayEnd insEnd + // + // Total size of the program is 17 bytes (13 bytes on 32-bits). + // The corresponding GC mask would take 43 bytes (it would be repeated + // because the type has odd number of words). + insData = 1, + insArray, + insArrayEnd, + insEnd, + + // Pointer map + BitsPerPointer = 2, + BitsMask = (1<<BitsPerPointer)-1, + PointersPerByte = 8/BitsPerPointer, + + // If you change these, also change scanblock. + // scanblock does "if(bits == BitsScalar || bits == BitsDead)" as "if(bits <= BitsScalar)". + BitsDead = 0, + BitsScalar = 1, + BitsPointer = 2, + BitsMultiWord = 3, + // BitsMultiWord will be set for the first word of a multi-word item. + // When it is set, one of the following will be set for the second word. + // NOT USED ANYMORE: BitsString = 0, + // NOT USED ANYMORE: BitsSlice = 1, + BitsIface = 2, + BitsEface = 3, + + // 64 bytes cover objects of size 1024/512 on 64/32 bits, respectively. + MaxGCMask = 64, +}; + +// Bits in per-word bitmap. +// #defines because we shift the values beyond 32 bits. +// +// Each word in the bitmap describes wordsPerBitmapWord words +// of heap memory. There are 4 bitmap bits dedicated to each heap word, +// so on a 64-bit system there is one bitmap word per 16 heap words. +// +// The bitmap starts at mheap.arena_start and extends *backward* from +// there. On a 64-bit system the off'th word in the arena is tracked by +// the off/16+1'th word before mheap.arena_start. (On a 32-bit system, +// the only difference is that the divisor is 8.) +enum { + bitBoundary = 1, // boundary of an object + bitMarked = 2, // marked object + bitMask = bitBoundary | bitMarked, + bitPtrMask = BitsMask<<2, +}; diff --git a/src/runtime/mheap.c b/src/runtime/mheap.c new file mode 100644 index 000000000..bb203d5ce --- /dev/null +++ b/src/runtime/mheap.c @@ -0,0 +1,889 @@ +// Copyright 2009 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// Page heap. +// +// See malloc.h for overview. +// +// When a MSpan is in the heap free list, state == MSpanFree +// and heapmap(s->start) == span, heapmap(s->start+s->npages-1) == span. +// +// When a MSpan is allocated, state == MSpanInUse or MSpanStack +// and heapmap(i) == span for all s->start <= i < s->start+s->npages. 
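The heapmap mentioned above is the h->spans array, one entry per arena page: shifting a pointer's offset from arena_start by PageShift selects the owning MSpan, which is how scanblock earlier recovered block beginnings from interior pointers. A small model with an assumed 8K page size (the real constant lives in malloc.h) and invented addresses:

package main

import "fmt"

const pageShift = 13 // assume 8K pages; the real constant comes from malloc.h

type mspan struct {
	start  uintptr // first page number
	npages uintptr
}

// spanOf maps an interior pointer to its span via the page-indexed table,
// mirroring mheap.spans[(p-arena_start)>>PageShift].
func spanOf(spans []*mspan, arenaStart, p uintptr) *mspan {
	return spans[(p-arenaStart)>>pageShift]
}

func main() {
	arenaStart := uintptr(0xc000000000)
	spans := make([]*mspan, 8) // 8 pages of arena
	s := &mspan{start: arenaStart>>pageShift + 2, npages: 3}
	for i := uintptr(0); i < s.npages; i++ {
		spans[2+i] = s // an in-use span owns every one of its pages
	}
	p := arenaStart + 3<<pageShift + 512 // pointer into the span's middle page
	fmt.Println(spanOf(spans, arenaStart, p) == s) // true
}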
+ +#include "runtime.h" +#include "arch_GOARCH.h" +#include "malloc.h" + +static MSpan *MHeap_AllocSpanLocked(MHeap*, uintptr); +static void MHeap_FreeSpanLocked(MHeap*, MSpan*, bool, bool); +static bool MHeap_Grow(MHeap*, uintptr); +static MSpan *MHeap_AllocLarge(MHeap*, uintptr); +static MSpan *BestFit(MSpan*, uintptr, MSpan*); + +static void +RecordSpan(void *vh, byte *p) +{ + MHeap *h; + MSpan *s; + MSpan **all; + uint32 cap; + + h = vh; + s = (MSpan*)p; + if(h->nspan >= h->nspancap) { + cap = 64*1024/sizeof(all[0]); + if(cap < h->nspancap*3/2) + cap = h->nspancap*3/2; + all = (MSpan**)runtime·sysAlloc(cap*sizeof(all[0]), &mstats.other_sys); + if(all == nil) + runtime·throw("runtime: cannot allocate memory"); + if(h->allspans) { + runtime·memmove(all, h->allspans, h->nspancap*sizeof(all[0])); + // Don't free the old array if it's referenced by sweep. + // See the comment in mgc0.c. + if(h->allspans != runtime·mheap.gcspans) + runtime·SysFree(h->allspans, h->nspancap*sizeof(all[0]), &mstats.other_sys); + } + h->allspans = all; + h->nspancap = cap; + } + h->allspans[h->nspan++] = s; +} + +// Initialize the heap; fetch memory using alloc. +void +runtime·MHeap_Init(MHeap *h) +{ + uint32 i; + + runtime·FixAlloc_Init(&h->spanalloc, sizeof(MSpan), RecordSpan, h, &mstats.mspan_sys); + runtime·FixAlloc_Init(&h->cachealloc, sizeof(MCache), nil, nil, &mstats.mcache_sys); + runtime·FixAlloc_Init(&h->specialfinalizeralloc, sizeof(SpecialFinalizer), nil, nil, &mstats.other_sys); + runtime·FixAlloc_Init(&h->specialprofilealloc, sizeof(SpecialProfile), nil, nil, &mstats.other_sys); + // h->mapcache needs no init + for(i=0; i<nelem(h->free); i++) { + runtime·MSpanList_Init(&h->free[i]); + runtime·MSpanList_Init(&h->busy[i]); + } + runtime·MSpanList_Init(&h->freelarge); + runtime·MSpanList_Init(&h->busylarge); + for(i=0; i<nelem(h->central); i++) + runtime·MCentral_Init(&h->central[i].mcentral, i); +} + +void +runtime·MHeap_MapSpans(MHeap *h) +{ + uintptr n; + + // Map spans array, PageSize at a time. + n = (uintptr)h->arena_used; + n -= (uintptr)h->arena_start; + n = n / PageSize * sizeof(h->spans[0]); + n = ROUND(n, PhysPageSize); + if(h->spans_mapped >= n) + return; + runtime·SysMap((byte*)h->spans + h->spans_mapped, n - h->spans_mapped, h->arena_reserved, &mstats.other_sys); + h->spans_mapped = n; +} + +// Sweeps spans in list until reclaims at least npages into heap. +// Returns the actual number of pages reclaimed. +static uintptr +MHeap_ReclaimList(MHeap *h, MSpan *list, uintptr npages) +{ + MSpan *s; + uintptr n; + uint32 sg; + + n = 0; + sg = runtime·mheap.sweepgen; +retry: + for(s = list->next; s != list; s = s->next) { + if(s->sweepgen == sg-2 && runtime·cas(&s->sweepgen, sg-2, sg-1)) { + runtime·MSpanList_Remove(s); + // swept spans are at the end of the list + runtime·MSpanList_InsertBack(list, s); + runtime·unlock(&h->lock); + n += runtime·MSpan_Sweep(s, false); + runtime·lock(&h->lock); + if(n >= npages) + return n; + // the span could have been moved elsewhere + goto retry; + } + if(s->sweepgen == sg-1) { + // the span is being sweept by background sweeper, skip + continue; + } + // already swept empty span, + // all subsequent ones must also be either swept or in process of sweeping + break; + } + return n; +} + +// Sweeps and reclaims at least npage pages into heap. +// Called before allocating npage pages. 
+static void +MHeap_Reclaim(MHeap *h, uintptr npage) +{ + uintptr reclaimed, n; + + // First try to sweep busy spans with large objects of size >= npage, + // this has good chances of reclaiming the necessary space. + for(n=npage; n < nelem(h->busy); n++) { + if(MHeap_ReclaimList(h, &h->busy[n], npage)) + return; // Bingo! + } + + // Then -- even larger objects. + if(MHeap_ReclaimList(h, &h->busylarge, npage)) + return; // Bingo! + + // Now try smaller objects. + // One such object is not enough, so we need to reclaim several of them. + reclaimed = 0; + for(n=0; n < npage && n < nelem(h->busy); n++) { + reclaimed += MHeap_ReclaimList(h, &h->busy[n], npage-reclaimed); + if(reclaimed >= npage) + return; + } + + // Now sweep everything that is not yet swept. + runtime·unlock(&h->lock); + for(;;) { + n = runtime·sweepone(); + if(n == -1) // all spans are swept + break; + reclaimed += n; + if(reclaimed >= npage) + break; + } + runtime·lock(&h->lock); +} + +// Allocate a new span of npage pages from the heap for GC'd memory +// and record its size class in the HeapMap and HeapMapCache. +static MSpan* +mheap_alloc(MHeap *h, uintptr npage, int32 sizeclass, bool large) +{ + MSpan *s; + + if(g != g->m->g0) + runtime·throw("mheap_alloc not on M stack"); + runtime·lock(&h->lock); + + // To prevent excessive heap growth, before allocating n pages + // we need to sweep and reclaim at least n pages. + if(!h->sweepdone) + MHeap_Reclaim(h, npage); + + // transfer stats from cache to global + mstats.heap_alloc += g->m->mcache->local_cachealloc; + g->m->mcache->local_cachealloc = 0; + mstats.tinyallocs += g->m->mcache->local_tinyallocs; + g->m->mcache->local_tinyallocs = 0; + + s = MHeap_AllocSpanLocked(h, npage); + if(s != nil) { + // Record span info, because gc needs to be + // able to map interior pointer to containing span. + runtime·atomicstore(&s->sweepgen, h->sweepgen); + s->state = MSpanInUse; + s->freelist = nil; + s->ref = 0; + s->sizeclass = sizeclass; + s->elemsize = (sizeclass==0 ? s->npages<<PageShift : runtime·class_to_size[sizeclass]); + + // update stats, sweep lists + if(large) { + mstats.heap_objects++; + mstats.heap_alloc += npage<<PageShift; + // Swept spans are at the end of lists. + if(s->npages < nelem(h->free)) + runtime·MSpanList_InsertBack(&h->busy[s->npages], s); + else + runtime·MSpanList_InsertBack(&h->busylarge, s); + } + } + runtime·unlock(&h->lock); + return s; +} + +static void +mheap_alloc_m(G *gp) +{ + MHeap *h; + MSpan *s; + + h = g->m->ptrarg[0]; + g->m->ptrarg[0] = nil; + s = mheap_alloc(h, g->m->scalararg[0], g->m->scalararg[1], g->m->scalararg[2]); + g->m->ptrarg[0] = s; + + runtime·gogo(&gp->sched); +} + +MSpan* +runtime·MHeap_Alloc(MHeap *h, uintptr npage, int32 sizeclass, bool large, bool needzero) +{ + MSpan *s; + void (*fn)(G*); + + // Don't do any operations that lock the heap on the G stack. + // It might trigger stack growth, and the stack growth code needs + // to be able to allocate heap. 
+ if(g == g->m->g0) { + s = mheap_alloc(h, npage, sizeclass, large); + } else { + g->m->ptrarg[0] = h; + g->m->scalararg[0] = npage; + g->m->scalararg[1] = sizeclass; + g->m->scalararg[2] = large; + fn = mheap_alloc_m; + runtime·mcall(&fn); + s = g->m->ptrarg[0]; + g->m->ptrarg[0] = nil; + } + if(s != nil) { + if(needzero && s->needzero) + runtime·memclr((byte*)(s->start<<PageShift), s->npages<<PageShift); + s->needzero = 0; + } + return s; +} + +MSpan* +runtime·MHeap_AllocStack(MHeap *h, uintptr npage) +{ + MSpan *s; + + if(g != g->m->g0) + runtime·throw("mheap_allocstack not on M stack"); + runtime·lock(&h->lock); + s = MHeap_AllocSpanLocked(h, npage); + if(s != nil) { + s->state = MSpanStack; + s->freelist = nil; + s->ref = 0; + mstats.stacks_inuse += s->npages<<PageShift; + } + runtime·unlock(&h->lock); + return s; +} + +// Allocates a span of the given size. h must be locked. +// The returned span has been removed from the +// free list, but its state is still MSpanFree. +static MSpan* +MHeap_AllocSpanLocked(MHeap *h, uintptr npage) +{ + uintptr n; + MSpan *s, *t; + pageID p; + + // Try in fixed-size lists up to max. + for(n=npage; n < nelem(h->free); n++) { + if(!runtime·MSpanList_IsEmpty(&h->free[n])) { + s = h->free[n].next; + goto HaveSpan; + } + } + + // Best fit in list of large spans. + if((s = MHeap_AllocLarge(h, npage)) == nil) { + if(!MHeap_Grow(h, npage)) + return nil; + if((s = MHeap_AllocLarge(h, npage)) == nil) + return nil; + } + +HaveSpan: + // Mark span in use. + if(s->state != MSpanFree) + runtime·throw("MHeap_AllocLocked - MSpan not free"); + if(s->npages < npage) + runtime·throw("MHeap_AllocLocked - bad npages"); + runtime·MSpanList_Remove(s); + if(s->next != nil || s->prev != nil) + runtime·throw("still in list"); + if(s->npreleased > 0) { + runtime·SysUsed((void*)(s->start<<PageShift), s->npages<<PageShift); + mstats.heap_released -= s->npreleased<<PageShift; + s->npreleased = 0; + } + + if(s->npages > npage) { + // Trim extra and put it back in the heap. + t = runtime·FixAlloc_Alloc(&h->spanalloc); + runtime·MSpan_Init(t, s->start + npage, s->npages - npage); + s->npages = npage; + p = t->start; + p -= ((uintptr)h->arena_start>>PageShift); + if(p > 0) + h->spans[p-1] = s; + h->spans[p] = t; + h->spans[p+t->npages-1] = t; + t->needzero = s->needzero; + s->state = MSpanStack; // prevent coalescing with s + t->state = MSpanStack; + MHeap_FreeSpanLocked(h, t, false, false); + t->unusedsince = s->unusedsince; // preserve age (TODO: wrong: t is possibly merged and/or deallocated at this point) + s->state = MSpanFree; + } + s->unusedsince = 0; + + p = s->start; + p -= ((uintptr)h->arena_start>>PageShift); + for(n=0; n<npage; n++) + h->spans[p+n] = s; + + mstats.heap_inuse += npage<<PageShift; + mstats.heap_idle -= npage<<PageShift; + + //runtime·printf("spanalloc %p\n", s->start << PageShift); + if(s->next != nil || s->prev != nil) + runtime·throw("still in list"); + return s; +} + +// Allocate a span of exactly npage pages from the list of large spans. +static MSpan* +MHeap_AllocLarge(MHeap *h, uintptr npage) +{ + return BestFit(&h->freelarge, npage, nil); +} + +// Search list for smallest span with >= npage pages. +// If there are multiple smallest spans, take the one +// with the earliest starting address. 
+static MSpan* +BestFit(MSpan *list, uintptr npage, MSpan *best) +{ + MSpan *s; + + for(s=list->next; s != list; s=s->next) { + if(s->npages < npage) + continue; + if(best == nil + || s->npages < best->npages + || (s->npages == best->npages && s->start < best->start)) + best = s; + } + return best; +} + +// Try to add at least npage pages of memory to the heap, +// returning whether it worked. +static bool +MHeap_Grow(MHeap *h, uintptr npage) +{ + uintptr ask; + void *v; + MSpan *s; + pageID p; + + // Ask for a big chunk, to reduce the number of mappings + // the operating system needs to track; also amortizes + // the overhead of an operating system mapping. + // Allocate a multiple of 64kB. + npage = ROUND(npage, (64<<10)/PageSize); + ask = npage<<PageShift; + if(ask < HeapAllocChunk) + ask = HeapAllocChunk; + + v = runtime·MHeap_SysAlloc(h, ask); + if(v == nil) { + if(ask > (npage<<PageShift)) { + ask = npage<<PageShift; + v = runtime·MHeap_SysAlloc(h, ask); + } + if(v == nil) { + runtime·printf("runtime: out of memory: cannot allocate %D-byte block (%D in use)\n", (uint64)ask, mstats.heap_sys); + return false; + } + } + + // Create a fake "in use" span and free it, so that the + // right coalescing happens. + s = runtime·FixAlloc_Alloc(&h->spanalloc); + runtime·MSpan_Init(s, (uintptr)v>>PageShift, ask>>PageShift); + p = s->start; + p -= ((uintptr)h->arena_start>>PageShift); + h->spans[p] = s; + h->spans[p + s->npages - 1] = s; + runtime·atomicstore(&s->sweepgen, h->sweepgen); + s->state = MSpanInUse; + MHeap_FreeSpanLocked(h, s, false, true); + return true; +} + +// Look up the span at the given address. +// Address is guaranteed to be in map +// and is guaranteed to be start or end of span. +MSpan* +runtime·MHeap_Lookup(MHeap *h, void *v) +{ + uintptr p; + + p = (uintptr)v; + p -= (uintptr)h->arena_start; + return h->spans[p >> PageShift]; +} + +// Look up the span at the given address. +// Address is *not* guaranteed to be in map +// and may be anywhere in the span. +// Map entries for the middle of a span are only +// valid for allocated spans. Free spans may have +// other garbage in their middles, so we have to +// check for that. +MSpan* +runtime·MHeap_LookupMaybe(MHeap *h, void *v) +{ + MSpan *s; + pageID p, q; + + if((byte*)v < h->arena_start || (byte*)v >= h->arena_used) + return nil; + p = (uintptr)v>>PageShift; + q = p; + q -= (uintptr)h->arena_start >> PageShift; + s = h->spans[q]; + if(s == nil || p < s->start || v >= s->limit || s->state != MSpanInUse) + return nil; + return s; +} + +// Free the span back into the heap. 
+static void +mheap_free(MHeap *h, MSpan *s, int32 acct) +{ + if(g != g->m->g0) + runtime·throw("mheap_free not on M stack"); + runtime·lock(&h->lock); + mstats.heap_alloc += g->m->mcache->local_cachealloc; + g->m->mcache->local_cachealloc = 0; + mstats.tinyallocs += g->m->mcache->local_tinyallocs; + g->m->mcache->local_tinyallocs = 0; + if(acct) { + mstats.heap_alloc -= s->npages<<PageShift; + mstats.heap_objects--; + } + MHeap_FreeSpanLocked(h, s, true, true); + runtime·unlock(&h->lock); +} + +static void +mheap_free_m(G *gp) +{ + MHeap *h; + MSpan *s; + + h = g->m->ptrarg[0]; + s = g->m->ptrarg[1]; + g->m->ptrarg[0] = nil; + g->m->ptrarg[1] = nil; + mheap_free(h, s, g->m->scalararg[0]); + runtime·gogo(&gp->sched); +} + +void +runtime·MHeap_Free(MHeap *h, MSpan *s, int32 acct) +{ + void (*fn)(G*); + + if(g == g->m->g0) { + mheap_free(h, s, acct); + } else { + g->m->ptrarg[0] = h; + g->m->ptrarg[1] = s; + g->m->scalararg[0] = acct; + fn = mheap_free_m; + runtime·mcall(&fn); + } +} + +void +runtime·MHeap_FreeStack(MHeap *h, MSpan *s) +{ + if(g != g->m->g0) + runtime·throw("mheap_freestack not on M stack"); + s->needzero = 1; + runtime·lock(&h->lock); + mstats.stacks_inuse -= s->npages<<PageShift; + MHeap_FreeSpanLocked(h, s, true, true); + runtime·unlock(&h->lock); +} + +static void +MHeap_FreeSpanLocked(MHeap *h, MSpan *s, bool acctinuse, bool acctidle) +{ + MSpan *t; + pageID p; + + switch(s->state) { + case MSpanStack: + if(s->ref != 0) + runtime·throw("MHeap_FreeSpanLocked - invalid stack free"); + break; + case MSpanInUse: + if(s->ref != 0 || s->sweepgen != h->sweepgen) { + runtime·printf("MHeap_FreeSpanLocked - span %p ptr %p ref %d sweepgen %d/%d\n", + s, s->start<<PageShift, s->ref, s->sweepgen, h->sweepgen); + runtime·throw("MHeap_FreeSpanLocked - invalid free"); + } + break; + default: + runtime·throw("MHeap_FreeSpanLocked - invalid span state"); + break; + } + if(acctinuse) + mstats.heap_inuse -= s->npages<<PageShift; + if(acctidle) + mstats.heap_idle += s->npages<<PageShift; + s->state = MSpanFree; + runtime·MSpanList_Remove(s); + // Stamp newly unused spans. The scavenger will use that + // info to potentially give back some pages to the OS. + s->unusedsince = runtime·nanotime(); + s->npreleased = 0; + + // Coalesce with earlier, later spans. + p = s->start; + p -= (uintptr)h->arena_start >> PageShift; + if(p > 0 && (t = h->spans[p-1]) != nil && t->state != MSpanInUse && t->state != MSpanStack) { + s->start = t->start; + s->npages += t->npages; + s->npreleased = t->npreleased; // absorb released pages + s->needzero |= t->needzero; + p -= t->npages; + h->spans[p] = s; + runtime·MSpanList_Remove(t); + t->state = MSpanDead; + runtime·FixAlloc_Free(&h->spanalloc, t); + } + if((p+s->npages)*sizeof(h->spans[0]) < h->spans_mapped && (t = h->spans[p+s->npages]) != nil && t->state != MSpanInUse && t->state != MSpanStack) { + s->npages += t->npages; + s->npreleased += t->npreleased; + s->needzero |= t->needzero; + h->spans[p + s->npages - 1] = s; + runtime·MSpanList_Remove(t); + t->state = MSpanDead; + runtime·FixAlloc_Free(&h->spanalloc, t); + } + + // Insert s into appropriate list. 
+ if(s->npages < nelem(h->free)) + runtime·MSpanList_Insert(&h->free[s->npages], s); + else + runtime·MSpanList_Insert(&h->freelarge, s); +} + +static uintptr +scavengelist(MSpan *list, uint64 now, uint64 limit) +{ + uintptr released, sumreleased; + MSpan *s; + + if(runtime·MSpanList_IsEmpty(list)) + return 0; + + sumreleased = 0; + for(s=list->next; s != list; s=s->next) { + if((now - s->unusedsince) > limit && s->npreleased != s->npages) { + released = (s->npages - s->npreleased) << PageShift; + mstats.heap_released += released; + sumreleased += released; + s->npreleased = s->npages; + runtime·SysUnused((void*)(s->start << PageShift), s->npages << PageShift); + } + } + return sumreleased; +} + +void +runtime·MHeap_Scavenge(int32 k, uint64 now, uint64 limit) +{ + uint32 i; + uintptr sumreleased; + MHeap *h; + + h = &runtime·mheap; + runtime·lock(&h->lock); + sumreleased = 0; + for(i=0; i < nelem(h->free); i++) + sumreleased += scavengelist(&h->free[i], now, limit); + sumreleased += scavengelist(&h->freelarge, now, limit); + runtime·unlock(&h->lock); + + if(runtime·debug.gctrace > 0) { + if(sumreleased > 0) + runtime·printf("scvg%d: %D MB released\n", k, (uint64)sumreleased>>20); + // TODO(dvyukov): these stats are incorrect as we don't subtract stack usage from heap. + // But we can't call ReadMemStats on g0 holding locks. + runtime·printf("scvg%d: inuse: %D, idle: %D, sys: %D, released: %D, consumed: %D (MB)\n", + k, mstats.heap_inuse>>20, mstats.heap_idle>>20, mstats.heap_sys>>20, + mstats.heap_released>>20, (mstats.heap_sys - mstats.heap_released)>>20); + } +} + +void +runtime·scavenge_m(void) +{ + runtime·MHeap_Scavenge(-1, ~(uintptr)0, 0); +} + +// Initialize a new span with the given start and npages. +void +runtime·MSpan_Init(MSpan *span, pageID start, uintptr npages) +{ + span->next = nil; + span->prev = nil; + span->start = start; + span->npages = npages; + span->freelist = nil; + span->ref = 0; + span->sizeclass = 0; + span->incache = false; + span->elemsize = 0; + span->state = MSpanDead; + span->unusedsince = 0; + span->npreleased = 0; + span->specialLock.key = 0; + span->specials = nil; + span->needzero = 0; +} + +// Initialize an empty doubly-linked list. +void +runtime·MSpanList_Init(MSpan *list) +{ + list->state = MSpanListHead; + list->next = list; + list->prev = list; +} + +void +runtime·MSpanList_Remove(MSpan *span) +{ + if(span->prev == nil && span->next == nil) + return; + span->prev->next = span->next; + span->next->prev = span->prev; + span->prev = nil; + span->next = nil; +} + +bool +runtime·MSpanList_IsEmpty(MSpan *list) +{ + return list->next == list; +} + +void +runtime·MSpanList_Insert(MSpan *list, MSpan *span) +{ + if(span->next != nil || span->prev != nil) { + runtime·printf("failed MSpanList_Insert %p %p %p\n", span, span->next, span->prev); + runtime·throw("MSpanList_Insert"); + } + span->next = list->next; + span->prev = list; + span->next->prev = span; + span->prev->next = span; +} + +void +runtime·MSpanList_InsertBack(MSpan *list, MSpan *span) +{ + if(span->next != nil || span->prev != nil) { + runtime·printf("failed MSpanList_Insert %p %p %p\n", span, span->next, span->prev); + runtime·throw("MSpanList_Insert"); + } + span->next = list; + span->prev = list->prev; + span->next->prev = span; + span->prev->next = span; +} + +// Adds the special record s to the list of special records for +// the object p. All fields of s should be filled in except for +// offset & next, which this routine will fill in. 
+// Returns true if the special was successfully added, false otherwise. +// (The add will fail only if a record with the same p and s->kind +// already exists.) +static bool +addspecial(void *p, Special *s) +{ + MSpan *span; + Special **t, *x; + uintptr offset; + byte kind; + + span = runtime·MHeap_LookupMaybe(&runtime·mheap, p); + if(span == nil) + runtime·throw("addspecial on invalid pointer"); + + // Ensure that the span is swept. + // GC accesses specials list w/o locks. And it's just much safer. + g->m->locks++; + runtime·MSpan_EnsureSwept(span); + + offset = (uintptr)p - (span->start << PageShift); + kind = s->kind; + + runtime·lock(&span->specialLock); + + // Find splice point, check for existing record. + t = &span->specials; + while((x = *t) != nil) { + if(offset == x->offset && kind == x->kind) { + runtime·unlock(&span->specialLock); + g->m->locks--; + return false; // already exists + } + if(offset < x->offset || (offset == x->offset && kind < x->kind)) + break; + t = &x->next; + } + // Splice in record, fill in offset. + s->offset = offset; + s->next = x; + *t = s; + runtime·unlock(&span->specialLock); + g->m->locks--; + return true; +} + +// Removes the Special record of the given kind for the object p. +// Returns the record if the record existed, nil otherwise. +// The caller must FixAlloc_Free the result. +static Special* +removespecial(void *p, byte kind) +{ + MSpan *span; + Special *s, **t; + uintptr offset; + + span = runtime·MHeap_LookupMaybe(&runtime·mheap, p); + if(span == nil) + runtime·throw("removespecial on invalid pointer"); + + // Ensure that the span is swept. + // GC accesses specials list w/o locks. And it's just much safer. + g->m->locks++; + runtime·MSpan_EnsureSwept(span); + + offset = (uintptr)p - (span->start << PageShift); + + runtime·lock(&span->specialLock); + t = &span->specials; + while((s = *t) != nil) { + // This function is used for finalizers only, so we don't check for + // "interior" specials (p must be exactly equal to s->offset). + if(offset == s->offset && kind == s->kind) { + *t = s->next; + runtime·unlock(&span->specialLock); + g->m->locks--; + return s; + } + t = &s->next; + } + runtime·unlock(&span->specialLock); + g->m->locks--; + return nil; +} + +// Adds a finalizer to the object p. Returns true if it succeeded. +bool +runtime·addfinalizer(void *p, FuncVal *f, uintptr nret, Type *fint, PtrType *ot) +{ + SpecialFinalizer *s; + + runtime·lock(&runtime·mheap.speciallock); + s = runtime·FixAlloc_Alloc(&runtime·mheap.specialfinalizeralloc); + runtime·unlock(&runtime·mheap.speciallock); + s->special.kind = KindSpecialFinalizer; + s->fn = f; + s->nret = nret; + s->fint = fint; + s->ot = ot; + if(addspecial(p, &s->special)) + return true; + + // There was an old finalizer + runtime·lock(&runtime·mheap.speciallock); + runtime·FixAlloc_Free(&runtime·mheap.specialfinalizeralloc, s); + runtime·unlock(&runtime·mheap.speciallock); + return false; +} + +// Removes the finalizer (if any) from the object p. +void +runtime·removefinalizer(void *p) +{ + SpecialFinalizer *s; + + s = (SpecialFinalizer*)removespecial(p, KindSpecialFinalizer); + if(s == nil) + return; // there wasn't a finalizer to remove + runtime·lock(&runtime·mheap.speciallock); + runtime·FixAlloc_Free(&runtime·mheap.specialfinalizeralloc, s); + runtime·unlock(&runtime·mheap.speciallock); +} + +// Set the heap profile bucket associated with addr to b. 
+void +runtime·setprofilebucket_m(void) +{ + void *p; + Bucket *b; + SpecialProfile *s; + + p = g->m->ptrarg[0]; + b = g->m->ptrarg[1]; + g->m->ptrarg[0] = nil; + g->m->ptrarg[1] = nil; + + runtime·lock(&runtime·mheap.speciallock); + s = runtime·FixAlloc_Alloc(&runtime·mheap.specialprofilealloc); + runtime·unlock(&runtime·mheap.speciallock); + s->special.kind = KindSpecialProfile; + s->b = b; + if(!addspecial(p, &s->special)) + runtime·throw("setprofilebucket: profile already set"); +} + +// Do whatever cleanup needs to be done to deallocate s. It has +// already been unlinked from the MSpan specials list. +// Returns true if we should keep working on deallocating p. +bool +runtime·freespecial(Special *s, void *p, uintptr size, bool freed) +{ + SpecialFinalizer *sf; + SpecialProfile *sp; + + switch(s->kind) { + case KindSpecialFinalizer: + sf = (SpecialFinalizer*)s; + runtime·queuefinalizer(p, sf->fn, sf->nret, sf->fint, sf->ot); + runtime·lock(&runtime·mheap.speciallock); + runtime·FixAlloc_Free(&runtime·mheap.specialfinalizeralloc, sf); + runtime·unlock(&runtime·mheap.speciallock); + return false; // don't free p until finalizer is done + case KindSpecialProfile: + sp = (SpecialProfile*)s; + runtime·mProf_Free(sp->b, size, freed); + runtime·lock(&runtime·mheap.speciallock); + runtime·FixAlloc_Free(&runtime·mheap.specialprofilealloc, sp); + runtime·unlock(&runtime·mheap.speciallock); + return true; + default: + runtime·throw("bad special kind"); + return true; + } +} diff --git a/src/runtime/mknacl.sh b/src/runtime/mknacl.sh new file mode 100644 index 000000000..47fb7bd88 --- /dev/null +++ b/src/runtime/mknacl.sh @@ -0,0 +1,15 @@ +#!/bin/bash +# Copyright 2013 The Go Authors. All rights reserved. +# Use of this source code is governed by a BSD-style +# license that can be found in the LICENSE file. + +cat /Users/rsc/pub/native_client/src/trusted/service_runtime/include/bits/nacl_syscalls.h | + awk ' + BEGIN { + printf("// generated by mknacl.sh - do not edit\n") + } + NF==3 && $1=="#define" && $2~/^NACL_sys_/ { + name=$2 + sub(/^NACL_sys_/, "SYS_", name) + printf("#define %s %s\n", name, $3) + }' >syscall_nacl.h diff --git a/src/runtime/mprof.go b/src/runtime/mprof.go new file mode 100644 index 000000000..d409c6c30 --- /dev/null +++ b/src/runtime/mprof.go @@ -0,0 +1,672 @@ +// Copyright 2009 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// Malloc profiling. +// Patterned after tcmalloc's algorithms; shorter code. + +package runtime + +import ( + "unsafe" +) + +// NOTE(rsc): Everything here could use cas if contention became an issue. +var proflock mutex + +// All memory allocations are local and do not escape outside of the profiler. +// The profiler is forbidden from referring to garbage-collected memory. + +const ( + // profile types + memProfile bucketType = 1 + iota + blockProfile + + // size of bucket hash table + buckHashSize = 179999 + + // max depth of stack to record in bucket + maxStack = 32 +) + +type bucketType int + +// A bucket holds per-call-stack profiling information. +// The representation is a bit sleazy, inherited from C. +// This struct defines the bucket header. It is followed in +// memory by the stack words and then the actual record +// data, either a memRecord or a blockRecord. +// +// Per-call-stack profiling information. +// Lookup by hashing call stack into a linked-list hash table. 
+type bucket struct { + next *bucket + allnext *bucket + typ bucketType // memBucket or blockBucket + hash uintptr + size uintptr + nstk uintptr +} + +// A memRecord is the bucket data for a bucket of type memProfile, +// part of the memory profile. +type memRecord struct { + // The following complex 3-stage scheme of stats accumulation + // is required to obtain a consistent picture of mallocs and frees + // for some point in time. + // The problem is that mallocs come in real time, while frees + // come only after a GC during concurrent sweeping. So if we would + // naively count them, we would get a skew toward mallocs. + // + // Mallocs are accounted in recent stats. + // Explicit frees are accounted in recent stats. + // GC frees are accounted in prev stats. + // After GC prev stats are added to final stats and + // recent stats are moved into prev stats. + allocs uintptr + frees uintptr + alloc_bytes uintptr + free_bytes uintptr + + // changes between next-to-last GC and last GC + prev_allocs uintptr + prev_frees uintptr + prev_alloc_bytes uintptr + prev_free_bytes uintptr + + // changes since last GC + recent_allocs uintptr + recent_frees uintptr + recent_alloc_bytes uintptr + recent_free_bytes uintptr +} + +// A blockRecord is the bucket data for a bucket of type blockProfile, +// part of the blocking profile. +type blockRecord struct { + count int64 + cycles int64 +} + +var ( + mbuckets *bucket // memory profile buckets + bbuckets *bucket // blocking profile buckets + buckhash *[179999]*bucket + bucketmem uintptr +) + +// newBucket allocates a bucket with the given type and number of stack entries. +func newBucket(typ bucketType, nstk int) *bucket { + size := unsafe.Sizeof(bucket{}) + uintptr(nstk)*unsafe.Sizeof(uintptr(0)) + switch typ { + default: + gothrow("invalid profile bucket type") + case memProfile: + size += unsafe.Sizeof(memRecord{}) + case blockProfile: + size += unsafe.Sizeof(blockRecord{}) + } + + b := (*bucket)(persistentalloc(size, 0, &memstats.buckhash_sys)) + bucketmem += size + b.typ = typ + b.nstk = uintptr(nstk) + return b +} + +// stk returns the slice in b holding the stack. +func (b *bucket) stk() []uintptr { + stk := (*[maxStack]uintptr)(add(unsafe.Pointer(b), unsafe.Sizeof(*b))) + return stk[:b.nstk:b.nstk] +} + +// mp returns the memRecord associated with the memProfile bucket b. +func (b *bucket) mp() *memRecord { + if b.typ != memProfile { + gothrow("bad use of bucket.mp") + } + data := add(unsafe.Pointer(b), unsafe.Sizeof(*b)+b.nstk*unsafe.Sizeof(uintptr(0))) + return (*memRecord)(data) +} + +// bp returns the blockRecord associated with the blockProfile bucket b. +func (b *bucket) bp() *blockRecord { + if b.typ != blockProfile { + gothrow("bad use of bucket.bp") + } + data := add(unsafe.Pointer(b), unsafe.Sizeof(*b)+b.nstk*unsafe.Sizeof(uintptr(0))) + return (*blockRecord)(data) +} + +// Return the bucket for stk[0:nstk], allocating new bucket if needed. +func stkbucket(typ bucketType, size uintptr, stk []uintptr, alloc bool) *bucket { + if buckhash == nil { + buckhash = (*[buckHashSize]*bucket)(sysAlloc(unsafe.Sizeof(*buckhash), &memstats.buckhash_sys)) + if buckhash == nil { + gothrow("runtime: cannot allocate memory") + } + } + + // Hash stack. 
+ var h uintptr + for _, pc := range stk { + h += pc + h += h << 10 + h ^= h >> 6 + } + // hash in size + h += size + h += h << 10 + h ^= h >> 6 + // finalize + h += h << 3 + h ^= h >> 11 + + i := int(h % buckHashSize) + for b := buckhash[i]; b != nil; b = b.next { + if b.typ == typ && b.hash == h && b.size == size && eqslice(b.stk(), stk) { + return b + } + } + + if !alloc { + return nil + } + + // Create new bucket. + b := newBucket(typ, len(stk)) + copy(b.stk(), stk) + b.hash = h + b.size = size + b.next = buckhash[i] + buckhash[i] = b + if typ == memProfile { + b.allnext = mbuckets + mbuckets = b + } else { + b.allnext = bbuckets + bbuckets = b + } + return b +} + +func sysAlloc(n uintptr, stat *uint64) unsafe.Pointer + +func eqslice(x, y []uintptr) bool { + if len(x) != len(y) { + return false + } + for i, xi := range x { + if xi != y[i] { + return false + } + } + return true +} + +func mprof_GC() { + for b := mbuckets; b != nil; b = b.allnext { + mp := b.mp() + mp.allocs += mp.prev_allocs + mp.frees += mp.prev_frees + mp.alloc_bytes += mp.prev_alloc_bytes + mp.free_bytes += mp.prev_free_bytes + + mp.prev_allocs = mp.recent_allocs + mp.prev_frees = mp.recent_frees + mp.prev_alloc_bytes = mp.recent_alloc_bytes + mp.prev_free_bytes = mp.recent_free_bytes + + mp.recent_allocs = 0 + mp.recent_frees = 0 + mp.recent_alloc_bytes = 0 + mp.recent_free_bytes = 0 + } +} + +// Record that a gc just happened: all the 'recent' statistics are now real. +func mProf_GC() { + lock(&proflock) + mprof_GC() + unlock(&proflock) +} + +// Called by malloc to record a profiled block. +func mProf_Malloc(p unsafe.Pointer, size uintptr) { + var stk [maxStack]uintptr + nstk := callers(4, &stk[0], len(stk)) + lock(&proflock) + b := stkbucket(memProfile, size, stk[:nstk], true) + mp := b.mp() + mp.recent_allocs++ + mp.recent_alloc_bytes += size + unlock(&proflock) + + // Setprofilebucket locks a bunch of other mutexes, so we call it outside of proflock. + // This reduces potential contention and chances of deadlocks. + // Since the object must be alive during call to mProf_Malloc, + // it's fine to do this non-atomically. + setprofilebucket(p, b) +} + +func setprofilebucket_m() // mheap.c + +func setprofilebucket(p unsafe.Pointer, b *bucket) { + g := getg() + g.m.ptrarg[0] = p + g.m.ptrarg[1] = unsafe.Pointer(b) + onM(setprofilebucket_m) +} + +// Called when freeing a profiled block. +func mProf_Free(b *bucket, size uintptr, freed bool) { + lock(&proflock) + mp := b.mp() + if freed { + mp.recent_frees++ + mp.recent_free_bytes += size + } else { + mp.prev_frees++ + mp.prev_free_bytes += size + } + unlock(&proflock) +} + +var blockprofilerate uint64 // in CPU ticks + +// SetBlockProfileRate controls the fraction of goroutine blocking events +// that are reported in the blocking profile. The profiler aims to sample +// an average of one blocking event per rate nanoseconds spent blocked. +// +// To include every blocking event in the profile, pass rate = 1. +// To turn off profiling entirely, pass rate <= 0. 
+func SetBlockProfileRate(rate int) { + var r int64 + if rate <= 0 { + r = 0 // disable profiling + } else if rate == 1 { + r = 1 // profile everything + } else { + // convert ns to cycles, use float64 to prevent overflow during multiplication + r = int64(float64(rate) * float64(tickspersecond()) / (1000 * 1000 * 1000)) + if r == 0 { + r = 1 + } + } + + atomicstore64(&blockprofilerate, uint64(r)) +} + +func blockevent(cycles int64, skip int) { + if cycles <= 0 { + cycles = 1 + } + rate := int64(atomicload64(&blockprofilerate)) + if rate <= 0 || (rate > cycles && int64(fastrand1())%rate > cycles) { + return + } + gp := getg() + var nstk int + var stk [maxStack]uintptr + if gp.m.curg == nil || gp.m.curg == gp { + nstk = callers(skip, &stk[0], len(stk)) + } else { + nstk = gcallers(gp.m.curg, skip, &stk[0], len(stk)) + } + lock(&proflock) + b := stkbucket(blockProfile, 0, stk[:nstk], true) + b.bp().count++ + b.bp().cycles += cycles + unlock(&proflock) +} + +// Go interface to profile data. + +// A StackRecord describes a single execution stack. +type StackRecord struct { + Stack0 [32]uintptr // stack trace for this record; ends at first 0 entry +} + +// Stack returns the stack trace associated with the record, +// a prefix of r.Stack0. +func (r *StackRecord) Stack() []uintptr { + for i, v := range r.Stack0 { + if v == 0 { + return r.Stack0[0:i] + } + } + return r.Stack0[0:] +} + +// MemProfileRate controls the fraction of memory allocations +// that are recorded and reported in the memory profile. +// The profiler aims to sample an average of +// one allocation per MemProfileRate bytes allocated. +// +// To include every allocated block in the profile, set MemProfileRate to 1. +// To turn off profiling entirely, set MemProfileRate to 0. +// +// The tools that process the memory profiles assume that the +// profile rate is constant across the lifetime of the program +// and equal to the current value. Programs that change the +// memory profiling rate should do so just once, as early as +// possible in the execution of the program (for example, +// at the beginning of main). +var MemProfileRate int = 512 * 1024 + +// A MemProfileRecord describes the live objects allocated +// by a particular call sequence (stack trace). +type MemProfileRecord struct { + AllocBytes, FreeBytes int64 // number of bytes allocated, freed + AllocObjects, FreeObjects int64 // number of objects allocated, freed + Stack0 [32]uintptr // stack trace for this record; ends at first 0 entry +} + +// InUseBytes returns the number of bytes in use (AllocBytes - FreeBytes). +func (r *MemProfileRecord) InUseBytes() int64 { return r.AllocBytes - r.FreeBytes } + +// InUseObjects returns the number of objects in use (AllocObjects - FreeObjects). +func (r *MemProfileRecord) InUseObjects() int64 { + return r.AllocObjects - r.FreeObjects +} + +// Stack returns the stack trace associated with the record, +// a prefix of r.Stack0. +func (r *MemProfileRecord) Stack() []uintptr { + for i, v := range r.Stack0 { + if v == 0 { + return r.Stack0[0:i] + } + } + return r.Stack0[0:] +} + +// MemProfile returns n, the number of records in the current memory profile. +// If len(p) >= n, MemProfile copies the profile into p and returns n, true. +// If len(p) < n, MemProfile does not change p and returns n, false. +// +// If inuseZero is true, the profile includes allocation records +// where r.AllocBytes > 0 but r.AllocBytes == r.FreeBytes. +// These are sites where memory was allocated, but it has all +// been released back to the runtime. 
+// +// Most clients should use the runtime/pprof package or +// the testing package's -test.memprofile flag instead +// of calling MemProfile directly. +func MemProfile(p []MemProfileRecord, inuseZero bool) (n int, ok bool) { + lock(&proflock) + clear := true + for b := mbuckets; b != nil; b = b.allnext { + mp := b.mp() + if inuseZero || mp.alloc_bytes != mp.free_bytes { + n++ + } + if mp.allocs != 0 || mp.frees != 0 { + clear = false + } + } + if clear { + // Absolutely no data, suggesting that a garbage collection + // has not yet happened. In order to allow profiling when + // garbage collection is disabled from the beginning of execution, + // accumulate stats as if a GC just happened, and recount buckets. + mprof_GC() + mprof_GC() + n = 0 + for b := mbuckets; b != nil; b = b.allnext { + mp := b.mp() + if inuseZero || mp.alloc_bytes != mp.free_bytes { + n++ + } + } + } + if n <= len(p) { + ok = true + idx := 0 + for b := mbuckets; b != nil; b = b.allnext { + mp := b.mp() + if inuseZero || mp.alloc_bytes != mp.free_bytes { + record(&p[idx], b) + idx++ + } + } + } + unlock(&proflock) + return +} + +// Write b's data to r. +func record(r *MemProfileRecord, b *bucket) { + mp := b.mp() + r.AllocBytes = int64(mp.alloc_bytes) + r.FreeBytes = int64(mp.free_bytes) + r.AllocObjects = int64(mp.allocs) + r.FreeObjects = int64(mp.frees) + copy(r.Stack0[:], b.stk()) + for i := int(b.nstk); i < len(r.Stack0); i++ { + r.Stack0[i] = 0 + } +} + +func iterate_memprof(fn func(*bucket, uintptr, *uintptr, uintptr, uintptr, uintptr)) { + lock(&proflock) + for b := mbuckets; b != nil; b = b.allnext { + mp := b.mp() + fn(b, uintptr(b.nstk), &b.stk()[0], b.size, mp.allocs, mp.frees) + } + unlock(&proflock) +} + +// BlockProfileRecord describes blocking events originated +// at a particular call sequence (stack trace). +type BlockProfileRecord struct { + Count int64 + Cycles int64 + StackRecord +} + +// BlockProfile returns n, the number of records in the current blocking profile. +// If len(p) >= n, BlockProfile copies the profile into p and returns n, true. +// If len(p) < n, BlockProfile does not change p and returns n, false. +// +// Most clients should use the runtime/pprof package or +// the testing package's -test.blockprofile flag instead +// of calling BlockProfile directly. +func BlockProfile(p []BlockProfileRecord) (n int, ok bool) { + lock(&proflock) + for b := bbuckets; b != nil; b = b.allnext { + n++ + } + if n <= len(p) { + ok = true + for b := bbuckets; b != nil; b = b.allnext { + bp := b.bp() + r := &p[0] + r.Count = int64(bp.count) + r.Cycles = int64(bp.cycles) + i := copy(r.Stack0[:], b.stk()) + for ; i < len(r.Stack0); i++ { + r.Stack0[i] = 0 + } + p = p[1:] + } + } + unlock(&proflock) + return +} + +// ThreadCreateProfile returns n, the number of records in the thread creation profile. +// If len(p) >= n, ThreadCreateProfile copies the profile into p and returns n, true. +// If len(p) < n, ThreadCreateProfile does not change p and returns n, false. +// +// Most clients should use the runtime/pprof package instead +// of calling ThreadCreateProfile directly. 
+func ThreadCreateProfile(p []StackRecord) (n int, ok bool) { + first := (*m)(atomicloadp(unsafe.Pointer(&allm))) + for mp := first; mp != nil; mp = mp.alllink { + n++ + } + if n <= len(p) { + ok = true + i := 0 + for mp := first; mp != nil; mp = mp.alllink { + for s := range mp.createstack { + p[i].Stack0[s] = uintptr(mp.createstack[s]) + } + i++ + } + } + return +} + +var allgs []*g // proc.c + +// GoroutineProfile returns n, the number of records in the active goroutine stack profile. +// If len(p) >= n, GoroutineProfile copies the profile into p and returns n, true. +// If len(p) < n, GoroutineProfile does not change p and returns n, false. +// +// Most clients should use the runtime/pprof package instead +// of calling GoroutineProfile directly. +func GoroutineProfile(p []StackRecord) (n int, ok bool) { + + n = NumGoroutine() + if n <= len(p) { + gp := getg() + semacquire(&worldsema, false) + gp.m.gcing = 1 + onM(stoptheworld) + + n = NumGoroutine() + if n <= len(p) { + ok = true + r := p + sp := getcallersp(unsafe.Pointer(&p)) + pc := getcallerpc(unsafe.Pointer(&p)) + onM(func() { + saveg(pc, sp, gp, &r[0]) + }) + r = r[1:] + for _, gp1 := range allgs { + if gp1 == gp || readgstatus(gp1) == _Gdead { + continue + } + saveg(^uintptr(0), ^uintptr(0), gp1, &r[0]) + r = r[1:] + } + } + + gp.m.gcing = 0 + semrelease(&worldsema) + onM(starttheworld) + } + + return n, ok +} + +func saveg(pc, sp uintptr, gp *g, r *StackRecord) { + n := gentraceback(pc, sp, 0, gp, 0, &r.Stack0[0], len(r.Stack0), nil, nil, 0) + if n < len(r.Stack0) { + r.Stack0[n] = 0 + } +} + +// Stack formats a stack trace of the calling goroutine into buf +// and returns the number of bytes written to buf. +// If all is true, Stack formats stack traces of all other goroutines +// into buf after the trace for the current goroutine. +func Stack(buf []byte, all bool) int { + mp := acquirem() + gp := mp.curg + if all { + semacquire(&worldsema, false) + mp.gcing = 1 + releasem(mp) + onM(stoptheworld) + if mp != acquirem() { + gothrow("Stack: rescheduled") + } + } + + n := 0 + if len(buf) > 0 { + sp := getcallersp(unsafe.Pointer(&buf)) + pc := getcallerpc(unsafe.Pointer(&buf)) + onM(func() { + g0 := getg() + g0.writebuf = buf[0:0:len(buf)] + goroutineheader(gp) + traceback(pc, sp, 0, gp) + if all { + tracebackothers(gp) + } + n = len(g0.writebuf) + g0.writebuf = nil + }) + } + + if all { + mp.gcing = 0 + semrelease(&worldsema) + onM(starttheworld) + } + releasem(mp) + return n +} + +// Tracing of alloc/free/gc. 
+ +var tracelock mutex + +func tracealloc(p unsafe.Pointer, size uintptr, typ *_type) { + lock(&tracelock) + gp := getg() + gp.m.traceback = 2 + if typ == nil { + print("tracealloc(", p, ", ", hex(size), ")\n") + } else { + print("tracealloc(", p, ", ", hex(size), ", ", *typ._string, ")\n") + } + if gp.m.curg == nil || gp == gp.m.curg { + goroutineheader(gp) + pc := getcallerpc(unsafe.Pointer(&p)) + sp := getcallersp(unsafe.Pointer(&p)) + onM(func() { + traceback(pc, sp, 0, gp) + }) + } else { + goroutineheader(gp.m.curg) + traceback(^uintptr(0), ^uintptr(0), 0, gp.m.curg) + } + print("\n") + gp.m.traceback = 0 + unlock(&tracelock) +} + +func tracefree(p unsafe.Pointer, size uintptr) { + lock(&tracelock) + gp := getg() + gp.m.traceback = 2 + print("tracefree(", p, ", ", hex(size), ")\n") + goroutineheader(gp) + pc := getcallerpc(unsafe.Pointer(&p)) + sp := getcallersp(unsafe.Pointer(&p)) + onM(func() { + traceback(pc, sp, 0, gp) + }) + print("\n") + gp.m.traceback = 0 + unlock(&tracelock) +} + +func tracegc() { + lock(&tracelock) + gp := getg() + gp.m.traceback = 2 + print("tracegc()\n") + // running on m->g0 stack; show all non-g0 goroutines + tracebackothers(gp) + print("end tracegc\n") + print("\n") + gp.m.traceback = 0 + unlock(&tracelock) +} diff --git a/src/runtime/msize.c b/src/runtime/msize.c new file mode 100644 index 000000000..7cb65dad0 --- /dev/null +++ b/src/runtime/msize.c @@ -0,0 +1,184 @@ +// Copyright 2009 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// Malloc small size classes. +// +// See malloc.h for overview. +// +// The size classes are chosen so that rounding an allocation +// request up to the next size class wastes at most 12.5% (1.125x). +// +// Each size class has its own page count that gets allocated +// and chopped up when new objects of the size class are needed. +// That page count is chosen so that chopping up the run of +// pages into objects of the given size wastes at most 12.5% (1.125x) +// of the memory. It is not necessary that the cutoff here be +// the same as above. +// +// The two sources of waste multiply, so the worst possible case +// for the above constraints would be that allocations of some +// size might have a 26.6% (1.266x) overhead. +// In practice, only one of the wastes comes into play for a +// given size (sizes < 512 waste mainly on the round-up, +// sizes > 512 waste mainly on the page chopping). +// +// TODO(rsc): Compute max waste for any given size. + +#include "runtime.h" +#include "arch_GOARCH.h" +#include "malloc.h" +#include "textflag.h" + +#pragma dataflag NOPTR +int32 runtime·class_to_size[NumSizeClasses]; +#pragma dataflag NOPTR +int32 runtime·class_to_allocnpages[NumSizeClasses]; + +// The SizeToClass lookup is implemented using two arrays, +// one mapping sizes <= 1024 to their class and one mapping +// sizes >= 1024 and <= MaxSmallSize to their class. +// All objects are 8-aligned, so the first array is indexed by +// the size divided by 8 (rounded up). Objects >= 1024 bytes +// are 128-aligned, so the second array is indexed by the +// size divided by 128 (rounded up). The arrays are filled in +// by InitSizes. 
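Editorial note: the SizeToClass lookup just described maps a request size to its class with two small index tables. Below is a minimal Go sketch of the same two-array indexing; the C version, runtime·SizeToClass, follows further down. Here maxSmallSize is an assumed stand-in for MaxSmallSize from malloc.h, and the tables are left zeroed rather than filled the way InitSizes fills them, so only the indexing is meaningful.

package main

import "fmt"

// Assumed value, standing in for MaxSmallSize from malloc.h.
const maxSmallSize = 32 << 10

// Zeroed here; in the runtime, InitSizes fills runtime·size_to_class8
// and runtime·size_to_class128.
var (
	sizeToClass8   [1024/8 + 1]int8
	sizeToClass128 [(maxSmallSize-1024)/128 + 1]int8
)

// sizeToClass mirrors runtime·SizeToClass below: sizes up to 1016 bytes index
// the 8-byte-granularity table, larger small sizes index the 128-byte table.
func sizeToClass(size int) int {
	if size > maxSmallSize {
		panic("sizeToClass: invalid size")
	}
	if size > 1024-8 {
		return int(sizeToClass128[(size-1024+127)>>7]) // ceil((size-1024)/128)
	}
	return int(sizeToClass8[(size+7)>>3]) // ceil(size/8)
}

func main() {
	// With zeroed tables both lookups return class 0; the point is the indexing.
	fmt.Println(sizeToClass(500), sizeToClass(2000))
}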
+ +#pragma dataflag NOPTR +int8 runtime·size_to_class8[1024/8 + 1]; +#pragma dataflag NOPTR +int8 runtime·size_to_class128[(MaxSmallSize-1024)/128 + 1]; + +void runtime·testdefersizes(void); + +int32 +runtime·SizeToClass(int32 size) +{ + if(size > MaxSmallSize) + runtime·throw("SizeToClass - invalid size"); + if(size > 1024-8) + return runtime·size_to_class128[(size-1024+127) >> 7]; + return runtime·size_to_class8[(size+7)>>3]; +} + +void +runtime·InitSizes(void) +{ + int32 align, sizeclass, size, nextsize, n; + uint32 i; + uintptr allocsize, npages; + + // Initialize the runtime·class_to_size table (and choose class sizes in the process). + runtime·class_to_size[0] = 0; + sizeclass = 1; // 0 means no class + align = 8; + for(size = align; size <= MaxSmallSize; size += align) { + if((size&(size-1)) == 0) { // bump alignment once in a while + if(size >= 2048) + align = 256; + else if(size >= 128) + align = size / 8; + else if(size >= 16) + align = 16; // required for x86 SSE instructions, if we want to use them + } + if((align&(align-1)) != 0) + runtime·throw("InitSizes - bug"); + + // Make the allocnpages big enough that + // the leftover is less than 1/8 of the total, + // so wasted space is at most 12.5%. + allocsize = PageSize; + while(allocsize%size > allocsize/8) + allocsize += PageSize; + npages = allocsize >> PageShift; + + // If the previous sizeclass chose the same + // allocation size and fit the same number of + // objects into the page, we might as well + // use just this size instead of having two + // different sizes. + if(sizeclass > 1 && + npages == runtime·class_to_allocnpages[sizeclass-1] && + allocsize/size == allocsize/runtime·class_to_size[sizeclass-1]) { + runtime·class_to_size[sizeclass-1] = size; + continue; + } + + runtime·class_to_allocnpages[sizeclass] = npages; + runtime·class_to_size[sizeclass] = size; + sizeclass++; + } + if(sizeclass != NumSizeClasses) { + runtime·printf("sizeclass=%d NumSizeClasses=%d\n", sizeclass, NumSizeClasses); + runtime·throw("InitSizes - bad NumSizeClasses"); + } + + // Initialize the size_to_class tables. + nextsize = 0; + for (sizeclass = 1; sizeclass < NumSizeClasses; sizeclass++) { + for(; nextsize < 1024 && nextsize <= runtime·class_to_size[sizeclass]; nextsize+=8) + runtime·size_to_class8[nextsize/8] = sizeclass; + if(nextsize >= 1024) + for(; nextsize <= runtime·class_to_size[sizeclass]; nextsize += 128) + runtime·size_to_class128[(nextsize-1024)/128] = sizeclass; + } + + // Double-check SizeToClass. + if(0) { + for(n=0; n < MaxSmallSize; n++) { + sizeclass = runtime·SizeToClass(n); + if(sizeclass < 1 || sizeclass >= NumSizeClasses || runtime·class_to_size[sizeclass] < n) { + runtime·printf("size=%d sizeclass=%d runtime·class_to_size=%d\n", n, sizeclass, runtime·class_to_size[sizeclass]); + runtime·printf("incorrect SizeToClass"); + goto dump; + } + if(sizeclass > 1 && runtime·class_to_size[sizeclass-1] >= n) { + runtime·printf("size=%d sizeclass=%d runtime·class_to_size=%d\n", n, sizeclass, runtime·class_to_size[sizeclass]); + runtime·printf("SizeToClass too big"); + goto dump; + } + } + } + + runtime·testdefersizes(); + + // Copy out for statistics table. 
+ for(i=0; i<nelem(runtime·class_to_size); i++) + mstats.by_size[i].size = runtime·class_to_size[i]; + return; + +dump: + if(1){ + runtime·printf("NumSizeClasses=%d\n", NumSizeClasses); + runtime·printf("runtime·class_to_size:"); + for(sizeclass=0; sizeclass<NumSizeClasses; sizeclass++) + runtime·printf(" %d", runtime·class_to_size[sizeclass]); + runtime·printf("\n\n"); + runtime·printf("size_to_class8:"); + for(i=0; i<nelem(runtime·size_to_class8); i++) + runtime·printf(" %d=>%d(%d)\n", i*8, runtime·size_to_class8[i], + runtime·class_to_size[runtime·size_to_class8[i]]); + runtime·printf("\n"); + runtime·printf("size_to_class128:"); + for(i=0; i<nelem(runtime·size_to_class128); i++) + runtime·printf(" %d=>%d(%d)\n", i*128, runtime·size_to_class128[i], + runtime·class_to_size[runtime·size_to_class128[i]]); + runtime·printf("\n"); + } + runtime·throw("InitSizes failed"); +} + +// Returns size of the memory block that mallocgc will allocate if you ask for the size. +uintptr +runtime·roundupsize(uintptr size) +{ + if(size < MaxSmallSize) { + if(size <= 1024-8) + return runtime·class_to_size[runtime·size_to_class8[(size+7)>>3]]; + else + return runtime·class_to_size[runtime·size_to_class128[(size-1024+127) >> 7]]; + } + if(size + PageSize < size) + return size; + return ROUND(size, PageSize); +} diff --git a/src/runtime/netpoll.go b/src/runtime/netpoll.go new file mode 100644 index 000000000..3456e0208 --- /dev/null +++ b/src/runtime/netpoll.go @@ -0,0 +1,455 @@ +// Copyright 2013 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// +build darwin dragonfly freebsd linux nacl netbsd openbsd solaris windows + +package runtime + +import "unsafe" + +// Integrated network poller (platform-independent part). +// A particular implementation (epoll/kqueue) must define the following functions: +// func netpollinit() // to initialize the poller +// func netpollopen(fd uintptr, pd *pollDesc) int32 // to arm edge-triggered notifications +// and associate fd with pd. +// An implementation must call the following function to denote that the pd is ready. +// func netpollready(gpp **g, pd *pollDesc, mode int32) + +// pollDesc contains 2 binary semaphores, rg and wg, to park reader and writer +// goroutines respectively. The semaphore can be in the following states: +// pdReady - io readiness notification is pending; +// a goroutine consumes the notification by changing the state to nil. +// pdWait - a goroutine prepares to park on the semaphore, but not yet parked; +// the goroutine commits to park by changing the state to G pointer, +// or, alternatively, concurrent io notification changes the state to READY, +// or, alternatively, concurrent timeout/close changes the state to nil. +// G pointer - the goroutine is blocked on the semaphore; +// io notification or timeout/close changes the state to READY or nil respectively +// and unparks the goroutine. +// nil - nothing of the above. +const ( + pdReady uintptr = 1 + pdWait uintptr = 2 +) + +const pollBlockSize = 4 * 1024 + +// Network poller descriptor. +type pollDesc struct { + link *pollDesc // in pollcache, protected by pollcache.lock + + // The lock protects pollOpen, pollSetDeadline, pollUnblock and deadlineimpl operations. + // This fully covers seq, rt and wt variables. fd is constant throughout the PollDesc lifetime. + // pollReset, pollWait, pollWaitCanceled and runtime·netpollready (IO readiness notification) + // proceed w/o taking the lock. 
So closing, rg, rd, wg and wd are manipulated + // in a lock-free way by all operations. + // NOTE(dvyukov): the following code uses uintptr to store *g (rg/wg), + // that will blow up when GC starts moving objects. + lock mutex // protectes the following fields + fd uintptr + closing bool + seq uintptr // protects from stale timers and ready notifications + rg uintptr // pdReady, pdWait, G waiting for read or nil + rt timer // read deadline timer (set if rt.f != nil) + rd int64 // read deadline + wg uintptr // pdReady, pdWait, G waiting for write or nil + wt timer // write deadline timer + wd int64 // write deadline + user unsafe.Pointer // user settable cookie +} + +type pollCache struct { + lock mutex + first *pollDesc + // PollDesc objects must be type-stable, + // because we can get ready notification from epoll/kqueue + // after the descriptor is closed/reused. + // Stale notifications are detected using seq variable, + // seq is incremented when deadlines are changed or descriptor is reused. +} + +var pollcache pollCache + +func netpollServerInit() { + onM(netpollinit) +} + +func netpollOpen(fd uintptr) (*pollDesc, int) { + pd := pollcache.alloc() + lock(&pd.lock) + if pd.wg != 0 && pd.wg != pdReady { + gothrow("netpollOpen: blocked write on free descriptor") + } + if pd.rg != 0 && pd.rg != pdReady { + gothrow("netpollOpen: blocked read on free descriptor") + } + pd.fd = fd + pd.closing = false + pd.seq++ + pd.rg = 0 + pd.rd = 0 + pd.wg = 0 + pd.wd = 0 + unlock(&pd.lock) + + var errno int32 + onM(func() { + errno = netpollopen(fd, pd) + }) + return pd, int(errno) +} + +func netpollClose(pd *pollDesc) { + if !pd.closing { + gothrow("netpollClose: close w/o unblock") + } + if pd.wg != 0 && pd.wg != pdReady { + gothrow("netpollClose: blocked write on closing descriptor") + } + if pd.rg != 0 && pd.rg != pdReady { + gothrow("netpollClose: blocked read on closing descriptor") + } + onM(func() { + netpollclose(uintptr(pd.fd)) + }) + pollcache.free(pd) +} + +func (c *pollCache) free(pd *pollDesc) { + lock(&c.lock) + pd.link = c.first + c.first = pd + unlock(&c.lock) +} + +func netpollReset(pd *pollDesc, mode int) int { + err := netpollcheckerr(pd, int32(mode)) + if err != 0 { + return err + } + if mode == 'r' { + pd.rg = 0 + } else if mode == 'w' { + pd.wg = 0 + } + return 0 +} + +func netpollWait(pd *pollDesc, mode int) int { + err := netpollcheckerr(pd, int32(mode)) + if err != 0 { + return err + } + // As for now only Solaris uses level-triggered IO. + if GOOS == "solaris" { + onM(func() { + netpollarm(pd, mode) + }) + } + for !netpollblock(pd, int32(mode), false) { + err = netpollcheckerr(pd, int32(mode)) + if err != 0 { + return err + } + // Can happen if timeout has fired and unblocked us, + // but before we had a chance to run, timeout has been reset. + // Pretend it has not happened and retry. + } + return 0 +} + +func netpollWaitCanceled(pd *pollDesc, mode int) { + // This function is used only on windows after a failed attempt to cancel + // a pending async IO operation. Wait for ioready, ignore closing or timeouts. + for !netpollblock(pd, int32(mode), true) { + } +} + +func netpollSetDeadline(pd *pollDesc, d int64, mode int) { + lock(&pd.lock) + if pd.closing { + unlock(&pd.lock) + return + } + pd.seq++ // invalidate current timers + // Reset current timers. + if pd.rt.f != nil { + deltimer(&pd.rt) + pd.rt.f = nil + } + if pd.wt.f != nil { + deltimer(&pd.wt) + pd.wt.f = nil + } + // Setup new timers. 
+ if d != 0 && d <= nanotime() { + d = -1 + } + if mode == 'r' || mode == 'r'+'w' { + pd.rd = d + } + if mode == 'w' || mode == 'r'+'w' { + pd.wd = d + } + if pd.rd > 0 && pd.rd == pd.wd { + pd.rt.f = netpollDeadline + pd.rt.when = pd.rd + // Copy current seq into the timer arg. + // Timer func will check the seq against current descriptor seq, + // if they differ the descriptor was reused or timers were reset. + pd.rt.arg = pd + pd.rt.seq = pd.seq + addtimer(&pd.rt) + } else { + if pd.rd > 0 { + pd.rt.f = netpollReadDeadline + pd.rt.when = pd.rd + pd.rt.arg = pd + pd.rt.seq = pd.seq + addtimer(&pd.rt) + } + if pd.wd > 0 { + pd.wt.f = netpollWriteDeadline + pd.wt.when = pd.wd + pd.wt.arg = pd + pd.wt.seq = pd.seq + addtimer(&pd.wt) + } + } + // If we set the new deadline in the past, unblock currently pending IO if any. + var rg, wg *g + atomicstorep(unsafe.Pointer(&wg), nil) // full memory barrier between stores to rd/wd and load of rg/wg in netpollunblock + if pd.rd < 0 { + rg = netpollunblock(pd, 'r', false) + } + if pd.wd < 0 { + wg = netpollunblock(pd, 'w', false) + } + unlock(&pd.lock) + if rg != nil { + goready(rg) + } + if wg != nil { + goready(wg) + } +} + +func netpollUnblock(pd *pollDesc) { + lock(&pd.lock) + if pd.closing { + gothrow("netpollUnblock: already closing") + } + pd.closing = true + pd.seq++ + var rg, wg *g + atomicstorep(unsafe.Pointer(&rg), nil) // full memory barrier between store to closing and read of rg/wg in netpollunblock + rg = netpollunblock(pd, 'r', false) + wg = netpollunblock(pd, 'w', false) + if pd.rt.f != nil { + deltimer(&pd.rt) + pd.rt.f = nil + } + if pd.wt.f != nil { + deltimer(&pd.wt) + pd.wt.f = nil + } + unlock(&pd.lock) + if rg != nil { + goready(rg) + } + if wg != nil { + goready(wg) + } +} + +func netpollfd(pd *pollDesc) uintptr { + return pd.fd +} + +func netpolluser(pd *pollDesc) *unsafe.Pointer { + return &pd.user +} + +func netpollclosing(pd *pollDesc) bool { + return pd.closing +} + +func netpolllock(pd *pollDesc) { + lock(&pd.lock) +} + +func netpollunlock(pd *pollDesc) { + unlock(&pd.lock) +} + +// make pd ready, newly runnable goroutines (if any) are returned in rg/wg +func netpollready(gpp **g, pd *pollDesc, mode int32) { + var rg, wg *g + if mode == 'r' || mode == 'r'+'w' { + rg = netpollunblock(pd, 'r', true) + } + if mode == 'w' || mode == 'r'+'w' { + wg = netpollunblock(pd, 'w', true) + } + if rg != nil { + rg.schedlink = *gpp + *gpp = rg + } + if wg != nil { + wg.schedlink = *gpp + *gpp = wg + } +} + +func netpollcheckerr(pd *pollDesc, mode int32) int { + if pd.closing { + return 1 // errClosing + } + if (mode == 'r' && pd.rd < 0) || (mode == 'w' && pd.wd < 0) { + return 2 // errTimeout + } + return 0 +} + +func netpollblockcommit(gp *g, gpp unsafe.Pointer) bool { + return casuintptr((*uintptr)(gpp), pdWait, uintptr(unsafe.Pointer(gp))) +} + +// returns true if IO is ready, or false if timedout or closed +// waitio - wait only for completed IO, ignore errors +func netpollblock(pd *pollDesc, mode int32, waitio bool) bool { + gpp := &pd.rg + if mode == 'w' { + gpp = &pd.wg + } + + // set the gpp semaphore to WAIT + for { + old := *gpp + if old == pdReady { + *gpp = 0 + return true + } + if old != 0 { + gothrow("netpollblock: double wait") + } + if casuintptr(gpp, 0, pdWait) { + break + } + } + + // need to recheck error states after setting gpp to WAIT + // this is necessary because runtime_pollUnblock/runtime_pollSetDeadline/deadlineimpl + // do the opposite: store to closing/rd/wd, membarrier, load of rg/wg + if waitio || 
netpollcheckerr(pd, mode) == 0 { + f := netpollblockcommit + gopark(**(**unsafe.Pointer)(unsafe.Pointer(&f)), unsafe.Pointer(gpp), "IO wait") + } + // be careful to not lose concurrent READY notification + old := xchguintptr(gpp, 0) + if old > pdWait { + gothrow("netpollblock: corrupted state") + } + return old == pdReady +} + +func netpollunblock(pd *pollDesc, mode int32, ioready bool) *g { + gpp := &pd.rg + if mode == 'w' { + gpp = &pd.wg + } + + for { + old := *gpp + if old == pdReady { + return nil + } + if old == 0 && !ioready { + // Only set READY for ioready. runtime_pollWait + // will check for timeout/cancel before waiting. + return nil + } + var new uintptr + if ioready { + new = pdReady + } + if casuintptr(gpp, old, new) { + if old == pdReady || old == pdWait { + old = 0 + } + return (*g)(unsafe.Pointer(old)) + } + } +} + +func netpolldeadlineimpl(pd *pollDesc, seq uintptr, read, write bool) { + lock(&pd.lock) + // Seq arg is seq when the timer was set. + // If it's stale, ignore the timer event. + if seq != pd.seq { + // The descriptor was reused or timers were reset. + unlock(&pd.lock) + return + } + var rg *g + if read { + if pd.rd <= 0 || pd.rt.f == nil { + gothrow("netpolldeadlineimpl: inconsistent read deadline") + } + pd.rd = -1 + atomicstorep(unsafe.Pointer(&pd.rt.f), nil) // full memory barrier between store to rd and load of rg in netpollunblock + rg = netpollunblock(pd, 'r', false) + } + var wg *g + if write { + if pd.wd <= 0 || pd.wt.f == nil && !read { + gothrow("netpolldeadlineimpl: inconsistent write deadline") + } + pd.wd = -1 + atomicstorep(unsafe.Pointer(&pd.wt.f), nil) // full memory barrier between store to wd and load of wg in netpollunblock + wg = netpollunblock(pd, 'w', false) + } + unlock(&pd.lock) + if rg != nil { + goready(rg) + } + if wg != nil { + goready(wg) + } +} + +func netpollDeadline(arg interface{}, seq uintptr) { + netpolldeadlineimpl(arg.(*pollDesc), seq, true, true) +} + +func netpollReadDeadline(arg interface{}, seq uintptr) { + netpolldeadlineimpl(arg.(*pollDesc), seq, true, false) +} + +func netpollWriteDeadline(arg interface{}, seq uintptr) { + netpolldeadlineimpl(arg.(*pollDesc), seq, false, true) +} + +func (c *pollCache) alloc() *pollDesc { + lock(&c.lock) + if c.first == nil { + const pdSize = unsafe.Sizeof(pollDesc{}) + n := pollBlockSize / pdSize + if n == 0 { + n = 1 + } + // Must be in non-GC memory because can be referenced + // only from epoll/kqueue internals. + mem := persistentalloc(n*pdSize, 0, &memstats.other_sys) + for i := uintptr(0); i < n; i++ { + pd := (*pollDesc)(add(mem, i*pdSize)) + pd.link = c.first + c.first = pd + } + } + pd := c.first + c.first = pd.link + unlock(&c.lock) + return pd +} diff --git a/src/runtime/netpoll_epoll.go b/src/runtime/netpoll_epoll.go new file mode 100644 index 000000000..ecfc9cdde --- /dev/null +++ b/src/runtime/netpoll_epoll.go @@ -0,0 +1,97 @@ +// Copyright 2013 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. 
+ +// +build linux + +package runtime + +import "unsafe" + +func epollcreate(size int32) int32 +func epollcreate1(flags int32) int32 + +//go:noescape +func epollctl(epfd, op, fd int32, ev *epollevent) int32 + +//go:noescape +func epollwait(epfd int32, ev *epollevent, nev, timeout int32) int32 +func closeonexec(fd int32) + +var ( + epfd int32 = -1 // epoll descriptor + netpolllasterr int32 +) + +func netpollinit() { + epfd = epollcreate1(_EPOLL_CLOEXEC) + if epfd >= 0 { + return + } + epfd = epollcreate(1024) + if epfd >= 0 { + closeonexec(epfd) + return + } + println("netpollinit: failed to create epoll descriptor", -epfd) + gothrow("netpollinit: failed to create descriptor") +} + +func netpollopen(fd uintptr, pd *pollDesc) int32 { + var ev epollevent + ev.events = _EPOLLIN | _EPOLLOUT | _EPOLLRDHUP | _EPOLLET + *(**pollDesc)(unsafe.Pointer(&ev.data)) = pd + return -epollctl(epfd, _EPOLL_CTL_ADD, int32(fd), &ev) +} + +func netpollclose(fd uintptr) int32 { + var ev epollevent + return -epollctl(epfd, _EPOLL_CTL_DEL, int32(fd), &ev) +} + +func netpollarm(pd *pollDesc, mode int) { + gothrow("unused") +} + +// polls for ready network connections +// returns list of goroutines that become runnable +func netpoll(block bool) (gp *g) { + if epfd == -1 { + return + } + waitms := int32(-1) + if !block { + waitms = 0 + } + var events [128]epollevent +retry: + n := epollwait(epfd, &events[0], int32(len(events)), waitms) + if n < 0 { + if n != -_EINTR && n != netpolllasterr { + netpolllasterr = n + println("runtime: epollwait on fd", epfd, "failed with", -n) + } + goto retry + } + for i := int32(0); i < n; i++ { + ev := &events[i] + if ev.events == 0 { + continue + } + var mode int32 + if ev.events&(_EPOLLIN|_EPOLLRDHUP|_EPOLLHUP|_EPOLLERR) != 0 { + mode += 'r' + } + if ev.events&(_EPOLLOUT|_EPOLLHUP|_EPOLLERR) != 0 { + mode += 'w' + } + if mode != 0 { + pd := *(**pollDesc)(unsafe.Pointer(&ev.data)) + netpollready((**g)(noescape(unsafe.Pointer(&gp))), pd, mode) + } + } + if block && gp == nil { + goto retry + } + return gp +} diff --git a/src/runtime/netpoll_kqueue.go b/src/runtime/netpoll_kqueue.go new file mode 100644 index 000000000..d6d55b97b --- /dev/null +++ b/src/runtime/netpoll_kqueue.go @@ -0,0 +1,101 @@ +// Copyright 2013 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// +build darwin dragonfly freebsd netbsd openbsd + +package runtime + +// Integrated network poller (kqueue-based implementation). + +import "unsafe" + +func kqueue() int32 + +//go:noescape +func kevent(kq int32, ch *keventt, nch int32, ev *keventt, nev int32, ts *timespec) int32 +func closeonexec(fd int32) + +var ( + kq int32 = -1 + netpolllasterr int32 +) + +func netpollinit() { + kq = kqueue() + if kq < 0 { + println("netpollinit: kqueue failed with", -kq) + gothrow("netpollinit: kqueue failed") + } + closeonexec(kq) +} + +func netpollopen(fd uintptr, pd *pollDesc) int32 { + // Arm both EVFILT_READ and EVFILT_WRITE in edge-triggered mode (EV_CLEAR) + // for the whole fd lifetime. The notifications are automatically unregistered + // when fd is closed. 
+ var ev [2]keventt + *(*uintptr)(unsafe.Pointer(&ev[0].ident)) = fd + ev[0].filter = _EVFILT_READ + ev[0].flags = _EV_ADD | _EV_CLEAR + ev[0].fflags = 0 + ev[0].data = 0 + ev[0].udata = (*byte)(unsafe.Pointer(pd)) + ev[1] = ev[0] + ev[1].filter = _EVFILT_WRITE + n := kevent(kq, &ev[0], 2, nil, 0, nil) + if n < 0 { + return -n + } + return 0 +} + +func netpollclose(fd uintptr) int32 { + // Don't need to unregister because calling close() + // on fd will remove any kevents that reference the descriptor. + return 0 +} + +func netpollarm(pd *pollDesc, mode int) { + gothrow("unused") +} + +// Polls for ready network connections. +// Returns list of goroutines that become runnable. +func netpoll(block bool) (gp *g) { + if kq == -1 { + return + } + var tp *timespec + var ts timespec + if !block { + tp = &ts + } + var events [64]keventt +retry: + n := kevent(kq, nil, 0, &events[0], int32(len(events)), tp) + if n < 0 { + if n != -_EINTR && n != netpolllasterr { + netpolllasterr = n + println("runtime: kevent on fd", kq, "failed with", -n) + } + goto retry + } + for i := 0; i < int(n); i++ { + ev := &events[i] + var mode int32 + if ev.filter == _EVFILT_READ { + mode += 'r' + } + if ev.filter == _EVFILT_WRITE { + mode += 'w' + } + if mode != 0 { + netpollready((**g)(noescape(unsafe.Pointer(&gp))), (*pollDesc)(unsafe.Pointer(ev.udata)), mode) + } + } + if block && gp == nil { + goto retry + } + return gp +} diff --git a/src/runtime/netpoll_nacl.go b/src/runtime/netpoll_nacl.go new file mode 100644 index 000000000..5cbc30032 --- /dev/null +++ b/src/runtime/netpoll_nacl.go @@ -0,0 +1,26 @@ +// Copyright 2013 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// Fake network poller for NaCl. +// Should never be used, because NaCl network connections do not honor "SetNonblock". + +package runtime + +func netpollinit() { +} + +func netpollopen(fd uintptr, pd *pollDesc) int32 { + return 0 +} + +func netpollclose(fd uintptr) int32 { + return 0 +} + +func netpollarm(pd *pollDesc, mode int) { +} + +func netpoll(block bool) *g { + return nil +} diff --git a/src/runtime/netpoll_solaris.c b/src/runtime/netpoll_solaris.c new file mode 100644 index 000000000..d422719cf --- /dev/null +++ b/src/runtime/netpoll_solaris.c @@ -0,0 +1,264 @@ +// Copyright 2014 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#include "runtime.h" +#include "arch_GOARCH.h" +#include "defs_GOOS_GOARCH.h" +#include "os_GOOS.h" + +// Solaris runtime-integrated network poller. +// +// Solaris uses event ports for scalable network I/O. Event +// ports are level-triggered, unlike epoll and kqueue which +// can be configured in both level-triggered and edge-triggered +// mode. Level triggering means we have to keep track of a few things +// ourselves. After we receive an event for a file descriptor, +// it's our responsibility to ask again to be notified for future +// events for that descriptor. When doing this we must keep track of +// what kind of events the goroutines are currently interested in, +// for example a fd may be open both for reading and writing. +// +// A description of the high level operation of this code +// follows. Networking code will get a file descriptor by some means +// and will register it with the netpolling mechanism by a code path +// that eventually calls runtime·netpollopen. 
runtime·netpollopen +// calls port_associate with an empty event set. That means that we +// will not receive any events at this point. The association needs +// to be done at this early point because we need to process the I/O +// readiness notification at some point in the future. If I/O becomes +// ready when nobody is listening, when we finally care about it, +// nobody will tell us anymore. +// +// Beside calling runtime·netpollopen, the networking code paths +// will call runtime·netpollarm each time goroutines are interested +// in doing network I/O. Because now we know what kind of I/O we +// are interested in (reading/writting), we can call port_associate +// passing the correct type of event set (POLLIN/POLLOUT). As we made +// sure to have already associated the file descriptor with the port, +// when we now call port_associate, we will unblock the main poller +// loop (in runtime·netpoll) right away if the socket is actually +// ready for I/O. +// +// The main poller loop runs in its own thread waiting for events +// using port_getn. When an event happens, it will tell the scheduler +// about it using runtime·netpollready. Besides doing this, it must +// also re-associate the events that were not part of this current +// notification with the file descriptor. Failing to do this would +// mean each notification will prevent concurrent code using the +// same file descriptor in parallel. +// +// The logic dealing with re-associations is encapsulated in +// runtime·netpollupdate. This function takes care to associate the +// descriptor only with the subset of events that were previously +// part of the association, except the one that just happened. We +// can't re-associate with that right away, because event ports +// are level triggered so it would cause a busy loop. Instead, that +// association is effected only by the runtime·netpollarm code path, +// when Go code actually asks for I/O. +// +// The open and arming mechanisms are serialized using the lock +// inside PollDesc. This is required because the netpoll loop runs +// asynchonously in respect to other Go code and by the time we get +// to call port_associate to update the association in the loop, the +// file descriptor might have been closed and reopened already. The +// lock allows runtime·netpollupdate to be called synchronously from +// the loop thread while preventing other threads operating to the +// same PollDesc, so once we unblock in the main loop, until we loop +// again we know for sure we are always talking about the same file +// descriptor and can safely access the data we want (the event set). 
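The comment block above describes the bookkeeping a level-triggered poller needs: remember the interest set per descriptor and re-associate after every notification so the events that did not fire are not lost. A minimal Go sketch of that pattern follows, with assumed names and event values for illustration only; the actual implementation is the C runtime·netpollupdate below, which stores the interest set in the PollDesc user cookie.

package main

import "fmt"

// interestSet models the per-descriptor word: the events we still want the
// event port to report for this fd.
type interestSet struct {
	fd     int32
	events uint32
}

// Assumed event bits, for illustration only.
const (
	pollIn  uint32 = 0x1
	pollOut uint32 = 0x4
)

// associate stands in for port_associate; it only prints what a real poller
// would register with the event port.
func associate(fd int32, events uint32) {
	fmt.Printf("port_associate(fd=%d, events=%#x)\n", fd, events)
}

// update mirrors the shape of netpollupdate: add the bits in set, drop the
// bits in clear, and re-associate only if the interest set actually changed.
func (s *interestSet) update(set, clear uint32) {
	old := s.events
	events := (old &^ clear) | set
	if events == old {
		return
	}
	s.events = events
	if events != 0 {
		associate(s.fd, events)
	}
}

func main() {
	s := &interestSet{fd: 7}
	s.update(pollIn|pollOut, 0) // goroutines want both read and write
	// A POLLIN notification fired: clear POLLIN and re-arm with what is
	// left, so the pending writer is not forgotten.
	s.update(0, pollIn)
	// When Go code asks to read again, POLLIN is added back (netpollarm).
	s.update(pollIn, 0)
}
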
+ +#pragma dynimport libc·fcntl fcntl "libc.so" +#pragma dynimport libc·port_create port_create "libc.so" +#pragma dynimport libc·port_associate port_associate "libc.so" +#pragma dynimport libc·port_dissociate port_dissociate "libc.so" +#pragma dynimport libc·port_getn port_getn "libc.so" +extern uintptr libc·fcntl; +extern uintptr libc·port_create; +extern uintptr libc·port_associate; +extern uintptr libc·port_dissociate; +extern uintptr libc·port_getn; + +#define errno (*g->m->perrno) + +int32 +runtime·fcntl(int32 fd, int32 cmd, uintptr arg) +{ + return runtime·sysvicall3(libc·fcntl, (uintptr)fd, (uintptr)cmd, (uintptr)arg); +} + +int32 +runtime·port_create(void) +{ + return runtime·sysvicall0(libc·port_create); +} + +int32 +runtime·port_associate(int32 port, int32 source, uintptr object, int32 events, uintptr user) +{ + return runtime·sysvicall5(libc·port_associate, (uintptr)port, (uintptr)source, object, (uintptr)events, user); +} + +int32 +runtime·port_dissociate(int32 port, int32 source, uintptr object) +{ + return runtime·sysvicall3(libc·port_dissociate, (uintptr)port, (uintptr)source, object); +} + +int32 +runtime·port_getn(int32 port, PortEvent *evs, uint32 max, uint32 *nget, Timespec *timeout) +{ + return runtime·sysvicall5(libc·port_getn, (uintptr)port, (uintptr)evs, (uintptr)max, (uintptr)nget, (uintptr)timeout); +} + +static int32 portfd = -1; + +void +runtime·netpollinit(void) +{ + if((portfd = runtime·port_create()) >= 0) { + runtime·fcntl(portfd, F_SETFD, FD_CLOEXEC); + return; + } + + runtime·printf("netpollinit: failed to create port (%d)\n", errno); + runtime·throw("netpollinit: failed to create port"); +} + +int32 +runtime·netpollopen(uintptr fd, PollDesc *pd) +{ + int32 r; + + runtime·netpolllock(pd); + // We don't register for any specific type of events yet, that's + // netpollarm's job. We merely ensure we call port_associate before + // asynchonous connect/accept completes, so when we actually want + // to do any I/O, the call to port_associate (from netpollarm, + // with the interested event set) will unblock port_getn right away + // because of the I/O readiness notification. + *runtime·netpolluser(pd) = 0; + r = runtime·port_associate(portfd, PORT_SOURCE_FD, fd, 0, (uintptr)pd); + runtime·netpollunlock(pd); + return r; +} + +int32 +runtime·netpollclose(uintptr fd) +{ + return runtime·port_dissociate(portfd, PORT_SOURCE_FD, fd); +} + +// Updates the association with a new set of interested events. After +// this call, port_getn will return one and only one event for that +// particular descriptor, so this function needs to be called again. +void +runtime·netpollupdate(PollDesc* pd, uint32 set, uint32 clear) +{ + uint32 *ep, old, events; + uintptr fd = runtime·netpollfd(pd); + ep = (uint32*)runtime·netpolluser(pd); + + if(runtime·netpollclosing(pd)) + return; + + old = *ep; + events = (old & ~clear) | set; + if(old == events) + return; + + if(events && runtime·port_associate(portfd, PORT_SOURCE_FD, fd, events, (uintptr)pd) != 0) { + runtime·printf("netpollupdate: failed to associate (%d)\n", errno); + runtime·throw("netpollupdate: failed to associate"); + } + *ep = events; +} + +// subscribe the fd to the port such that port_getn will return one event. 
+void +runtime·netpollarm(PollDesc* pd, int32 mode) +{ + runtime·netpolllock(pd); + switch(mode) { + case 'r': + runtime·netpollupdate(pd, POLLIN, 0); + break; + case 'w': + runtime·netpollupdate(pd, POLLOUT, 0); + break; + default: + runtime·throw("netpollarm: bad mode"); + } + runtime·netpollunlock(pd); +} + +// polls for ready network connections +// returns list of goroutines that become runnable +G* +runtime·netpoll(bool block) +{ + static int32 lasterr; + PortEvent events[128], *ev; + PollDesc *pd; + int32 i, mode, clear; + uint32 n; + Timespec *wait = nil, zero; + G *gp; + + if(portfd == -1) + return (nil); + + if(!block) { + zero.tv_sec = 0; + zero.tv_nsec = 0; + wait = &zero; + } + +retry: + n = 1; + if(runtime·port_getn(portfd, events, nelem(events), &n, wait) < 0) { + if(errno != EINTR && errno != lasterr) { + lasterr = errno; + runtime·printf("runtime: port_getn on fd %d failed with %d\n", portfd, errno); + } + goto retry; + } + + gp = nil; + for(i = 0; i < n; i++) { + ev = &events[i]; + + if(ev->portev_events == 0) + continue; + pd = (PollDesc *)ev->portev_user; + + mode = 0; + clear = 0; + if(ev->portev_events & (POLLIN|POLLHUP|POLLERR)) { + mode += 'r'; + clear |= POLLIN; + } + if(ev->portev_events & (POLLOUT|POLLHUP|POLLERR)) { + mode += 'w'; + clear |= POLLOUT; + } + // To effect edge-triggered events, we need to be sure to + // update our association with whatever events were not + // set with the event. For example if we are registered + // for POLLIN|POLLOUT, and we get POLLIN, besides waking + // the goroutine interested in POLLIN we have to not forget + // about the one interested in POLLOUT. + if(clear != 0) { + runtime·netpolllock(pd); + runtime·netpollupdate(pd, 0, clear); + runtime·netpollunlock(pd); + } + + if(mode) + runtime·netpollready(&gp, pd, mode); + } + + if(block && gp == nil) + goto retry; + return gp; +} diff --git a/src/runtime/netpoll_stub.c b/src/runtime/netpoll_stub.c new file mode 100644 index 000000000..b7a8f2944 --- /dev/null +++ b/src/runtime/netpoll_stub.c @@ -0,0 +1,18 @@ +// Copyright 2013 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// +build plan9 + +#include "runtime.h" + +// Polls for ready network connections. +// Returns list of goroutines that become runnable. +G* +runtime·netpoll(bool block) +{ + // Implementation for platforms that do not support + // integrated network poller. + USED(block); + return nil; +} diff --git a/src/runtime/netpoll_windows.c b/src/runtime/netpoll_windows.c new file mode 100644 index 000000000..64da41ad9 --- /dev/null +++ b/src/runtime/netpoll_windows.c @@ -0,0 +1,163 @@ +// Copyright 2013 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#include "runtime.h" +#include "defs_GOOS_GOARCH.h" +#include "os_GOOS.h" + +#define DWORD_MAX 0xffffffff + +#pragma dynimport runtime·CreateIoCompletionPort CreateIoCompletionPort "kernel32.dll" +#pragma dynimport runtime·GetQueuedCompletionStatus GetQueuedCompletionStatus "kernel32.dll" +#pragma dynimport runtime·WSAGetOverlappedResult WSAGetOverlappedResult "ws2_32.dll" + +extern void *runtime·CreateIoCompletionPort; +extern void *runtime·GetQueuedCompletionStatus; +extern void *runtime·WSAGetOverlappedResult; + +#define INVALID_HANDLE_VALUE ((uintptr)-1) + +// net_op must be the same as beginning of net.operation. Keep these in sync. 
+typedef struct net_op net_op; +struct net_op +{ + // used by windows + Overlapped o; + // used by netpoll + PollDesc* pd; + int32 mode; + int32 errno; + uint32 qty; +}; + +typedef struct OverlappedEntry OverlappedEntry; +struct OverlappedEntry +{ + uintptr key; + net_op* op; // In reality it's Overlapped*, but we cast it to net_op* anyway. + uintptr internal; + uint32 qty; +}; + +static void handlecompletion(G **gpp, net_op *o, int32 errno, uint32 qty); + +static uintptr iocphandle = INVALID_HANDLE_VALUE; // completion port io handle + +void +runtime·netpollinit(void) +{ + iocphandle = (uintptr)runtime·stdcall4(runtime·CreateIoCompletionPort, INVALID_HANDLE_VALUE, 0, 0, DWORD_MAX); + if(iocphandle == 0) { + runtime·printf("netpoll: failed to create iocp handle (errno=%d)\n", runtime·getlasterror()); + runtime·throw("netpoll: failed to create iocp handle"); + } + return; +} + +int32 +runtime·netpollopen(uintptr fd, PollDesc *pd) +{ + USED(pd); + if(runtime·stdcall4(runtime·CreateIoCompletionPort, fd, iocphandle, 0, 0) == 0) + return -runtime·getlasterror(); + return 0; +} + +int32 +runtime·netpollclose(uintptr fd) +{ + // nothing to do + USED(fd); + return 0; +} + +void +runtime·netpollarm(PollDesc* pd, int32 mode) +{ + USED(pd, mode); + runtime·throw("unused"); +} + +// Polls for completed network IO. +// Returns list of goroutines that become runnable. +G* +runtime·netpoll(bool block) +{ + OverlappedEntry entries[64]; + uint32 wait, qty, key, flags, n, i; + int32 errno; + net_op *op; + G *gp; + + if(iocphandle == INVALID_HANDLE_VALUE) + return nil; + gp = nil; + wait = 0; + if(block) + wait = INFINITE; +retry: + if(runtime·GetQueuedCompletionStatusEx != nil) { + n = nelem(entries) / runtime·gomaxprocs; + if(n < 8) + n = 8; + if(block) + g->m->blocked = true; + if(runtime·stdcall6(runtime·GetQueuedCompletionStatusEx, iocphandle, (uintptr)entries, n, (uintptr)&n, wait, 0) == 0) { + g->m->blocked = false; + errno = runtime·getlasterror(); + if(!block && errno == WAIT_TIMEOUT) + return nil; + runtime·printf("netpoll: GetQueuedCompletionStatusEx failed (errno=%d)\n", errno); + runtime·throw("netpoll: GetQueuedCompletionStatusEx failed"); + } + g->m->blocked = false; + for(i = 0; i < n; i++) { + op = entries[i].op; + errno = 0; + qty = 0; + if(runtime·stdcall5(runtime·WSAGetOverlappedResult, runtime·netpollfd(op->pd), (uintptr)op, (uintptr)&qty, 0, (uintptr)&flags) == 0) + errno = runtime·getlasterror(); + handlecompletion(&gp, op, errno, qty); + } + } else { + op = nil; + errno = 0; + qty = 0; + if(block) + g->m->blocked = true; + if(runtime·stdcall5(runtime·GetQueuedCompletionStatus, iocphandle, (uintptr)&qty, (uintptr)&key, (uintptr)&op, wait) == 0) { + g->m->blocked = false; + errno = runtime·getlasterror(); + if(!block && errno == WAIT_TIMEOUT) + return nil; + if(op == nil) { + runtime·printf("netpoll: GetQueuedCompletionStatus failed (errno=%d)\n", errno); + runtime·throw("netpoll: GetQueuedCompletionStatus failed"); + } + // dequeued failed IO packet, so report that + } + g->m->blocked = false; + handlecompletion(&gp, op, errno, qty); + } + if(block && gp == nil) + goto retry; + return gp; +} + +static void +handlecompletion(G **gpp, net_op *op, int32 errno, uint32 qty) +{ + int32 mode; + + if(op == nil) + runtime·throw("netpoll: GetQueuedCompletionStatus returned op == nil"); + mode = op->mode; + if(mode != 'r' && mode != 'w') { + runtime·printf("netpoll: GetQueuedCompletionStatus returned invalid mode=%d\n", mode); + runtime·throw("netpoll: GetQueuedCompletionStatus returned invalid 
mode"); + } + op->errno = errno; + op->qty = qty; + runtime·netpollready(gpp, op->pd, mode); +} diff --git a/src/runtime/noasm_arm.go b/src/runtime/noasm_arm.go new file mode 100644 index 000000000..dd3ef8267 --- /dev/null +++ b/src/runtime/noasm_arm.go @@ -0,0 +1,54 @@ +// Copyright 2013 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// Routines that are implemented in assembly in asm_{amd64,386}.s +// but are implemented in Go for arm. + +package runtime + +func cmpstring(s1, s2 string) int { + l := len(s1) + if len(s2) < l { + l = len(s2) + } + for i := 0; i < l; i++ { + c1, c2 := s1[i], s2[i] + if c1 < c2 { + return -1 + } + if c1 > c2 { + return +1 + } + } + if len(s1) < len(s2) { + return -1 + } + if len(s1) > len(s2) { + return +1 + } + return 0 +} + +func cmpbytes(s1, s2 []byte) int { + l := len(s1) + if len(s2) < l { + l = len(s2) + } + for i := 0; i < l; i++ { + c1, c2 := s1[i], s2[i] + if c1 < c2 { + return -1 + } + if c1 > c2 { + return +1 + } + } + if len(s1) < len(s2) { + return -1 + } + if len(s1) > len(s2) { + return +1 + } + return 0 +} diff --git a/src/runtime/norace_test.go b/src/runtime/norace_test.go new file mode 100644 index 000000000..3b171877a --- /dev/null +++ b/src/runtime/norace_test.go @@ -0,0 +1,46 @@ +// Copyright 2013 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// The file contains tests that can not run under race detector for some reason. +// +build !race + +package runtime_test + +import ( + "runtime" + "testing" +) + +// Syscall tests split stack between Entersyscall and Exitsyscall under race detector. +func BenchmarkSyscall(b *testing.B) { + benchmarkSyscall(b, 0, 1) +} + +func BenchmarkSyscallWork(b *testing.B) { + benchmarkSyscall(b, 100, 1) +} + +func BenchmarkSyscallExcess(b *testing.B) { + benchmarkSyscall(b, 0, 4) +} + +func BenchmarkSyscallExcessWork(b *testing.B) { + benchmarkSyscall(b, 100, 4) +} + +func benchmarkSyscall(b *testing.B, work, excess int) { + b.SetParallelism(excess) + b.RunParallel(func(pb *testing.PB) { + foo := 42 + for pb.Next() { + runtime.Entersyscall() + for i := 0; i < work; i++ { + foo *= 2 + foo /= 2 + } + runtime.Exitsyscall() + } + _ = foo + }) +} diff --git a/src/runtime/os_android.c b/src/runtime/os_android.c new file mode 100644 index 000000000..5805f6871 --- /dev/null +++ b/src/runtime/os_android.c @@ -0,0 +1,16 @@ +// Copyright 2014 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#include "runtime.h" +#include "defs_GOOS_GOARCH.h" +#include "os_GOOS.h" + +// Export the runtime entry point symbol. +// +// Used by the app package to start the Go runtime after loading +// a shared library via JNI. See golang.org/x/mobile/app. + +void _rt0_arm_linux1(); +#pragma cgo_export_static _rt0_arm_linux1 +#pragma cgo_export_dynamic _rt0_arm_linux1 diff --git a/src/runtime/os_android.h b/src/runtime/os_android.h new file mode 100644 index 000000000..c7c1098e8 --- /dev/null +++ b/src/runtime/os_android.h @@ -0,0 +1 @@ +#include "os_linux.h" diff --git a/src/runtime/os_darwin.c b/src/runtime/os_darwin.c new file mode 100644 index 000000000..bbd29282b --- /dev/null +++ b/src/runtime/os_darwin.c @@ -0,0 +1,567 @@ +// Copyright 2009 The Go Authors. All rights reserved. 
+// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#include "runtime.h" +#include "defs_GOOS_GOARCH.h" +#include "os_GOOS.h" +#include "signal_unix.h" +#include "stack.h" +#include "textflag.h" + +extern SigTab runtime·sigtab[]; + +static Sigset sigset_none; +static Sigset sigset_all = ~(Sigset)0; + +static void +unimplemented(int8 *name) +{ + runtime·prints(name); + runtime·prints(" not implemented\n"); + *(int32*)1231 = 1231; +} + +#pragma textflag NOSPLIT +void +runtime·semawakeup(M *mp) +{ + runtime·mach_semrelease(mp->waitsema); +} + +static void +semacreate(void) +{ + g->m->scalararg[0] = runtime·mach_semcreate(); +} + +#pragma textflag NOSPLIT +uintptr +runtime·semacreate(void) +{ + uintptr x; + void (*fn)(void); + + fn = semacreate; + runtime·onM(&fn); + x = g->m->scalararg[0]; + g->m->scalararg[0] = 0; + return x; +} + +// BSD interface for threading. +void +runtime·osinit(void) +{ + // bsdthread_register delayed until end of goenvs so that we + // can look at the environment first. + + // Use sysctl to fetch hw.ncpu. + uint32 mib[2]; + uint32 out; + int32 ret; + uintptr nout; + + mib[0] = 6; + mib[1] = 3; + nout = sizeof out; + out = 0; + ret = runtime·sysctl(mib, 2, (byte*)&out, &nout, nil, 0); + if(ret >= 0) + runtime·ncpu = out; +} + +#pragma textflag NOSPLIT +void +runtime·get_random_data(byte **rnd, int32 *rnd_len) +{ + #pragma dataflag NOPTR + static byte urandom_data[HashRandomBytes]; + int32 fd; + fd = runtime·open("/dev/urandom", 0 /* O_RDONLY */, 0); + if(runtime·read(fd, urandom_data, HashRandomBytes) == HashRandomBytes) { + *rnd = urandom_data; + *rnd_len = HashRandomBytes; + } else { + *rnd = nil; + *rnd_len = 0; + } + runtime·close(fd); +} + +void +runtime·goenvs(void) +{ + runtime·goenvs_unix(); + + // Register our thread-creation callback (see sys_darwin_{amd64,386}.s) + // but only if we're not using cgo. If we are using cgo we need + // to let the C pthread library install its own thread-creation callback. + if(!runtime·iscgo) { + if(runtime·bsdthread_register() != 0) { + if(runtime·getenv("DYLD_INSERT_LIBRARIES")) + runtime·throw("runtime: bsdthread_register error (unset DYLD_INSERT_LIBRARIES)"); + runtime·throw("runtime: bsdthread_register error"); + } + } + +} + +void +runtime·newosproc(M *mp, void *stk) +{ + int32 errno; + Sigset oset; + + mp->tls[0] = mp->id; // so 386 asm can find it + if(0){ + runtime·printf("newosproc stk=%p m=%p g=%p id=%d/%d ostk=%p\n", + stk, mp, mp->g0, mp->id, (int32)mp->tls[0], &mp); + } + + runtime·sigprocmask(SIG_SETMASK, &sigset_all, &oset); + errno = runtime·bsdthread_create(stk, mp, mp->g0, runtime·mstart); + runtime·sigprocmask(SIG_SETMASK, &oset, nil); + + if(errno < 0) { + runtime·printf("runtime: failed to create new OS thread (have %d already; errno=%d)\n", runtime·mcount(), -errno); + runtime·throw("runtime.newosproc"); + } +} + +// Called to initialize a new m (including the bootstrap m). +// Called on the parent thread (main thread in case of bootstrap), can allocate memory. +void +runtime·mpreinit(M *mp) +{ + mp->gsignal = runtime·malg(32*1024); // OS X wants >=8K, Linux >=2K + mp->gsignal->m = mp; +} + +// Called to initialize a new m (including the bootstrap m). +// Called on the new thread, can not allocate memory. +void +runtime·minit(void) +{ + // Initialize signal handling. + runtime·signalstack((byte*)g->m->gsignal->stack.lo, 32*1024); + + runtime·sigprocmask(SIG_SETMASK, &sigset_none, nil); +} + +// Called from dropm to undo the effect of an minit. 
+void +runtime·unminit(void) +{ + runtime·signalstack(nil, 0); +} + +// Mach IPC, to get at semaphores +// Definitions are in /usr/include/mach on a Mac. + +static void +macherror(int32 r, int8 *fn) +{ + runtime·prints("mach error "); + runtime·prints(fn); + runtime·prints(": "); + runtime·printint(r); + runtime·prints("\n"); + runtime·throw("mach error"); +} + +enum +{ + DebugMach = 0 +}; + +static MachNDR zerondr; + +#define MACH_MSGH_BITS(a, b) ((a) | ((b)<<8)) + +static int32 +mach_msg(MachHeader *h, + int32 op, + uint32 send_size, + uint32 rcv_size, + uint32 rcv_name, + uint32 timeout, + uint32 notify) +{ + // TODO: Loop on interrupt. + return runtime·mach_msg_trap(h, op, send_size, rcv_size, rcv_name, timeout, notify); +} + +// Mach RPC (MIG) + +enum +{ + MinMachMsg = 48, + Reply = 100, +}; + +#pragma pack on +typedef struct CodeMsg CodeMsg; +struct CodeMsg +{ + MachHeader h; + MachNDR NDR; + int32 code; +}; +#pragma pack off + +static int32 +machcall(MachHeader *h, int32 maxsize, int32 rxsize) +{ + uint32 *p; + int32 i, ret, id; + uint32 port; + CodeMsg *c; + + if((port = g->m->machport) == 0){ + port = runtime·mach_reply_port(); + g->m->machport = port; + } + + h->msgh_bits |= MACH_MSGH_BITS(MACH_MSG_TYPE_COPY_SEND, MACH_MSG_TYPE_MAKE_SEND_ONCE); + h->msgh_local_port = port; + h->msgh_reserved = 0; + id = h->msgh_id; + + if(DebugMach){ + p = (uint32*)h; + runtime·prints("send:\t"); + for(i=0; i<h->msgh_size/sizeof(p[0]); i++){ + runtime·prints(" "); + runtime·printpointer((void*)p[i]); + if(i%8 == 7) + runtime·prints("\n\t"); + } + if(i%8) + runtime·prints("\n"); + } + + ret = mach_msg(h, MACH_SEND_MSG|MACH_RCV_MSG, + h->msgh_size, maxsize, port, 0, 0); + if(ret != 0){ + if(DebugMach){ + runtime·prints("mach_msg error "); + runtime·printint(ret); + runtime·prints("\n"); + } + return ret; + } + + if(DebugMach){ + p = (uint32*)h; + runtime·prints("recv:\t"); + for(i=0; i<h->msgh_size/sizeof(p[0]); i++){ + runtime·prints(" "); + runtime·printpointer((void*)p[i]); + if(i%8 == 7) + runtime·prints("\n\t"); + } + if(i%8) + runtime·prints("\n"); + } + + if(h->msgh_id != id+Reply){ + if(DebugMach){ + runtime·prints("mach_msg reply id mismatch "); + runtime·printint(h->msgh_id); + runtime·prints(" != "); + runtime·printint(id+Reply); + runtime·prints("\n"); + } + return -303; // MIG_REPLY_MISMATCH + } + + // Look for a response giving the return value. + // Any call can send this back with an error, + // and some calls only have return values so they + // send it back on success too. I don't quite see how + // you know it's one of these and not the full response + // format, so just look if the message is right. + c = (CodeMsg*)h; + if(h->msgh_size == sizeof(CodeMsg) + && !(h->msgh_bits & MACH_MSGH_BITS_COMPLEX)){ + if(DebugMach){ + runtime·prints("mig result "); + runtime·printint(c->code); + runtime·prints("\n"); + } + return c->code; + } + + if(h->msgh_size != rxsize){ + if(DebugMach){ + runtime·prints("mach_msg reply size mismatch "); + runtime·printint(h->msgh_size); + runtime·prints(" != "); + runtime·printint(rxsize); + runtime·prints("\n"); + } + return -307; // MIG_ARRAY_TOO_LARGE + } + + return 0; +} + + +// Semaphores! + +enum +{ + Tmach_semcreate = 3418, + Rmach_semcreate = Tmach_semcreate + Reply, + + Tmach_semdestroy = 3419, + Rmach_semdestroy = Tmach_semdestroy + Reply, + + // Mach calls that get interrupted by Unix signals + // return this error code. We retry them. 
+ KERN_ABORTED = 14, + KERN_OPERATION_TIMED_OUT = 49, +}; + +typedef struct Tmach_semcreateMsg Tmach_semcreateMsg; +typedef struct Rmach_semcreateMsg Rmach_semcreateMsg; +typedef struct Tmach_semdestroyMsg Tmach_semdestroyMsg; +// Rmach_semdestroyMsg = CodeMsg + +#pragma pack on +struct Tmach_semcreateMsg +{ + MachHeader h; + MachNDR ndr; + int32 policy; + int32 value; +}; + +struct Rmach_semcreateMsg +{ + MachHeader h; + MachBody body; + MachPort semaphore; +}; + +struct Tmach_semdestroyMsg +{ + MachHeader h; + MachBody body; + MachPort semaphore; +}; +#pragma pack off + +uint32 +runtime·mach_semcreate(void) +{ + union { + Tmach_semcreateMsg tx; + Rmach_semcreateMsg rx; + uint8 pad[MinMachMsg]; + } m; + int32 r; + + m.tx.h.msgh_bits = 0; + m.tx.h.msgh_size = sizeof(m.tx); + m.tx.h.msgh_remote_port = runtime·mach_task_self(); + m.tx.h.msgh_id = Tmach_semcreate; + m.tx.ndr = zerondr; + + m.tx.policy = 0; // 0 = SYNC_POLICY_FIFO + m.tx.value = 0; + + while((r = machcall(&m.tx.h, sizeof m, sizeof(m.rx))) != 0){ + if(r == KERN_ABORTED) // interrupted + continue; + macherror(r, "semaphore_create"); + } + if(m.rx.body.msgh_descriptor_count != 1) + unimplemented("mach_semcreate desc count"); + return m.rx.semaphore.name; +} + +void +runtime·mach_semdestroy(uint32 sem) +{ + union { + Tmach_semdestroyMsg tx; + uint8 pad[MinMachMsg]; + } m; + int32 r; + + m.tx.h.msgh_bits = MACH_MSGH_BITS_COMPLEX; + m.tx.h.msgh_size = sizeof(m.tx); + m.tx.h.msgh_remote_port = runtime·mach_task_self(); + m.tx.h.msgh_id = Tmach_semdestroy; + m.tx.body.msgh_descriptor_count = 1; + m.tx.semaphore.name = sem; + m.tx.semaphore.disposition = MACH_MSG_TYPE_MOVE_SEND; + m.tx.semaphore.type = 0; + + while((r = machcall(&m.tx.h, sizeof m, 0)) != 0){ + if(r == KERN_ABORTED) // interrupted + continue; + macherror(r, "semaphore_destroy"); + } +} + +// The other calls have simple system call traps in sys_darwin_{amd64,386}.s +int32 runtime·mach_semaphore_wait(uint32 sema); +int32 runtime·mach_semaphore_timedwait(uint32 sema, uint32 sec, uint32 nsec); +int32 runtime·mach_semaphore_signal(uint32 sema); +int32 runtime·mach_semaphore_signal_all(uint32 sema); + +static void +semasleep(void) +{ + int32 r, secs, nsecs; + int64 ns; + + ns = (int64)(uint32)g->m->scalararg[0] | (int64)(uint32)g->m->scalararg[1]<<32; + g->m->scalararg[0] = 0; + g->m->scalararg[1] = 0; + + if(ns >= 0) { + secs = runtime·timediv(ns, 1000000000, &nsecs); + r = runtime·mach_semaphore_timedwait(g->m->waitsema, secs, nsecs); + if(r == KERN_ABORTED || r == KERN_OPERATION_TIMED_OUT) { + g->m->scalararg[0] = -1; + return; + } + if(r != 0) + macherror(r, "semaphore_wait"); + g->m->scalararg[0] = 0; + return; + } + while((r = runtime·mach_semaphore_wait(g->m->waitsema)) != 0) { + if(r == KERN_ABORTED) // interrupted + continue; + macherror(r, "semaphore_wait"); + } + g->m->scalararg[0] = 0; + return; +} + +#pragma textflag NOSPLIT +int32 +runtime·semasleep(int64 ns) +{ + int32 r; + void (*fn)(void); + + g->m->scalararg[0] = (uint32)ns; + g->m->scalararg[1] = (uint32)(ns>>32); + fn = semasleep; + runtime·onM(&fn); + r = g->m->scalararg[0]; + g->m->scalararg[0] = 0; + return r; +} + +static int32 mach_semrelease_errno; + +static void +mach_semrelease_fail(void) +{ + macherror(mach_semrelease_errno, "semaphore_signal"); +} + +#pragma textflag NOSPLIT +void +runtime·mach_semrelease(uint32 sem) +{ + int32 r; + void (*fn)(void); + + while((r = runtime·mach_semaphore_signal(sem)) != 0) { + if(r == KERN_ABORTED) // interrupted + continue; + + // mach_semrelease must be 
completely nosplit, + // because it is called from Go code. + // If we're going to die, start that process on the m stack + // to avoid a Go stack split. + // Only do that if we're actually running on the g stack. + // We might be on the gsignal stack, and if so, onM will abort. + // We use the global variable instead of scalararg because + // we might be on the gsignal stack, having interrupted a + // normal call to onM. It doesn't quite matter, since the + // program is about to die, but better to be clean. + mach_semrelease_errno = r; + fn = mach_semrelease_fail; + if(g == g->m->curg) + runtime·onM(&fn); + else + fn(); + } +} + +#pragma textflag NOSPLIT +void +runtime·osyield(void) +{ + runtime·usleep(1); +} + +uintptr +runtime·memlimit(void) +{ + // NOTE(rsc): Could use getrlimit here, + // like on FreeBSD or Linux, but Darwin doesn't enforce + // ulimit -v, so it's unclear why we'd try to stay within + // the limit. + return 0; +} + +void +runtime·setsig(int32 i, GoSighandler *fn, bool restart) +{ + SigactionT sa; + + runtime·memclr((byte*)&sa, sizeof sa); + sa.sa_flags = SA_SIGINFO|SA_ONSTACK; + if(restart) + sa.sa_flags |= SA_RESTART; + sa.sa_mask = ~(uintptr)0; + sa.sa_tramp = (void*)runtime·sigtramp; // runtime·sigtramp's job is to call into real handler + *(uintptr*)sa.__sigaction_u = (uintptr)fn; + runtime·sigaction(i, &sa, nil); +} + +GoSighandler* +runtime·getsig(int32 i) +{ + SigactionT sa; + + runtime·memclr((byte*)&sa, sizeof sa); + runtime·sigaction(i, nil, &sa); + return *(void**)sa.__sigaction_u; +} + +void +runtime·signalstack(byte *p, int32 n) +{ + StackT st; + + st.ss_sp = (void*)p; + st.ss_size = n; + st.ss_flags = 0; + if(p == nil) + st.ss_flags = SS_DISABLE; + runtime·sigaltstack(&st, nil); +} + +void +runtime·unblocksignals(void) +{ + runtime·sigprocmask(SIG_SETMASK, &sigset_none, nil); +} + +#pragma textflag NOSPLIT +int8* +runtime·signame(int32 sig) +{ + return runtime·sigtab[sig].name; +} diff --git a/src/runtime/os_darwin.go b/src/runtime/os_darwin.go new file mode 100644 index 000000000..4327ced91 --- /dev/null +++ b/src/runtime/os_darwin.go @@ -0,0 +1,24 @@ +// Copyright 2014 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package runtime + +import "unsafe" + +func bsdthread_create(stk, mm, gg, fn unsafe.Pointer) int32 +func bsdthread_register() int32 +func mach_msg_trap(h unsafe.Pointer, op int32, send_size, rcv_size, rcv_name, timeout, notify uint32) int32 +func mach_reply_port() uint32 +func mach_task_self() uint32 +func mach_thread_self() uint32 +func sysctl(mib *uint32, miblen uint32, out *byte, size *uintptr, dst *byte, ndst uintptr) int32 +func sigprocmask(sig int32, new, old unsafe.Pointer) +func sigaction(mode uint32, new, old unsafe.Pointer) +func sigaltstack(new, old unsafe.Pointer) +func sigtramp() +func setitimer(mode int32, new, old unsafe.Pointer) +func mach_semaphore_wait(sema uint32) int32 +func mach_semaphore_timedwait(sema, sec, nsec uint32) int32 +func mach_semaphore_signal(sema uint32) int32 +func mach_semaphore_signal_all(sema uint32) int32 diff --git a/src/runtime/os_darwin.h b/src/runtime/os_darwin.h new file mode 100644 index 000000000..e8bb45daf --- /dev/null +++ b/src/runtime/os_darwin.h @@ -0,0 +1,43 @@ +// Copyright 2009 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. 
+ +typedef byte* kevent_udata; + +int32 runtime·bsdthread_create(void*, M*, G*, void(*)(void)); +int32 runtime·bsdthread_register(void); +int32 runtime·mach_msg_trap(MachHeader*, int32, uint32, uint32, uint32, uint32, uint32); +uint32 runtime·mach_reply_port(void); +int32 runtime·mach_semacquire(uint32, int64); +uint32 runtime·mach_semcreate(void); +void runtime·mach_semdestroy(uint32); +void runtime·mach_semrelease(uint32); +void runtime·mach_semreset(uint32); +uint32 runtime·mach_task_self(void); +uint32 runtime·mach_task_self(void); +uint32 runtime·mach_thread_self(void); +uint32 runtime·mach_thread_self(void); +int32 runtime·sysctl(uint32*, uint32, byte*, uintptr*, byte*, uintptr); + +typedef uint32 Sigset; +void runtime·sigprocmask(int32, Sigset*, Sigset*); +void runtime·unblocksignals(void); + +struct SigactionT; +void runtime·sigaction(uintptr, struct SigactionT*, struct SigactionT*); + +struct StackT; +void runtime·sigaltstack(struct StackT*, struct StackT*); +void runtime·sigtramp(void); +void runtime·sigpanic(void); +void runtime·setitimer(int32, Itimerval*, Itimerval*); + + +enum { + NSIG = 32, + SI_USER = 0, /* empirically true, but not what headers say */ + SIG_BLOCK = 1, + SIG_UNBLOCK = 2, + SIG_SETMASK = 3, + SS_DISABLE = 4, +}; diff --git a/src/runtime/os_dragonfly.c b/src/runtime/os_dragonfly.c new file mode 100644 index 000000000..e372205ec --- /dev/null +++ b/src/runtime/os_dragonfly.c @@ -0,0 +1,312 @@ +// Copyright 2011 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#include "runtime.h" +#include "defs_GOOS_GOARCH.h" +#include "os_GOOS.h" +#include "signal_unix.h" +#include "stack.h" +#include "textflag.h" + +extern SigTab runtime·sigtab[]; +extern int32 runtime·sys_umtx_sleep(uint32*, int32, int32); +extern int32 runtime·sys_umtx_wakeup(uint32*, int32); + +// From DragonFly's <sys/sysctl.h> +#define CTL_HW 6 +#define HW_NCPU 3 + +static Sigset sigset_none; +static Sigset sigset_all = { ~(uint32)0, ~(uint32)0, ~(uint32)0, ~(uint32)0, }; + +static int32 +getncpu(void) +{ + uint32 mib[2]; + uint32 out; + int32 ret; + uintptr nout; + + // Fetch hw.ncpu via sysctl. + mib[0] = CTL_HW; + mib[1] = HW_NCPU; + nout = sizeof out; + out = 0; + ret = runtime·sysctl(mib, 2, (byte*)&out, &nout, nil, 0); + if(ret >= 0) + return out; + else + return 1; +} + +static void futexsleep(void); + +#pragma textflag NOSPLIT +void +runtime·futexsleep(uint32 *addr, uint32 val, int64 ns) +{ + void (*fn)(void); + + g->m->ptrarg[0] = addr; + g->m->scalararg[0] = val; + g->m->ptrarg[1] = &ns; + + fn = futexsleep; + runtime·onM(&fn); +} + +static void +futexsleep(void) +{ + uint32 *addr; + uint32 val; + int64 ns; + int32 timeout = 0; + int32 ret; + + addr = g->m->ptrarg[0]; + val = g->m->scalararg[0]; + ns = *(int64*)g->m->ptrarg[1]; + g->m->ptrarg[0] = nil; + g->m->scalararg[0] = 0; + g->m->ptrarg[1] = nil; + + if(ns >= 0) { + // The timeout is specified in microseconds - ensure that we + // do not end up dividing to zero, which would put us to sleep + // indefinitely... + timeout = runtime·timediv(ns, 1000, nil); + if(timeout == 0) + timeout = 1; + } + + // sys_umtx_sleep will return EWOULDBLOCK (EAGAIN) when the timeout + // expires or EBUSY if the mutex value does not match. 
+ ret = runtime·sys_umtx_sleep(addr, val, timeout); + if(ret >= 0 || ret == -EINTR || ret == -EAGAIN || ret == -EBUSY) + return; + + runtime·prints("umtx_wait addr="); + runtime·printpointer(addr); + runtime·prints(" val="); + runtime·printint(val); + runtime·prints(" ret="); + runtime·printint(ret); + runtime·prints("\n"); + *(int32*)0x1005 = 0x1005; +} + +static void badfutexwakeup(void); + +#pragma textflag NOSPLIT +void +runtime·futexwakeup(uint32 *addr, uint32 cnt) +{ + int32 ret; + void (*fn)(void); + + ret = runtime·sys_umtx_wakeup(addr, cnt); + if(ret >= 0) + return; + + g->m->ptrarg[0] = addr; + g->m->scalararg[0] = ret; + fn = badfutexwakeup; + if(g == g->m->gsignal) + fn(); + else + runtime·onM(&fn); + *(int32*)0x1006 = 0x1006; +} + +static void +badfutexwakeup(void) +{ + void *addr; + int32 ret; + + addr = g->m->ptrarg[0]; + ret = g->m->scalararg[0]; + runtime·printf("umtx_wake addr=%p ret=%d\n", addr, ret); +} + +void runtime·lwp_start(void*); + +void +runtime·newosproc(M *mp, void *stk) +{ + Lwpparams params; + Sigset oset; + + if(0){ + runtime·printf("newosproc stk=%p m=%p g=%p id=%d/%d ostk=%p\n", + stk, mp, mp->g0, mp->id, (int32)mp->tls[0], &mp); + } + + runtime·sigprocmask(&sigset_all, &oset); + runtime·memclr((byte*)¶ms, sizeof params); + + params.func = runtime·lwp_start; + params.arg = (byte*)mp; + params.stack = (byte*)stk; + params.tid1 = (int32*)&mp->procid; + params.tid2 = nil; + + mp->tls[0] = mp->id; // so 386 asm can find it + + runtime·lwp_create(¶ms); + runtime·sigprocmask(&oset, nil); +} + +void +runtime·osinit(void) +{ + runtime·ncpu = getncpu(); +} + +#pragma textflag NOSPLIT +void +runtime·get_random_data(byte **rnd, int32 *rnd_len) +{ + #pragma dataflag NOPTR + static byte urandom_data[HashRandomBytes]; + int32 fd; + fd = runtime·open("/dev/urandom", 0 /* O_RDONLY */, 0); + if(runtime·read(fd, urandom_data, HashRandomBytes) == HashRandomBytes) { + *rnd = urandom_data; + *rnd_len = HashRandomBytes; + } else { + *rnd = nil; + *rnd_len = 0; + } + runtime·close(fd); +} + +void +runtime·goenvs(void) +{ + runtime·goenvs_unix(); +} + +// Called to initialize a new m (including the bootstrap m). +// Called on the parent thread (main thread in case of bootstrap), can allocate memory. +void +runtime·mpreinit(M *mp) +{ + mp->gsignal = runtime·malg(32*1024); + mp->gsignal->m = mp; +} + +// Called to initialize a new m (including the bootstrap m). +// Called on the new thread, can not allocate memory. +void +runtime·minit(void) +{ + // Initialize signal handling + runtime·signalstack((byte*)g->m->gsignal->stack.lo, 32*1024); + runtime·sigprocmask(&sigset_none, nil); +} + +// Called from dropm to undo the effect of an minit. +void +runtime·unminit(void) +{ + runtime·signalstack(nil, 0); +} + +uintptr +runtime·memlimit(void) +{ + Rlimit rl; + extern byte runtime·text[], runtime·end[]; + uintptr used; + + if(runtime·getrlimit(RLIMIT_AS, &rl) != 0) + return 0; + if(rl.rlim_cur >= 0x7fffffff) + return 0; + + // Estimate our VM footprint excluding the heap. + // Not an exact science: use size of binary plus + // some room for thread stacks. + used = runtime·end - runtime·text + (64<<20); + if(used >= rl.rlim_cur) + return 0; + + // If there's not at least 16 MB left, we're probably + // not going to be able to do much. Treat as no limit. 
+ rl.rlim_cur -= used; + if(rl.rlim_cur < (16<<20)) + return 0; + + return rl.rlim_cur - used; +} + +extern void runtime·sigtramp(void); + +typedef struct sigaction { + union { + void (*__sa_handler)(int32); + void (*__sa_sigaction)(int32, Siginfo*, void *); + } __sigaction_u; /* signal handler */ + int32 sa_flags; /* see signal options below */ + Sigset sa_mask; /* signal mask to apply */ +} SigactionT; + +void +runtime·setsig(int32 i, GoSighandler *fn, bool restart) +{ + SigactionT sa; + + runtime·memclr((byte*)&sa, sizeof sa); + sa.sa_flags = SA_SIGINFO|SA_ONSTACK; + if(restart) + sa.sa_flags |= SA_RESTART; + sa.sa_mask.__bits[0] = ~(uint32)0; + sa.sa_mask.__bits[1] = ~(uint32)0; + sa.sa_mask.__bits[2] = ~(uint32)0; + sa.sa_mask.__bits[3] = ~(uint32)0; + if(fn == runtime·sighandler) + fn = (void*)runtime·sigtramp; + sa.__sigaction_u.__sa_sigaction = (void*)fn; + runtime·sigaction(i, &sa, nil); +} + +GoSighandler* +runtime·getsig(int32 i) +{ + SigactionT sa; + + runtime·memclr((byte*)&sa, sizeof sa); + runtime·sigaction(i, nil, &sa); + if((void*)sa.__sigaction_u.__sa_sigaction == runtime·sigtramp) + return runtime·sighandler; + return (void*)sa.__sigaction_u.__sa_sigaction; +} + +void +runtime·signalstack(byte *p, int32 n) +{ + StackT st; + + st.ss_sp = (void*)p; + st.ss_size = n; + st.ss_flags = 0; + if(p == nil) + st.ss_flags = SS_DISABLE; + runtime·sigaltstack(&st, nil); +} + +void +runtime·unblocksignals(void) +{ + runtime·sigprocmask(&sigset_none, nil); +} + +#pragma textflag NOSPLIT +int8* +runtime·signame(int32 sig) +{ + return runtime·sigtab[sig].name; +} diff --git a/src/runtime/os_dragonfly.go b/src/runtime/os_dragonfly.go new file mode 100644 index 000000000..cdaa06986 --- /dev/null +++ b/src/runtime/os_dragonfly.go @@ -0,0 +1,20 @@ +// Copyright 2014 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package runtime + +import "unsafe" + +func lwp_create(param unsafe.Pointer) int32 +func sigaltstack(new, old unsafe.Pointer) +func sigaction(sig int32, new, old unsafe.Pointer) +func sigprocmask(new, old unsafe.Pointer) +func setitimer(mode int32, new, old unsafe.Pointer) +func sysctl(mib *uint32, miblen uint32, out *byte, size *uintptr, dst *byte, ndst uintptr) int32 +func getrlimit(kind int32, limit unsafe.Pointer) int32 +func raise(sig int32) +func sys_umtx_sleep(addr unsafe.Pointer, val, timeout int32) int32 +func sys_umtx_wakeup(addr unsafe.Pointer, val int32) int32 + +const stackSystem = 0 diff --git a/src/runtime/os_dragonfly.h b/src/runtime/os_dragonfly.h new file mode 100644 index 000000000..389736a32 --- /dev/null +++ b/src/runtime/os_dragonfly.h @@ -0,0 +1,30 @@ +// Copyright 2011 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. 
+ + +typedef byte* kevent_udata; + +int32 runtime·lwp_create(Lwpparams*); +void runtime·sigpanic(void); +void runtime·sigaltstack(SigaltstackT*, SigaltstackT*); +struct sigaction; +void runtime·sigaction(int32, struct sigaction*, struct sigaction*); +void runtime·sigprocmask(Sigset *, Sigset *); +void runtime·unblocksignals(void); +void runtime·setitimer(int32, Itimerval*, Itimerval*); +int32 runtime·sysctl(uint32*, uint32, byte*, uintptr*, byte*, uintptr); + +enum { + NSIG = 33, + SI_USER = 0x10001, + SS_DISABLE = 4, + RLIMIT_AS = 10, +}; + +typedef struct Rlimit Rlimit; +struct Rlimit { + int64 rlim_cur; + int64 rlim_max; +}; +int32 runtime·getrlimit(int32, Rlimit*); diff --git a/src/runtime/os_freebsd.c b/src/runtime/os_freebsd.c new file mode 100644 index 000000000..a513cb604 --- /dev/null +++ b/src/runtime/os_freebsd.c @@ -0,0 +1,320 @@ +// Copyright 2011 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#include "runtime.h" +#include "defs_GOOS_GOARCH.h" +#include "os_GOOS.h" +#include "signal_unix.h" +#include "stack.h" +#include "textflag.h" + +extern SigTab runtime·sigtab[]; +extern int32 runtime·sys_umtx_op(uint32*, int32, uint32, void*, void*); + +// From FreeBSD's <sys/sysctl.h> +#define CTL_HW 6 +#define HW_NCPU 3 + +static Sigset sigset_none; +static Sigset sigset_all = { ~(uint32)0, ~(uint32)0, ~(uint32)0, ~(uint32)0, }; + +static int32 +getncpu(void) +{ + uint32 mib[2]; + uint32 out; + int32 ret; + uintptr nout; + + // Fetch hw.ncpu via sysctl. + mib[0] = CTL_HW; + mib[1] = HW_NCPU; + nout = sizeof out; + out = 0; + ret = runtime·sysctl(mib, 2, (byte*)&out, &nout, nil, 0); + if(ret >= 0) + return out; + else + return 1; +} + +// FreeBSD's umtx_op syscall is effectively the same as Linux's futex, and +// thus the code is largely similar. See linux/thread.c and lock_futex.c for comments. + +static void futexsleep(void); + +#pragma textflag NOSPLIT +void +runtime·futexsleep(uint32 *addr, uint32 val, int64 ns) +{ + void (*fn)(void); + + g->m->ptrarg[0] = addr; + g->m->scalararg[0] = val; + g->m->ptrarg[1] = &ns; + + fn = futexsleep; + runtime·onM(&fn); +} + +static void +futexsleep(void) +{ + uint32 *addr; + uint32 val; + int64 ns; + int32 ret; + Timespec ts; + + addr = g->m->ptrarg[0]; + val = g->m->scalararg[0]; + ns = *(int64*)g->m->ptrarg[1]; + g->m->ptrarg[0] = nil; + g->m->scalararg[0] = 0; + g->m->ptrarg[1] = nil; + + if(ns < 0) { + ret = runtime·sys_umtx_op(addr, UMTX_OP_WAIT_UINT_PRIVATE, val, nil, nil); + if(ret >= 0 || ret == -EINTR) + return; + goto fail; + } + // NOTE: tv_nsec is int64 on amd64, so this assumes a little-endian system. 
+ ts.tv_nsec = 0; + ts.tv_sec = runtime·timediv(ns, 1000000000, (int32*)&ts.tv_nsec); + ret = runtime·sys_umtx_op(addr, UMTX_OP_WAIT_UINT_PRIVATE, val, nil, &ts); + if(ret >= 0 || ret == -EINTR) + return; + +fail: + runtime·prints("umtx_wait addr="); + runtime·printpointer(addr); + runtime·prints(" val="); + runtime·printint(val); + runtime·prints(" ret="); + runtime·printint(ret); + runtime·prints("\n"); + *(int32*)0x1005 = 0x1005; +} + +static void badfutexwakeup(void); + +#pragma textflag NOSPLIT +void +runtime·futexwakeup(uint32 *addr, uint32 cnt) +{ + int32 ret; + void (*fn)(void); + + ret = runtime·sys_umtx_op(addr, UMTX_OP_WAKE_PRIVATE, cnt, nil, nil); + if(ret >= 0) + return; + + g->m->ptrarg[0] = addr; + g->m->scalararg[0] = ret; + fn = badfutexwakeup; + if(g == g->m->gsignal) + fn(); + else + runtime·onM(&fn); + *(int32*)0x1006 = 0x1006; +} + +static void +badfutexwakeup(void) +{ + void *addr; + int32 ret; + + addr = g->m->ptrarg[0]; + ret = g->m->scalararg[0]; + runtime·printf("umtx_wake addr=%p ret=%d\n", addr, ret); +} + +void runtime·thr_start(void*); + +void +runtime·newosproc(M *mp, void *stk) +{ + ThrParam param; + Sigset oset; + + if(0){ + runtime·printf("newosproc stk=%p m=%p g=%p id=%d/%d ostk=%p\n", + stk, mp, mp->g0, mp->id, (int32)mp->tls[0], &mp); + } + + runtime·sigprocmask(&sigset_all, &oset); + runtime·memclr((byte*)¶m, sizeof param); + + param.start_func = runtime·thr_start; + param.arg = (byte*)mp; + + // NOTE(rsc): This code is confused. stackbase is the top of the stack + // and is equal to stk. However, it's working, so I'm not changing it. + param.stack_base = (void*)mp->g0->stack.hi; + param.stack_size = (byte*)stk - (byte*)mp->g0->stack.hi; + + param.child_tid = (void*)&mp->procid; + param.parent_tid = nil; + param.tls_base = (void*)&mp->tls[0]; + param.tls_size = sizeof mp->tls; + + mp->tls[0] = mp->id; // so 386 asm can find it + + runtime·thr_new(¶m, sizeof param); + runtime·sigprocmask(&oset, nil); +} + +void +runtime·osinit(void) +{ + runtime·ncpu = getncpu(); +} + +#pragma textflag NOSPLIT +void +runtime·get_random_data(byte **rnd, int32 *rnd_len) +{ + #pragma dataflag NOPTR + static byte urandom_data[HashRandomBytes]; + int32 fd; + fd = runtime·open("/dev/urandom", 0 /* O_RDONLY */, 0); + if(runtime·read(fd, urandom_data, HashRandomBytes) == HashRandomBytes) { + *rnd = urandom_data; + *rnd_len = HashRandomBytes; + } else { + *rnd = nil; + *rnd_len = 0; + } + runtime·close(fd); +} + +void +runtime·goenvs(void) +{ + runtime·goenvs_unix(); +} + +// Called to initialize a new m (including the bootstrap m). +// Called on the parent thread (main thread in case of bootstrap), can allocate memory. +void +runtime·mpreinit(M *mp) +{ + mp->gsignal = runtime·malg(32*1024); + mp->gsignal->m = mp; +} + +// Called to initialize a new m (including the bootstrap m). +// Called on the new thread, can not allocate memory. +void +runtime·minit(void) +{ + // Initialize signal handling + runtime·signalstack((byte*)g->m->gsignal->stack.lo, 32*1024); + runtime·sigprocmask(&sigset_none, nil); +} + +// Called from dropm to undo the effect of an minit. +void +runtime·unminit(void) +{ + runtime·signalstack(nil, 0); +} + +uintptr +runtime·memlimit(void) +{ + Rlimit rl; + extern byte runtime·text[], runtime·end[]; + uintptr used; + + if(runtime·getrlimit(RLIMIT_AS, &rl) != 0) + return 0; + if(rl.rlim_cur >= 0x7fffffff) + return 0; + + // Estimate our VM footprint excluding the heap. + // Not an exact science: use size of binary plus + // some room for thread stacks. 
+ used = runtime·end - runtime·text + (64<<20); + if(used >= rl.rlim_cur) + return 0; + + // If there's not at least 16 MB left, we're probably + // not going to be able to do much. Treat as no limit. + rl.rlim_cur -= used; + if(rl.rlim_cur < (16<<20)) + return 0; + + return rl.rlim_cur - used; +} + +extern void runtime·sigtramp(void); + +typedef struct sigaction { + union { + void (*__sa_handler)(int32); + void (*__sa_sigaction)(int32, Siginfo*, void *); + } __sigaction_u; /* signal handler */ + int32 sa_flags; /* see signal options below */ + Sigset sa_mask; /* signal mask to apply */ +} SigactionT; + +void +runtime·setsig(int32 i, GoSighandler *fn, bool restart) +{ + SigactionT sa; + + runtime·memclr((byte*)&sa, sizeof sa); + sa.sa_flags = SA_SIGINFO|SA_ONSTACK; + if(restart) + sa.sa_flags |= SA_RESTART; + sa.sa_mask.__bits[0] = ~(uint32)0; + sa.sa_mask.__bits[1] = ~(uint32)0; + sa.sa_mask.__bits[2] = ~(uint32)0; + sa.sa_mask.__bits[3] = ~(uint32)0; + if(fn == runtime·sighandler) + fn = (void*)runtime·sigtramp; + sa.__sigaction_u.__sa_sigaction = (void*)fn; + runtime·sigaction(i, &sa, nil); +} + +GoSighandler* +runtime·getsig(int32 i) +{ + SigactionT sa; + + runtime·memclr((byte*)&sa, sizeof sa); + runtime·sigaction(i, nil, &sa); + if((void*)sa.__sigaction_u.__sa_sigaction == runtime·sigtramp) + return runtime·sighandler; + return (void*)sa.__sigaction_u.__sa_sigaction; +} + +void +runtime·signalstack(byte *p, int32 n) +{ + StackT st; + + st.ss_sp = (void*)p; + st.ss_size = n; + st.ss_flags = 0; + if(p == nil) + st.ss_flags = SS_DISABLE; + runtime·sigaltstack(&st, nil); +} + +void +runtime·unblocksignals(void) +{ + runtime·sigprocmask(&sigset_none, nil); +} + +#pragma textflag NOSPLIT +int8* +runtime·signame(int32 sig) +{ + return runtime·sigtab[sig].name; +} diff --git a/src/runtime/os_freebsd.go b/src/runtime/os_freebsd.go new file mode 100644 index 000000000..59708049c --- /dev/null +++ b/src/runtime/os_freebsd.go @@ -0,0 +1,17 @@ +// Copyright 2014 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package runtime + +import "unsafe" + +func thr_new(param unsafe.Pointer, size int32) +func sigaltstack(new, old unsafe.Pointer) +func sigaction(sig int32, new, old unsafe.Pointer) +func sigprocmask(new, old unsafe.Pointer) +func setitimer(mode int32, new, old unsafe.Pointer) +func sysctl(mib *uint32, miblen uint32, out *byte, size *uintptr, dst *byte, ndst uintptr) int32 +func getrlimit(kind int32, limit unsafe.Pointer) int32 +func raise(sig int32) +func sys_umtx_op(addr unsafe.Pointer, mode int32, val uint32, ptr2, ts unsafe.Pointer) int32 diff --git a/src/runtime/os_freebsd.h b/src/runtime/os_freebsd.h new file mode 100644 index 000000000..b86bb393c --- /dev/null +++ b/src/runtime/os_freebsd.h @@ -0,0 +1,29 @@ +// Copyright 2011 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. 
+ +typedef byte* kevent_udata; + +int32 runtime·thr_new(ThrParam*, int32); +void runtime·sigpanic(void); +void runtime·sigaltstack(SigaltstackT*, SigaltstackT*); +struct sigaction; +void runtime·sigaction(int32, struct sigaction*, struct sigaction*); +void runtime·sigprocmask(Sigset *, Sigset *); +void runtime·unblocksignals(void); +void runtime·setitimer(int32, Itimerval*, Itimerval*); +int32 runtime·sysctl(uint32*, uint32, byte*, uintptr*, byte*, uintptr); + +enum { + SS_DISABLE = 4, + NSIG = 33, + SI_USER = 0x10001, + RLIMIT_AS = 10, +}; + +typedef struct Rlimit Rlimit; +struct Rlimit { + int64 rlim_cur; + int64 rlim_max; +}; +int32 runtime·getrlimit(int32, Rlimit*); diff --git a/src/runtime/os_freebsd_arm.c b/src/runtime/os_freebsd_arm.c new file mode 100644 index 000000000..2f2d7767f --- /dev/null +++ b/src/runtime/os_freebsd_arm.c @@ -0,0 +1,24 @@ +// Copyright 2012 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#include "runtime.h" +#include "defs_GOOS_GOARCH.h" +#include "os_GOOS.h" +#include "textflag.h" + +void +runtime·checkgoarm(void) +{ + // TODO(minux) +} + +#pragma textflag NOSPLIT +int64 +runtime·cputicks(void) +{ + // Currently cputicks() is used in blocking profiler and to seed runtime·fastrand1(). + // runtime·nanotime() is a poor approximation of CPU ticks that is enough for the profiler. + // TODO: need more entropy to better seed fastrand1. + return runtime·nanotime(); +} diff --git a/src/runtime/os_linux.c b/src/runtime/os_linux.c new file mode 100644 index 000000000..0d8ffc995 --- /dev/null +++ b/src/runtime/os_linux.c @@ -0,0 +1,342 @@ +// Copyright 2009 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#include "runtime.h" +#include "defs_GOOS_GOARCH.h" +#include "os_GOOS.h" +#include "signal_unix.h" +#include "stack.h" +#include "textflag.h" + +extern SigTab runtime·sigtab[]; + +static Sigset sigset_none; +static Sigset sigset_all = { ~(uint32)0, ~(uint32)0 }; + +// Linux futex. +// +// futexsleep(uint32 *addr, uint32 val) +// futexwakeup(uint32 *addr) +// +// Futexsleep atomically checks if *addr == val and if so, sleeps on addr. +// Futexwakeup wakes up threads sleeping on addr. +// Futexsleep is allowed to wake up spuriously. + +enum +{ + FUTEX_WAIT = 0, + FUTEX_WAKE = 1, +}; + +// Atomically, +// if(*addr == val) sleep +// Might be woken up spuriously; that's allowed. +// Don't sleep longer than ns; ns < 0 means forever. +#pragma textflag NOSPLIT +void +runtime·futexsleep(uint32 *addr, uint32 val, int64 ns) +{ + Timespec ts; + + // Some Linux kernels have a bug where futex of + // FUTEX_WAIT returns an internal error code + // as an errno. Libpthread ignores the return value + // here, and so can we: as it says a few lines up, + // spurious wakeups are allowed. + + if(ns < 0) { + runtime·futex(addr, FUTEX_WAIT, val, nil, nil, 0); + return; + } + // NOTE: tv_nsec is int64 on amd64, so this assumes a little-endian system. + ts.tv_nsec = 0; + ts.tv_sec = runtime·timediv(ns, 1000000000LL, (int32*)&ts.tv_nsec); + runtime·futex(addr, FUTEX_WAIT, val, &ts, nil, 0); +} + +static void badfutexwakeup(void); + +// If any procs are sleeping on addr, wake up at most cnt. 
+#pragma textflag NOSPLIT +void +runtime·futexwakeup(uint32 *addr, uint32 cnt) +{ + int64 ret; + void (*fn)(void); + + ret = runtime·futex(addr, FUTEX_WAKE, cnt, nil, nil, 0); + if(ret >= 0) + return; + + // I don't know that futex wakeup can return + // EAGAIN or EINTR, but if it does, it would be + // safe to loop and call futex again. + g->m->ptrarg[0] = addr; + g->m->scalararg[0] = (int32)ret; // truncated but fine + fn = badfutexwakeup; + if(g == g->m->gsignal) + fn(); + else + runtime·onM(&fn); + *(int32*)0x1006 = 0x1006; +} + +static void +badfutexwakeup(void) +{ + void *addr; + int64 ret; + + addr = g->m->ptrarg[0]; + ret = (int32)g->m->scalararg[0]; + runtime·printf("futexwakeup addr=%p returned %D\n", addr, ret); +} + +extern runtime·sched_getaffinity(uintptr pid, uintptr len, uintptr *buf); +static int32 +getproccount(void) +{ + uintptr buf[16], t; + int32 r, cnt, i; + + cnt = 0; + r = runtime·sched_getaffinity(0, sizeof(buf), buf); + if(r > 0) + for(i = 0; i < r/sizeof(buf[0]); i++) { + t = buf[i]; + t = t - ((t >> 1) & 0x5555555555555555ULL); + t = (t & 0x3333333333333333ULL) + ((t >> 2) & 0x3333333333333333ULL); + cnt += (int32)((((t + (t >> 4)) & 0xF0F0F0F0F0F0F0FULL) * 0x101010101010101ULL) >> 56); + } + + return cnt ? cnt : 1; +} + +// Clone, the Linux rfork. +enum +{ + CLONE_VM = 0x100, + CLONE_FS = 0x200, + CLONE_FILES = 0x400, + CLONE_SIGHAND = 0x800, + CLONE_PTRACE = 0x2000, + CLONE_VFORK = 0x4000, + CLONE_PARENT = 0x8000, + CLONE_THREAD = 0x10000, + CLONE_NEWNS = 0x20000, + CLONE_SYSVSEM = 0x40000, + CLONE_SETTLS = 0x80000, + CLONE_PARENT_SETTID = 0x100000, + CLONE_CHILD_CLEARTID = 0x200000, + CLONE_UNTRACED = 0x800000, + CLONE_CHILD_SETTID = 0x1000000, + CLONE_STOPPED = 0x2000000, + CLONE_NEWUTS = 0x4000000, + CLONE_NEWIPC = 0x8000000, +}; + +void +runtime·newosproc(M *mp, void *stk) +{ + int32 ret; + int32 flags; + Sigset oset; + + /* + * note: strace gets confused if we use CLONE_PTRACE here. + */ + flags = CLONE_VM /* share memory */ + | CLONE_FS /* share cwd, etc */ + | CLONE_FILES /* share fd table */ + | CLONE_SIGHAND /* share sig handler table */ + | CLONE_THREAD /* revisit - okay for now */ + ; + + mp->tls[0] = mp->id; // so 386 asm can find it + if(0){ + runtime·printf("newosproc stk=%p m=%p g=%p clone=%p id=%d/%d ostk=%p\n", + stk, mp, mp->g0, runtime·clone, mp->id, (int32)mp->tls[0], &mp); + } + + // Disable signals during clone, so that the new thread starts + // with signals disabled. It will enable them in minit. + runtime·rtsigprocmask(SIG_SETMASK, &sigset_all, &oset, sizeof oset); + ret = runtime·clone(flags, stk, mp, mp->g0, runtime·mstart); + runtime·rtsigprocmask(SIG_SETMASK, &oset, nil, sizeof oset); + + if(ret < 0) { + runtime·printf("runtime: failed to create new OS thread (have %d already; errno=%d)\n", runtime·mcount(), -ret); + runtime·throw("runtime.newosproc"); + } +} + +void +runtime·osinit(void) +{ + runtime·ncpu = getproccount(); +} + +// Random bytes initialized at startup. These come +// from the ELF AT_RANDOM auxiliary vector (vdso_linux_amd64.c). 
+byte* runtime·startup_random_data; +uint32 runtime·startup_random_data_len; + +#pragma textflag NOSPLIT +void +runtime·get_random_data(byte **rnd, int32 *rnd_len) +{ + if(runtime·startup_random_data != nil) { + *rnd = runtime·startup_random_data; + *rnd_len = runtime·startup_random_data_len; + } else { + #pragma dataflag NOPTR + static byte urandom_data[HashRandomBytes]; + int32 fd; + fd = runtime·open("/dev/urandom", 0 /* O_RDONLY */, 0); + if(runtime·read(fd, urandom_data, HashRandomBytes) == HashRandomBytes) { + *rnd = urandom_data; + *rnd_len = HashRandomBytes; + } else { + *rnd = nil; + *rnd_len = 0; + } + runtime·close(fd); + } +} + +void +runtime·goenvs(void) +{ + runtime·goenvs_unix(); +} + +// Called to initialize a new m (including the bootstrap m). +// Called on the parent thread (main thread in case of bootstrap), can allocate memory. +void +runtime·mpreinit(M *mp) +{ + mp->gsignal = runtime·malg(32*1024); // OS X wants >=8K, Linux >=2K + mp->gsignal->m = mp; +} + +// Called to initialize a new m (including the bootstrap m). +// Called on the new thread, can not allocate memory. +void +runtime·minit(void) +{ + // Initialize signal handling. + runtime·signalstack((byte*)g->m->gsignal->stack.lo, 32*1024); + runtime·rtsigprocmask(SIG_SETMASK, &sigset_none, nil, sizeof(Sigset)); +} + +// Called from dropm to undo the effect of an minit. +void +runtime·unminit(void) +{ + runtime·signalstack(nil, 0); +} + +uintptr +runtime·memlimit(void) +{ + Rlimit rl; + extern byte runtime·text[], runtime·end[]; + uintptr used; + + if(runtime·getrlimit(RLIMIT_AS, &rl) != 0) + return 0; + if(rl.rlim_cur >= 0x7fffffff) + return 0; + + // Estimate our VM footprint excluding the heap. + // Not an exact science: use size of binary plus + // some room for thread stacks. + used = runtime·end - runtime·text + (64<<20); + if(used >= rl.rlim_cur) + return 0; + + // If there's not at least 16 MB left, we're probably + // not going to be able to do much. Treat as no limit. + rl.rlim_cur -= used; + if(rl.rlim_cur < (16<<20)) + return 0; + + return rl.rlim_cur; +} + +#ifdef GOARCH_386 +#define sa_handler k_sa_handler +#endif + +/* + * This assembler routine takes the args from registers, puts them on the stack, + * and calls sighandler(). + */ +extern void runtime·sigtramp(void); +extern void runtime·sigreturn(void); // calls rt_sigreturn, only used with SA_RESTORER + +void +runtime·setsig(int32 i, GoSighandler *fn, bool restart) +{ + SigactionT sa; + + runtime·memclr((byte*)&sa, sizeof sa); + sa.sa_flags = SA_ONSTACK | SA_SIGINFO | SA_RESTORER; + if(restart) + sa.sa_flags |= SA_RESTART; + sa.sa_mask = ~0ULL; + // Although Linux manpage says "sa_restorer element is obsolete and + // should not be used". x86_64 kernel requires it. Only use it on + // x86.
+#ifdef GOARCH_386 + sa.sa_restorer = (void*)runtime·sigreturn; +#endif +#ifdef GOARCH_amd64 + sa.sa_restorer = (void*)runtime·sigreturn; +#endif + if(fn == runtime·sighandler) + fn = (void*)runtime·sigtramp; + sa.sa_handler = fn; + if(runtime·rt_sigaction(i, &sa, nil, sizeof(sa.sa_mask)) != 0) + runtime·throw("rt_sigaction failure"); +} + +GoSighandler* +runtime·getsig(int32 i) +{ + SigactionT sa; + + runtime·memclr((byte*)&sa, sizeof sa); + if(runtime·rt_sigaction(i, nil, &sa, sizeof(sa.sa_mask)) != 0) + runtime·throw("rt_sigaction read failure"); + if((void*)sa.sa_handler == runtime·sigtramp) + return runtime·sighandler; + return (void*)sa.sa_handler; +} + +void +runtime·signalstack(byte *p, int32 n) +{ + SigaltstackT st; + + st.ss_sp = p; + st.ss_size = n; + st.ss_flags = 0; + if(p == nil) + st.ss_flags = SS_DISABLE; + runtime·sigaltstack(&st, nil); +} + +void +runtime·unblocksignals(void) +{ + runtime·rtsigprocmask(SIG_SETMASK, &sigset_none, nil, sizeof sigset_none); +} + +#pragma textflag NOSPLIT +int8* +runtime·signame(int32 sig) +{ + return runtime·sigtab[sig].name; +} diff --git a/src/runtime/os_linux.go b/src/runtime/os_linux.go new file mode 100644 index 000000000..41123ad57 --- /dev/null +++ b/src/runtime/os_linux.go @@ -0,0 +1,17 @@ +// Copyright 2014 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package runtime + +import "unsafe" + +func futex(addr unsafe.Pointer, op int32, val uint32, ts, addr2 unsafe.Pointer, val3 uint32) int32 +func clone(flags int32, stk, mm, gg, fn unsafe.Pointer) int32 +func rt_sigaction(sig uintptr, new, old unsafe.Pointer, size uintptr) int32 +func sigaltstack(new, old unsafe.Pointer) +func setitimer(mode int32, new, old unsafe.Pointer) +func rtsigprocmask(sig int32, new, old unsafe.Pointer, size int32) +func getrlimit(kind int32, limit unsafe.Pointer) int32 +func raise(sig int32) +func sched_getaffinity(pid, len uintptr, buf *uintptr) int32 diff --git a/src/runtime/os_linux.h b/src/runtime/os_linux.h new file mode 100644 index 000000000..75606d615 --- /dev/null +++ b/src/runtime/os_linux.h @@ -0,0 +1,41 @@ +// Copyright 2009 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + + +// Linux-specific system calls +int32 runtime·futex(uint32*, int32, uint32, Timespec*, uint32*, uint32); +int32 runtime·clone(int32, void*, M*, G*, void(*)(void)); + +struct SigactionT; +int32 runtime·rt_sigaction(uintptr, struct SigactionT*, void*, uintptr); + +void runtime·sigaltstack(SigaltstackT*, SigaltstackT*); +void runtime·sigpanic(void); +void runtime·setitimer(int32, Itimerval*, Itimerval*); + +enum { + SS_DISABLE = 2, + NSIG = 65, + SI_USER = 0, + SIG_SETMASK = 2, + RLIMIT_AS = 9, +}; + +// It's hard to tease out exactly how big a Sigset is, but +// rt_sigprocmask crashes if we get it wrong, so if binaries +// are running, this is right. +typedef struct Sigset Sigset; +struct Sigset +{ + uint32 mask[2]; +}; +void runtime·rtsigprocmask(int32, Sigset*, Sigset*, int32); +void runtime·unblocksignals(void); + +typedef struct Rlimit Rlimit; +struct Rlimit { + uintptr rlim_cur; + uintptr rlim_max; +}; +int32 runtime·getrlimit(int32, Rlimit*); diff --git a/src/runtime/os_linux_386.c b/src/runtime/os_linux_386.c new file mode 100644 index 000000000..dc89d04e2 --- /dev/null +++ b/src/runtime/os_linux_386.c @@ -0,0 +1,38 @@ +// Copyright 2009 The Go Authors. All rights reserved. 
+// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#include "runtime.h" +#include "defs_GOOS_GOARCH.h" +#include "os_GOOS.h" +#include "textflag.h" + +#define AT_NULL 0 +#define AT_RANDOM 25 +#define AT_SYSINFO 32 +extern uint32 runtime·_vdso; + +#pragma textflag NOSPLIT +void +runtime·linux_setup_vdso(int32 argc, byte **argv) +{ + byte **envp; + uint32 *auxv; + + // skip envp to get to ELF auxiliary vector. + for(envp = &argv[argc+1]; *envp != nil; envp++) + ; + envp++; + + for(auxv=(uint32*)envp; auxv[0] != AT_NULL; auxv += 2) { + if(auxv[0] == AT_SYSINFO) { + runtime·_vdso = auxv[1]; + continue; + } + if(auxv[0] == AT_RANDOM) { + runtime·startup_random_data = (byte*)auxv[1]; + runtime·startup_random_data_len = 16; + continue; + } + } +} diff --git a/src/runtime/os_linux_arm.c b/src/runtime/os_linux_arm.c new file mode 100644 index 000000000..e3eda7c2d --- /dev/null +++ b/src/runtime/os_linux_arm.c @@ -0,0 +1,80 @@ +// Copyright 2009 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#include "runtime.h" +#include "defs_GOOS_GOARCH.h" +#include "os_GOOS.h" +#include "textflag.h" + +#define AT_NULL 0 +#define AT_PLATFORM 15 // introduced in at least 2.6.11 +#define AT_HWCAP 16 // introduced in at least 2.6.11 +#define AT_RANDOM 25 // introduced in 2.6.29 +#define HWCAP_VFP (1 << 6) // introduced in at least 2.6.11 +#define HWCAP_VFPv3 (1 << 13) // introduced in 2.6.30 +static uint32 runtime·randomNumber; +uint8 runtime·armArch = 6; // we default to ARMv6 +uint32 runtime·hwcap; // set by setup_auxv +extern uint8 runtime·goarm; // set by 5l + +void +runtime·checkgoarm(void) +{ + if(runtime·goarm > 5 && !(runtime·hwcap & HWCAP_VFP)) { + runtime·printf("runtime: this CPU has no floating point hardware, so it cannot run\n"); + runtime·printf("this GOARM=%d binary. Recompile using GOARM=5.\n", runtime·goarm); + runtime·exit(1); + } + if(runtime·goarm > 6 && !(runtime·hwcap & HWCAP_VFPv3)) { + runtime·printf("runtime: this CPU has no VFPv3 floating point hardware, so it cannot run\n"); + runtime·printf("this GOARM=%d binary. Recompile using GOARM=6.\n", runtime·goarm); + runtime·exit(1); + } +} + +#pragma textflag NOSPLIT +void +runtime·setup_auxv(int32 argc, byte **argv) +{ + byte **envp; + byte *rnd; + uint32 *auxv; + uint32 t; + + // skip envp to get to ELF auxiliary vector. + for(envp = &argv[argc+1]; *envp != nil; envp++) + ; + envp++; + + for(auxv=(uint32*)envp; auxv[0] != AT_NULL; auxv += 2) { + switch(auxv[0]) { + case AT_RANDOM: // kernel provided 16-byte worth of random data + if(auxv[1]) { + rnd = (byte*)auxv[1]; + runtime·randomNumber = rnd[4] | rnd[5]<<8 | rnd[6]<<16 | rnd[7]<<24; + } + break; + case AT_PLATFORM: // v5l, v6l, v7l + if(auxv[1]) { + t = *(uint8*)(auxv[1]+1); + if(t >= '5' && t <= '7') + runtime·armArch = t - '0'; + } + break; + case AT_HWCAP: // CPU capability bit flags + runtime·hwcap = auxv[1]; + break; + } + } +} + +#pragma textflag NOSPLIT +int64 +runtime·cputicks(void) +{ + // Currently cputicks() is used in blocking profiler and to seed runtime·fastrand1(). + // runtime·nanotime() is a poor approximation of CPU ticks that is enough for the profiler. + // runtime·randomNumber provides better seeding of fastrand1. 
+ return runtime·nanotime() + runtime·randomNumber; +} diff --git a/src/runtime/os_nacl.c b/src/runtime/os_nacl.c new file mode 100644 index 000000000..14b558303 --- /dev/null +++ b/src/runtime/os_nacl.c @@ -0,0 +1,312 @@ +// Copyright 2010 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#include "runtime.h" +#include "defs_GOOS_GOARCH.h" +#include "os_GOOS.h" +#include "arch_GOARCH.h" +#include "textflag.h" +#include "stack.h" + +int8 *goos = "nacl"; +extern SigTab runtime·sigtab[]; + +void runtime·sigtramp(void); + +// Called to initialize a new m (including the bootstrap m). +// Called on the parent thread (main thread in case of bootstrap), can allocate memory. +void +runtime·mpreinit(M *mp) +{ + mp->gsignal = runtime·malg(32*1024); // OS X wants >=8K, Linux >=2K + mp->gsignal->m = mp; +} + +// Called to initialize a new m (including the bootstrap m). +// Called on the new thread, can not allocate memory. +void +runtime·minit(void) +{ + int32 ret; + + // Initialize signal handling + ret = runtime·nacl_exception_stack((byte*)g->m->gsignal->stack.lo, 32*1024); + if(ret < 0) + runtime·printf("runtime: nacl_exception_stack: error %d\n", -ret); + + ret = runtime·nacl_exception_handler(runtime·sigtramp, nil); + if(ret < 0) + runtime·printf("runtime: nacl_exception_handler: error %d\n", -ret); +} + +// Called from dropm to undo the effect of an minit. +void +runtime·unminit(void) +{ +} + +int8 runtime·sigtrampf[] = "runtime: signal at PC=%X AX=%X CX=%X DX=%X BX=%X DI=%X R15=%X *SP=%X\n"; +int8 runtime·sigtrampp[] = "runtime: sigtramp"; + +extern byte runtime·tls0[]; + +void +runtime·osinit(void) +{ + runtime·ncpu = 1; + g->m->procid = 2; +//runtime·nacl_exception_handler(runtime·sigtramp, nil); +} + +void +runtime·crash(void) +{ + *(int32*)0 = 0; +} + +#pragma textflag NOSPLIT +void +runtime·get_random_data(byte **rnd, int32 *rnd_len) +{ + *rnd = nil; + *rnd_len = 0; +} + +void +runtime·goenvs(void) +{ + runtime·goenvs_unix(); +} + +void +runtime·initsig(void) +{ +} + +#pragma textflag NOSPLIT +void +runtime·usleep(uint32 us) +{ + Timespec ts; + + ts.tv_sec = us/1000000; + ts.tv_nsec = (us%1000000)*1000; + runtime·nacl_nanosleep(&ts, nil); +} + +void runtime·mstart_nacl(void); + +void +runtime·newosproc(M *mp, void *stk) +{ + int32 ret; + void **tls; + + tls = (void**)mp->tls; + tls[0] = mp->g0; + tls[1] = mp; + ret = runtime·nacl_thread_create(runtime·mstart_nacl, stk, tls+2, 0); + if(ret < 0) { + runtime·printf("nacl_thread_create: error %d\n", -ret); + runtime·throw("newosproc"); + } +} + +static void +semacreate(void) +{ + int32 mu, cond; + + mu = runtime·nacl_mutex_create(0); + if(mu < 0) { + runtime·printf("nacl_mutex_create: error %d\n", -mu); + runtime·throw("semacreate"); + } + cond = runtime·nacl_cond_create(0); + if(cond < 0) { + runtime·printf("nacl_cond_create: error %d\n", -cond); + runtime·throw("semacreate"); + } + g->m->waitsemalock = mu; + g->m->scalararg[0] = cond; // assigned to m->waitsema +} + +#pragma textflag NOSPLIT +uint32 +runtime·semacreate(void) +{ + void (*fn)(void); + uint32 x; + + fn = semacreate; + runtime·onM(&fn); + x = g->m->scalararg[0]; + g->m->scalararg[0] = 0; + return x; +} + +static void +semasleep(void) +{ + int32 ret; + int64 ns; + + ns = (int64)(uint32)g->m->scalararg[0] | (int64)(uint32)g->m->scalararg[1]<<32; + g->m->scalararg[0] = 0; + g->m->scalararg[1] = 0; + + ret = runtime·nacl_mutex_lock(g->m->waitsemalock); + if(ret < 0) { + 
//runtime·printf("nacl_mutex_lock: error %d\n", -ret); + runtime·throw("semasleep"); + } + if(g->m->waitsemacount > 0) { + g->m->waitsemacount = 0; + runtime·nacl_mutex_unlock(g->m->waitsemalock); + g->m->scalararg[0] = 0; + return; + } + + while(g->m->waitsemacount == 0) { + if(ns < 0) { + ret = runtime·nacl_cond_wait(g->m->waitsema, g->m->waitsemalock); + if(ret < 0) { + //runtime·printf("nacl_cond_wait: error %d\n", -ret); + runtime·throw("semasleep"); + } + } else { + Timespec ts; + + ns += runtime·nanotime(); + ts.tv_sec = runtime·timediv(ns, 1000000000, (int32*)&ts.tv_nsec); + ret = runtime·nacl_cond_timed_wait_abs(g->m->waitsema, g->m->waitsemalock, &ts); + if(ret == -ETIMEDOUT) { + runtime·nacl_mutex_unlock(g->m->waitsemalock); + g->m->scalararg[0] = -1; + return; + } + if(ret < 0) { + //runtime·printf("nacl_cond_timed_wait_abs: error %d\n", -ret); + runtime·throw("semasleep"); + } + } + } + + g->m->waitsemacount = 0; + runtime·nacl_mutex_unlock(g->m->waitsemalock); + g->m->scalararg[0] = 0; +} + +#pragma textflag NOSPLIT +int32 +runtime·semasleep(int64 ns) +{ + int32 r; + void (*fn)(void); + + g->m->scalararg[0] = (uint32)ns; + g->m->scalararg[1] = (uint32)(ns>>32); + fn = semasleep; + runtime·onM(&fn); + r = g->m->scalararg[0]; + g->m->scalararg[0] = 0; + return r; +} + +static void +semawakeup(void) +{ + int32 ret; + M *mp; + + mp = g->m->ptrarg[0]; + g->m->ptrarg[0] = nil; + + ret = runtime·nacl_mutex_lock(mp->waitsemalock); + if(ret < 0) { + //runtime·printf("nacl_mutex_lock: error %d\n", -ret); + runtime·throw("semawakeup"); + } + if(mp->waitsemacount != 0) { + //runtime·printf("semawakeup: double wakeup\n"); + runtime·throw("semawakeup"); + } + mp->waitsemacount = 1; + runtime·nacl_cond_signal(mp->waitsema); + runtime·nacl_mutex_unlock(mp->waitsemalock); +} + +#pragma textflag NOSPLIT +void +runtime·semawakeup(M *mp) +{ + void (*fn)(void); + + g->m->ptrarg[0] = mp; + fn = semawakeup; + runtime·onM(&fn); +} + +uintptr +runtime·memlimit(void) +{ + runtime·printf("memlimit\n"); + return 0; +} + +#pragma dataflag NOPTR +static int8 badsignal[] = "runtime: signal received on thread not created by Go.\n"; + +// This runs on a foreign stack, without an m or a g. No stack split. +#pragma textflag NOSPLIT +void +runtime·badsignal2(void) +{ + runtime·write(2, badsignal, sizeof badsignal - 1); + runtime·exit(2); +} + +void runtime·madvise(byte*, uintptr, int32) { } +void runtime·munmap(byte*, uintptr) {} + +void +runtime·resetcpuprofiler(int32 hz) +{ + USED(hz); +} + +void +runtime·sigdisable(uint32) +{ +} + +void +runtime·sigenable(uint32) +{ +} + +void +runtime·closeonexec(int32) +{ +} + +uint32 runtime·writelock; // test-and-set spin lock for runtime.write + +/* +An attempt at IRT. Doesn't work. See end of sys_nacl_amd64.s. 
+ +void (*runtime·nacl_irt_query)(void); + +int8 runtime·nacl_irt_basic_v0_1_str[] = "nacl-irt-basic-0.1"; +void *runtime·nacl_irt_basic_v0_1[6]; // exit, gettod, clock, nanosleep, sched_yield, sysconf +int32 runtime·nacl_irt_basic_v0_1_size = sizeof(runtime·nacl_irt_basic_v0_1); + +int8 runtime·nacl_irt_memory_v0_3_str[] = "nacl-irt-memory-0.3"; +void *runtime·nacl_irt_memory_v0_3[3]; // mmap, munmap, mprotect +int32 runtime·nacl_irt_memory_v0_3_size = sizeof(runtime·nacl_irt_memory_v0_3); + +int8 runtime·nacl_irt_thread_v0_1_str[] = "nacl-irt-thread-0.1"; +void *runtime·nacl_irt_thread_v0_1[3]; // thread_create, thread_exit, thread_nice +int32 runtime·nacl_irt_thread_v0_1_size = sizeof(runtime·nacl_irt_thread_v0_1); +*/ diff --git a/src/runtime/os_nacl.go b/src/runtime/os_nacl.go new file mode 100644 index 000000000..8dd43ff06 --- /dev/null +++ b/src/runtime/os_nacl.go @@ -0,0 +1,39 @@ +// Copyright 2014 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package runtime + +import "unsafe" + +func nacl_exception_stack(p unsafe.Pointer, size int32) int32 +func nacl_exception_handler(fn, arg unsafe.Pointer) int32 +func nacl_sem_create(flag int32) int32 +func nacl_sem_wait(sem int32) int32 +func nacl_sem_post(sem int32) int32 +func nacl_mutex_create(flag int32) int32 +func nacl_mutex_lock(mutex int32) int32 +func nacl_mutex_trylock(mutex int32) int32 +func nacl_mutex_unlock(mutex int32) int32 +func nacl_cond_create(flag int32) int32 +func nacl_cond_wait(cond, n int32) int32 +func nacl_cond_signal(cond int32) int32 +func nacl_cond_broadcast(cond int32) int32 +func nacl_cond_timed_wait_abs(cond, lock int32, ts unsafe.Pointer) int32 +func nacl_thread_create(fn, stk, tls, xx unsafe.Pointer) int32 +func nacl_nanosleep(ts, extra unsafe.Pointer) int32 + +func os_sigpipe() { + gothrow("too many writes on closed pipe") +} + +func sigpanic() { + g := getg() + if !canpanic(g) { + gothrow("unexpected signal during runtime execution") + } + + // Native Client only invokes the exception handler for memory faults. + g.sig = _SIGSEGV + panicmem() +} diff --git a/src/runtime/os_nacl.h b/src/runtime/os_nacl.h new file mode 100644 index 000000000..7c9d9c242 --- /dev/null +++ b/src/runtime/os_nacl.h @@ -0,0 +1,162 @@ +enum { + NSIG = 32, + SI_USER = 1, + + // native_client/src/trusted/service_runtime/include/sys/errno.h + // The errors are mainly copied from Linux. 
+ EPERM = 1, /* Operation not permitted */ + ENOENT = 2, /* No such file or directory */ + ESRCH = 3, /* No such process */ + EINTR = 4, /* Interrupted system call */ + EIO = 5, /* I/O error */ + ENXIO = 6, /* No such device or address */ + E2BIG = 7, /* Argument list too long */ + ENOEXEC = 8, /* Exec format error */ + EBADF = 9, /* Bad file number */ + ECHILD = 10, /* No child processes */ + EAGAIN = 11, /* Try again */ + ENOMEM = 12, /* Out of memory */ + EACCES = 13, /* Permission denied */ + EFAULT = 14, /* Bad address */ + EBUSY = 16, /* Device or resource busy */ + EEXIST = 17, /* File exists */ + EXDEV = 18, /* Cross-device link */ + ENODEV = 19, /* No such device */ + ENOTDIR = 20, /* Not a directory */ + EISDIR = 21, /* Is a directory */ + EINVAL = 22, /* Invalid argument */ + ENFILE = 23, /* File table overflow */ + EMFILE = 24, /* Too many open files */ + ENOTTY = 25, /* Not a typewriter */ + EFBIG = 27, /* File too large */ + ENOSPC = 28, /* No space left on device */ + ESPIPE = 29, /* Illegal seek */ + EROFS = 30, /* Read-only file system */ + EMLINK = 31, /* Too many links */ + EPIPE = 32, /* Broken pipe */ + ENAMETOOLONG = 36, /* File name too long */ + ENOSYS = 38, /* Function not implemented */ + EDQUOT = 122, /* Quota exceeded */ + EDOM = 33, /* Math arg out of domain of func */ + ERANGE = 34, /* Math result not representable */ + EDEADLK = 35, /* Deadlock condition */ + ENOLCK = 37, /* No record locks available */ + ENOTEMPTY = 39, /* Directory not empty */ + ELOOP = 40, /* Too many symbolic links */ + ENOMSG = 42, /* No message of desired type */ + EIDRM = 43, /* Identifier removed */ + ECHRNG = 44, /* Channel number out of range */ + EL2NSYNC = 45, /* Level 2 not synchronized */ + EL3HLT = 46, /* Level 3 halted */ + EL3RST = 47, /* Level 3 reset */ + ELNRNG = 48, /* Link number out of range */ + EUNATCH = 49, /* Protocol driver not attached */ + ENOCSI = 50, /* No CSI structure available */ + EL2HLT = 51, /* Level 2 halted */ + EBADE = 52, /* Invalid exchange */ + EBADR = 53, /* Invalid request descriptor */ + EXFULL = 54, /* Exchange full */ + ENOANO = 55, /* No anode */ + EBADRQC = 56, /* Invalid request code */ + EBADSLT = 57, /* Invalid slot */ + EDEADLOCK = EDEADLK, /* File locking deadlock error */ + EBFONT = 59, /* Bad font file fmt */ + ENOSTR = 60, /* Device not a stream */ + ENODATA = 61, /* No data (for no delay io) */ + ETIME = 62, /* Timer expired */ + ENOSR = 63, /* Out of streams resources */ + ENONET = 64, /* Machine is not on the network */ + ENOPKG = 65, /* Package not installed */ + EREMOTE = 66, /* The object is remote */ + ENOLINK = 67, /* The link has been severed */ + EADV = 68, /* Advertise error */ + ESRMNT = 69, /* Srmount error */ + ECOMM = 70, /* Communication error on send */ + EPROTO = 71, /* Protocol error */ + EMULTIHOP = 72, /* Multihop attempted */ + EDOTDOT = 73, /* Cross mount point (not really error) */ + EBADMSG = 74, /* Trying to read unreadable message */ + EOVERFLOW = 75, /* Value too large for defined data type */ + ENOTUNIQ = 76, /* Given log. name not unique */ + EBADFD = 77, /* f.d. 
invalid for this operation */ + EREMCHG = 78, /* Remote address changed */ + ELIBACC = 79, /* Can't access a needed shared lib */ + ELIBBAD = 80, /* Accessing a corrupted shared lib */ + ELIBSCN = 81, /* .lib section in a.out corrupted */ + ELIBMAX = 82, /* Attempting to link in too many libs */ + ELIBEXEC = 83, /* Attempting to exec a shared library */ + EILSEQ = 84, + EUSERS = 87, + ENOTSOCK = 88, /* Socket operation on non-socket */ + EDESTADDRREQ = 89, /* Destination address required */ + EMSGSIZE = 90, /* Message too long */ + EPROTOTYPE = 91, /* Protocol wrong type for socket */ + ENOPROTOOPT = 92, /* Protocol not available */ + EPROTONOSUPPORT = 93, /* Unknown protocol */ + ESOCKTNOSUPPORT = 94, /* Socket type not supported */ + EOPNOTSUPP = 95, /* Operation not supported on transport endpoint */ + EPFNOSUPPORT = 96, /* Protocol family not supported */ + EAFNOSUPPORT = 97, /* Address family not supported by protocol family */ + EADDRINUSE = 98, /* Address already in use */ + EADDRNOTAVAIL = 99, /* Address not available */ + ENETDOWN = 100, /* Network interface is not configured */ + ENETUNREACH = 101, /* Network is unreachable */ + ENETRESET = 102, + ECONNABORTED = 103, /* Connection aborted */ + ECONNRESET = 104, /* Connection reset by peer */ + ENOBUFS = 105, /* No buffer space available */ + EISCONN = 106, /* Socket is already connected */ + ENOTCONN = 107, /* Socket is not connected */ + ESHUTDOWN = 108, /* Can't send after socket shutdown */ + ETOOMANYREFS = 109, + ETIMEDOUT = 110, /* Connection timed out */ + ECONNREFUSED = 111, /* Connection refused */ + EHOSTDOWN = 112, /* Host is down */ + EHOSTUNREACH = 113, /* Host is unreachable */ + EALREADY = 114, /* Socket already connected */ + EINPROGRESS = 115, /* Connection already in progress */ + ESTALE = 116, + ENOTSUP = EOPNOTSUPP, /* Not supported */ + ENOMEDIUM = 123, /* No medium (in tape drive) */ + ECANCELED = 125, /* Operation canceled. */ + ELBIN = 2048, /* Inode is remote (not really error) */ + EFTYPE = 2049, /* Inappropriate file type or format */ + ENMFILE = 2050, /* No more files */ + EPROCLIM = 2051, + ENOSHARE = 2052, /* No such host or network path */ + ECASECLASH = 2053, /* Filename exists with different case */ + EWOULDBLOCK = EAGAIN, /* Operation would block */ + + // native_client/src/trusted/service_runtime/include/bits/mman.h. + // NOTE: DO NOT USE native_client/src/shared/imc/nacl_imc_c.h. + // Those MAP_*values are different from these. 
+ PROT_NONE = 0x0, + PROT_READ = 0x1, + PROT_WRITE = 0x2, + PROT_EXEC = 0x4, + + MAP_SHARED = 0x1, + MAP_PRIVATE = 0x2, + MAP_FIXED = 0x10, + MAP_ANON = 0x20, +}; +typedef byte* kevent_udata; + +int32 runtime·nacl_exception_stack(byte*, int32); +int32 runtime·nacl_exception_handler(void*, void*); +int32 runtime·nacl_sem_create(int32); +int32 runtime·nacl_sem_wait(int32); +int32 runtime·nacl_sem_post(int32); +int32 runtime·nacl_mutex_create(int32); +int32 runtime·nacl_mutex_lock(int32); +int32 runtime·nacl_mutex_trylock(int32); +int32 runtime·nacl_mutex_unlock(int32); +int32 runtime·nacl_cond_create(int32); +int32 runtime·nacl_cond_wait(int32, int32); +int32 runtime·nacl_cond_signal(int32); +int32 runtime·nacl_cond_broadcast(int32); +int32 runtime·nacl_cond_timed_wait_abs(int32, int32, Timespec*); +int32 runtime·nacl_thread_create(void*, void*, void*, void*); +int32 runtime·nacl_nanosleep(Timespec*, Timespec*); + +void runtime·sigpanic(void); diff --git a/src/runtime/os_nacl_arm.c b/src/runtime/os_nacl_arm.c new file mode 100644 index 000000000..1248ea644 --- /dev/null +++ b/src/runtime/os_nacl_arm.c @@ -0,0 +1,24 @@ +// Copyright 2014 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#include "runtime.h" +#include "defs_GOOS_GOARCH.h" +#include "os_GOOS.h" +#include "textflag.h" + +void +runtime·checkgoarm(void) +{ + return; // NaCl/ARM only supports ARMv7 +} + +#pragma textflag NOSPLIT +int64 +runtime·cputicks(void) +{ + // Currently cputicks() is used in blocking profiler and to seed runtime·fastrand1(). + // runtime·nanotime() is a poor approximation of CPU ticks that is enough for the profiler. + // TODO: need more entropy to better seed fastrand1. + return runtime·nanotime(); +} diff --git a/src/runtime/os_netbsd.c b/src/runtime/os_netbsd.c new file mode 100644 index 000000000..58e5bedf2 --- /dev/null +++ b/src/runtime/os_netbsd.c @@ -0,0 +1,368 @@ +// Copyright 2011 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#include "runtime.h" +#include "defs_GOOS_GOARCH.h" +#include "os_GOOS.h" +#include "signal_unix.h" +#include "stack.h" +#include "textflag.h" + +enum +{ + ESRCH = 3, + ENOTSUP = 91, + + // From NetBSD's <sys/time.h> + CLOCK_REALTIME = 0, + CLOCK_VIRTUAL = 1, + CLOCK_PROF = 2, + CLOCK_MONOTONIC = 3 +}; + +extern SigTab runtime·sigtab[]; + +static Sigset sigset_none; +static Sigset sigset_all = { ~(uint32)0, ~(uint32)0, ~(uint32)0, ~(uint32)0, }; + +extern void runtime·getcontext(UcontextT *context); +extern int32 runtime·lwp_create(UcontextT *context, uintptr flags, void *lwpid); +extern void runtime·lwp_mcontext_init(void *mc, void *stack, M *mp, G *gp, void (*fn)(void)); +extern int32 runtime·lwp_park(Timespec *abstime, int32 unpark, void *hint, void *unparkhint); +extern int32 runtime·lwp_unpark(int32 lwp, void *hint); +extern int32 runtime·lwp_self(void); + +// From NetBSD's <sys/sysctl.h> +#define CTL_HW 6 +#define HW_NCPU 3 + +static int32 +getncpu(void) +{ + uint32 mib[2]; + uint32 out; + int32 ret; + uintptr nout; + + // Fetch hw.ncpu via sysctl. 
+ mib[0] = CTL_HW; + mib[1] = HW_NCPU; + nout = sizeof out; + out = 0; + ret = runtime·sysctl(mib, 2, (byte*)&out, &nout, nil, 0); + if(ret >= 0) + return out; + else + return 1; +} + +#pragma textflag NOSPLIT +uintptr +runtime·semacreate(void) +{ + return 1; +} + +static void +semasleep(void) +{ + int64 ns; + Timespec ts; + + ns = (int64)(uint32)g->m->scalararg[0] | (int64)(uint32)g->m->scalararg[1]<<32; + g->m->scalararg[0] = 0; + g->m->scalararg[1] = 0; + + // spin-mutex lock + while(runtime·xchg(&g->m->waitsemalock, 1)) + runtime·osyield(); + + for(;;) { + // lock held + if(g->m->waitsemacount == 0) { + // sleep until semaphore != 0 or timeout. + // thrsleep unlocks m->waitsemalock. + if(ns < 0) { + // TODO(jsing) - potential deadlock! + // + // There is a potential deadlock here since we + // have to release the waitsemalock mutex + // before we call lwp_park() to suspend the + // thread. This allows another thread to + // release the lock and call lwp_unpark() + // before the thread is actually suspended. + // If this occurs the current thread will end + // up sleeping indefinitely. Unfortunately + // the NetBSD kernel does not appear to provide + // a mechanism for unlocking the userspace + // mutex once the thread is actually parked. + runtime·atomicstore(&g->m->waitsemalock, 0); + runtime·lwp_park(nil, 0, &g->m->waitsemacount, nil); + } else { + ns = ns + runtime·nanotime(); + // NOTE: tv_nsec is int64 on amd64, so this assumes a little-endian system. + ts.tv_nsec = 0; + ts.tv_sec = runtime·timediv(ns, 1000000000, (int32*)&ts.tv_nsec); + // TODO(jsing) - potential deadlock! + // See above for details. + runtime·atomicstore(&g->m->waitsemalock, 0); + runtime·lwp_park(&ts, 0, &g->m->waitsemacount, nil); + } + // reacquire lock + while(runtime·xchg(&g->m->waitsemalock, 1)) + runtime·osyield(); + } + + // lock held (again) + if(g->m->waitsemacount != 0) { + // semaphore is available. + g->m->waitsemacount--; + // spin-mutex unlock + runtime·atomicstore(&g->m->waitsemalock, 0); + g->m->scalararg[0] = 0; // semaphore acquired + return; + } + + // semaphore not available. + // if there is a timeout, stop now. + // otherwise keep trying. + if(ns >= 0) + break; + } + + // lock held but giving up + // spin-mutex unlock + runtime·atomicstore(&g->m->waitsemalock, 0); + g->m->scalararg[0] = -1; + return; +} + +#pragma textflag NOSPLIT +int32 +runtime·semasleep(int64 ns) +{ + int32 r; + void (*fn)(void); + + g->m->scalararg[0] = (uint32)ns; + g->m->scalararg[1] = (uint32)(ns>>32); + fn = semasleep; + runtime·onM(&fn); + r = g->m->scalararg[0]; + g->m->scalararg[0] = 0; + return r; +} + +static void badsemawakeup(void); + +#pragma textflag NOSPLIT +void +runtime·semawakeup(M *mp) +{ + uint32 ret; + void (*fn)(void); + void *oldptr; + uintptr oldscalar; + + // spin-mutex lock + while(runtime·xchg(&mp->waitsemalock, 1)) + runtime·osyield(); + mp->waitsemacount++; + // TODO(jsing) - potential deadlock, see semasleep() for details. + // Confirm that LWP is parked before unparking... + ret = runtime·lwp_unpark(mp->procid, &mp->waitsemacount); + if(ret != 0 && ret != ESRCH) { + // semawakeup can be called on signal stack. + // Save old ptrarg/scalararg so we can restore them. 
+ oldptr = g->m->ptrarg[0]; + oldscalar = g->m->scalararg[0]; + g->m->ptrarg[0] = mp; + g->m->scalararg[0] = ret; + fn = badsemawakeup; + if(g == g->m->gsignal) + fn(); + else + runtime·onM(&fn); + g->m->ptrarg[0] = oldptr; + g->m->scalararg[0] = oldscalar; + } + // spin-mutex unlock + runtime·atomicstore(&mp->waitsemalock, 0); +} + +static void +badsemawakeup(void) +{ + M *mp; + int32 ret; + + mp = g->m->ptrarg[0]; + g->m->ptrarg[0] = nil; + ret = g->m->scalararg[0]; + g->m->scalararg[0] = 0; + + runtime·printf("thrwakeup addr=%p sem=%d ret=%d\n", &mp->waitsemacount, mp->waitsemacount, ret); +} + +void +runtime·newosproc(M *mp, void *stk) +{ + UcontextT uc; + int32 ret; + + if(0) { + runtime·printf( + "newosproc stk=%p m=%p g=%p id=%d/%d ostk=%p\n", + stk, mp, mp->g0, mp->id, (int32)mp->tls[0], &mp); + } + + mp->tls[0] = mp->id; // so 386 asm can find it + + runtime·getcontext(&uc); + + uc.uc_flags = _UC_SIGMASK | _UC_CPU; + uc.uc_link = nil; + uc.uc_sigmask = sigset_all; + + runtime·lwp_mcontext_init(&uc.uc_mcontext, stk, mp, mp->g0, runtime·mstart); + + ret = runtime·lwp_create(&uc, 0, &mp->procid); + + if(ret < 0) { + runtime·printf("runtime: failed to create new OS thread (have %d already; errno=%d)\n", runtime·mcount() - 1, -ret); + runtime·throw("runtime.newosproc"); + } +} + +void +runtime·osinit(void) +{ + runtime·ncpu = getncpu(); +} + +#pragma textflag NOSPLIT +void +runtime·get_random_data(byte **rnd, int32 *rnd_len) +{ + #pragma dataflag NOPTR + static byte urandom_data[HashRandomBytes]; + int32 fd; + fd = runtime·open("/dev/urandom", 0 /* O_RDONLY */, 0); + if(runtime·read(fd, urandom_data, HashRandomBytes) == HashRandomBytes) { + *rnd = urandom_data; + *rnd_len = HashRandomBytes; + } else { + *rnd = nil; + *rnd_len = 0; + } + runtime·close(fd); +} + +void +runtime·goenvs(void) +{ + runtime·goenvs_unix(); +} + +// Called to initialize a new m (including the bootstrap m). +// Called on the parent thread (main thread in case of bootstrap), can allocate memory. +void +runtime·mpreinit(M *mp) +{ + mp->gsignal = runtime·malg(32*1024); + mp->gsignal->m = mp; +} + +// Called to initialize a new m (including the bootstrap m). +// Called on the new thread, can not allocate memory. +void +runtime·minit(void) +{ + g->m->procid = runtime·lwp_self(); + + // Initialize signal handling + runtime·signalstack((byte*)g->m->gsignal->stack.lo, 32*1024); + runtime·sigprocmask(SIG_SETMASK, &sigset_none, nil); +} + +// Called from dropm to undo the effect of an minit. 
+void +runtime·unminit(void) +{ + runtime·signalstack(nil, 0); +} + +uintptr +runtime·memlimit(void) +{ + return 0; +} + +extern void runtime·sigtramp(void); + +typedef struct sigaction { + union { + void (*_sa_handler)(int32); + void (*_sa_sigaction)(int32, Siginfo*, void *); + } _sa_u; /* signal handler */ + uint32 sa_mask[4]; /* signal mask to apply */ + int32 sa_flags; /* see signal options below */ +} SigactionT; + +void +runtime·setsig(int32 i, GoSighandler *fn, bool restart) +{ + SigactionT sa; + + runtime·memclr((byte*)&sa, sizeof sa); + sa.sa_flags = SA_SIGINFO|SA_ONSTACK; + if(restart) + sa.sa_flags |= SA_RESTART; + sa.sa_mask[0] = ~0U; + sa.sa_mask[1] = ~0U; + sa.sa_mask[2] = ~0U; + sa.sa_mask[3] = ~0U; + if (fn == runtime·sighandler) + fn = (void*)runtime·sigtramp; + sa._sa_u._sa_sigaction = (void*)fn; + runtime·sigaction(i, &sa, nil); +} + +GoSighandler* +runtime·getsig(int32 i) +{ + SigactionT sa; + + runtime·memclr((byte*)&sa, sizeof sa); + runtime·sigaction(i, nil, &sa); + if((void*)sa._sa_u._sa_sigaction == runtime·sigtramp) + return runtime·sighandler; + return (void*)sa._sa_u._sa_sigaction; +} + +void +runtime·signalstack(byte *p, int32 n) +{ + StackT st; + + st.ss_sp = (void*)p; + st.ss_size = n; + st.ss_flags = 0; + if(p == nil) + st.ss_flags = SS_DISABLE; + runtime·sigaltstack(&st, nil); +} + +void +runtime·unblocksignals(void) +{ + runtime·sigprocmask(SIG_SETMASK, &sigset_none, nil); +} + +#pragma textflag NOSPLIT +int8* +runtime·signame(int32 sig) +{ + return runtime·sigtab[sig].name; +} diff --git a/src/runtime/os_netbsd.go b/src/runtime/os_netbsd.go new file mode 100644 index 000000000..f000c5e9f --- /dev/null +++ b/src/runtime/os_netbsd.go @@ -0,0 +1,20 @@ +// Copyright 2014 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package runtime + +import "unsafe" + +func setitimer(mode int32, new, old unsafe.Pointer) +func sigaction(sig int32, new, old unsafe.Pointer) +func sigaltstack(new, old unsafe.Pointer) +func sigprocmask(mode int32, new, old unsafe.Pointer) +func sysctl(mib *uint32, miblen uint32, out *byte, size *uintptr, dst *byte, ndst uintptr) int32 +func lwp_tramp() +func raise(sig int32) +func getcontext(ctxt unsafe.Pointer) +func lwp_create(ctxt unsafe.Pointer, flags uintptr, lwpid unsafe.Pointer) int32 +func lwp_park(abstime unsafe.Pointer, unpark int32, hint, unparkhint unsafe.Pointer) int32 +func lwp_unpark(lwp int32, hint unsafe.Pointer) int32 +func lwp_self() int32 diff --git a/src/runtime/os_netbsd.h b/src/runtime/os_netbsd.h new file mode 100644 index 000000000..f95db325f --- /dev/null +++ b/src/runtime/os_netbsd.h @@ -0,0 +1,31 @@ +// Copyright 2010 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. 
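The semasleep implementations above turn a relative timeout in nanoseconds into an absolute Timespec with runtime·timediv, which is internal to the runtime. A rough standalone equivalent, assuming clock_gettime(CLOCK_MONOTONIC) in place of runtime·nanotime() and plain 64-bit division in place of timediv, might look like the sketch below; deadline_from_ns is an illustrative name, not part of the sources above.

#include <stdint.h>
#include <time.h>

/* Convert a relative timeout (ns) into an absolute monotonic deadline. */
static void deadline_from_ns(int64_t relative_ns, struct timespec *ts)
{
	struct timespec now;
	int64_t abs_ns;

	clock_gettime(CLOCK_MONOTONIC, &now);
	abs_ns = (int64_t)now.tv_sec * 1000000000LL + now.tv_nsec + relative_ns;
	ts->tv_sec = abs_ns / 1000000000LL;    /* the quotient timediv computes */
	ts->tv_nsec = abs_ns % 1000000000LL;   /* the remainder timediv stores via its pointer */
}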
+ + +typedef uintptr kevent_udata; + +struct sigaction; + +void runtime·sigpanic(void); + +void runtime·setitimer(int32, Itimerval*, Itimerval*); +void runtime·sigaction(int32, struct sigaction*, struct sigaction*); +void runtime·sigaltstack(SigaltstackT*, SigaltstackT*); +void runtime·sigprocmask(int32, Sigset*, Sigset*); +void runtime·unblocksignals(void); +int32 runtime·sysctl(uint32*, uint32, byte*, uintptr*, byte*, uintptr); +extern void runtime·lwp_tramp(void); + +enum { + SS_DISABLE = 4, + SIG_BLOCK = 1, + SIG_UNBLOCK = 2, + SIG_SETMASK = 3, + NSIG = 33, + SI_USER = 0, + + // From NetBSD's <sys/ucontext.h> + _UC_SIGMASK = 0x01, + _UC_CPU = 0x04, +}; diff --git a/src/runtime/os_netbsd_386.c b/src/runtime/os_netbsd_386.c new file mode 100644 index 000000000..23e9db3c1 --- /dev/null +++ b/src/runtime/os_netbsd_386.c @@ -0,0 +1,17 @@ +// Copyright 2009 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#include "runtime.h" +#include "defs_GOOS_GOARCH.h" +#include "os_GOOS.h" + +void +runtime·lwp_mcontext_init(McontextT *mc, void *stack, M *mp, G *gp, void (*fn)(void)) +{ + mc->__gregs[REG_EIP] = (uint32)runtime·lwp_tramp; + mc->__gregs[REG_UESP] = (uint32)stack; + mc->__gregs[REG_EBX] = (uint32)mp; + mc->__gregs[REG_EDX] = (uint32)gp; + mc->__gregs[REG_ESI] = (uint32)fn; +} diff --git a/src/runtime/os_netbsd_amd64.c b/src/runtime/os_netbsd_amd64.c new file mode 100644 index 000000000..226846cbb --- /dev/null +++ b/src/runtime/os_netbsd_amd64.c @@ -0,0 +1,18 @@ +// Copyright 2009 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#include "runtime.h" +#include "defs_GOOS_GOARCH.h" +#include "os_GOOS.h" + +void +runtime·lwp_mcontext_init(McontextT *mc, void *stack, M *mp, G *gp, void (*fn)(void)) +{ + // Machine dependent mcontext initialisation for LWP. + mc->__gregs[REG_RIP] = (uint64)runtime·lwp_tramp; + mc->__gregs[REG_RSP] = (uint64)stack; + mc->__gregs[REG_R8] = (uint64)mp; + mc->__gregs[REG_R9] = (uint64)gp; + mc->__gregs[REG_R12] = (uint64)fn; +} diff --git a/src/runtime/os_netbsd_arm.c b/src/runtime/os_netbsd_arm.c new file mode 100644 index 000000000..9dd4bcdc9 --- /dev/null +++ b/src/runtime/os_netbsd_arm.c @@ -0,0 +1,34 @@ +// Copyright 2013 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#include "runtime.h" +#include "defs_GOOS_GOARCH.h" +#include "os_GOOS.h" +#include "signal_GOOS_GOARCH.h" +#include "textflag.h" + +void +runtime·lwp_mcontext_init(McontextT *mc, void *stack, M *mp, G *gp, void (*fn)(void)) +{ + mc->__gregs[REG_R15] = (uint32)runtime·lwp_tramp; + mc->__gregs[REG_R13] = (uint32)stack; + mc->__gregs[REG_R0] = (uint32)mp; + mc->__gregs[REG_R1] = (uint32)gp; + mc->__gregs[REG_R2] = (uint32)fn; +} + +void +runtime·checkgoarm(void) +{ + // TODO(minux) +} + +#pragma textflag NOSPLIT +int64 +runtime·cputicks() { + // Currently cputicks() is used in blocking profiler and to seed runtime·fastrand1(). + // runtime·nanotime() is a poor approximation of CPU ticks that is enough for the profiler. + // TODO: need more entropy to better seed fastrand1. + return runtime·nanotime(); +} diff --git a/src/runtime/os_openbsd.c b/src/runtime/os_openbsd.c new file mode 100644 index 000000000..eebaa13ee --- /dev/null +++ b/src/runtime/os_openbsd.c @@ -0,0 +1,309 @@ +// Copyright 2011 The Go Authors. 
All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#include "runtime.h" +#include "defs_GOOS_GOARCH.h" +#include "os_GOOS.h" +#include "signal_unix.h" +#include "stack.h" +#include "textflag.h" + +enum +{ + ESRCH = 3, + EAGAIN = 35, + EWOULDBLOCK = EAGAIN, + ENOTSUP = 91, + + // From OpenBSD's sys/time.h + CLOCK_REALTIME = 0, + CLOCK_VIRTUAL = 1, + CLOCK_PROF = 2, + CLOCK_MONOTONIC = 3 +}; + +extern SigTab runtime·sigtab[]; + +static Sigset sigset_none; +static Sigset sigset_all = ~(Sigset)0; + +extern int32 runtime·tfork(TforkT *param, uintptr psize, M *mp, G *gp, void (*fn)(void)); +extern int32 runtime·thrsleep(void *ident, int32 clock_id, void *tsp, void *lock, const int32 *abort); +extern int32 runtime·thrwakeup(void *ident, int32 n); + +// From OpenBSD's <sys/sysctl.h> +#define CTL_HW 6 +#define HW_NCPU 3 + +static int32 +getncpu(void) +{ + uint32 mib[2]; + uint32 out; + int32 ret; + uintptr nout; + + // Fetch hw.ncpu via sysctl. + mib[0] = CTL_HW; + mib[1] = HW_NCPU; + nout = sizeof out; + out = 0; + ret = runtime·sysctl(mib, 2, (byte*)&out, &nout, nil, 0); + if(ret >= 0) + return out; + else + return 1; +} + +#pragma textflag NOSPLIT +uintptr +runtime·semacreate(void) +{ + return 1; +} + +#pragma textflag NOSPLIT +int32 +runtime·semasleep(int64 ns) +{ + Timespec ts, *tsp = nil; + + // Compute sleep deadline. + if(ns >= 0) { + int32 nsec; + ns += runtime·nanotime(); + ts.tv_sec = runtime·timediv(ns, 1000000000, &nsec); + ts.tv_nsec = nsec; // tv_nsec is int64 on amd64 + tsp = &ts; + } + + for(;;) { + int32 ret; + + // spin-mutex lock + while(runtime·xchg(&g->m->waitsemalock, 1)) + runtime·osyield(); + + if(g->m->waitsemacount != 0) { + // semaphore is available. + g->m->waitsemacount--; + // spin-mutex unlock + runtime·atomicstore(&g->m->waitsemalock, 0); + return 0; // semaphore acquired + } + + // sleep until semaphore != 0 or timeout. + // thrsleep unlocks m->waitsemalock. + ret = runtime·thrsleep(&g->m->waitsemacount, CLOCK_MONOTONIC, tsp, &g->m->waitsemalock, (int32 *)&g->m->waitsemacount); + if(ret == EWOULDBLOCK) + return -1; + } +} + +static void badsemawakeup(void); + +#pragma textflag NOSPLIT +void +runtime·semawakeup(M *mp) +{ + uint32 ret; + void *oldptr; + uint32 oldscalar; + void (*fn)(void); + + // spin-mutex lock + while(runtime·xchg(&mp->waitsemalock, 1)) + runtime·osyield(); + mp->waitsemacount++; + ret = runtime·thrwakeup(&mp->waitsemacount, 1); + if(ret != 0 && ret != ESRCH) { + // semawakeup can be called on signal stack. + // Save old ptrarg/scalararg so we can restore them. 
+ oldptr = g->m->ptrarg[0]; + oldscalar = g->m->scalararg[0]; + g->m->ptrarg[0] = mp; + g->m->scalararg[0] = ret; + fn = badsemawakeup; + if(g == g->m->gsignal) + fn(); + else + runtime·onM(&fn); + g->m->ptrarg[0] = oldptr; + g->m->scalararg[0] = oldscalar; + } + // spin-mutex unlock + runtime·atomicstore(&mp->waitsemalock, 0); +} + +static void +badsemawakeup(void) +{ + M *mp; + int32 ret; + + mp = g->m->ptrarg[0]; + g->m->ptrarg[0] = nil; + ret = g->m->scalararg[0]; + g->m->scalararg[0] = 0; + + runtime·printf("thrwakeup addr=%p sem=%d ret=%d\n", &mp->waitsemacount, mp->waitsemacount, ret); +} + +void +runtime·newosproc(M *mp, void *stk) +{ + TforkT param; + Sigset oset; + int32 ret; + + if(0) { + runtime·printf( + "newosproc stk=%p m=%p g=%p id=%d/%d ostk=%p\n", + stk, mp, mp->g0, mp->id, (int32)mp->tls[0], &mp); + } + + mp->tls[0] = mp->id; // so 386 asm can find it + + param.tf_tcb = (byte*)&mp->tls[0]; + param.tf_tid = (int32*)&mp->procid; + param.tf_stack = stk; + + oset = runtime·sigprocmask(SIG_SETMASK, sigset_all); + ret = runtime·tfork(&param, sizeof(param), mp, mp->g0, runtime·mstart); + runtime·sigprocmask(SIG_SETMASK, oset); + + if(ret < 0) { + runtime·printf("runtime: failed to create new OS thread (have %d already; errno=%d)\n", runtime·mcount() - 1, -ret); + if (ret == -ENOTSUP) + runtime·printf("runtime: is kern.rthreads disabled?\n"); + runtime·throw("runtime.newosproc"); + } +} + +void +runtime·osinit(void) +{ + runtime·ncpu = getncpu(); +} + +#pragma textflag NOSPLIT +void +runtime·get_random_data(byte **rnd, int32 *rnd_len) +{ + #pragma dataflag NOPTR + static byte urandom_data[HashRandomBytes]; + int32 fd; + fd = runtime·open("/dev/urandom", 0 /* O_RDONLY */, 0); + if(runtime·read(fd, urandom_data, HashRandomBytes) == HashRandomBytes) { + *rnd = urandom_data; + *rnd_len = HashRandomBytes; + } else { + *rnd = nil; + *rnd_len = 0; + } + runtime·close(fd); +} + +void +runtime·goenvs(void) +{ + runtime·goenvs_unix(); +} + +// Called to initialize a new m (including the bootstrap m). +// Called on the parent thread (main thread in case of bootstrap), can allocate memory. +void +runtime·mpreinit(M *mp) +{ + mp->gsignal = runtime·malg(32*1024); + mp->gsignal->m = mp; +} + +// Called to initialize a new m (including the bootstrap m). +// Called on the new thread, can not allocate memory. +void +runtime·minit(void) +{ + // Initialize signal handling + runtime·signalstack((byte*)g->m->gsignal->stack.lo, 32*1024); + runtime·sigprocmask(SIG_SETMASK, sigset_none); +} + +// Called from dropm to undo the effect of an minit.
+void +runtime·unminit(void) +{ + runtime·signalstack(nil, 0); +} + +uintptr +runtime·memlimit(void) +{ + return 0; +} + +extern void runtime·sigtramp(void); + +typedef struct sigaction { + union { + void (*__sa_handler)(int32); + void (*__sa_sigaction)(int32, Siginfo*, void *); + } __sigaction_u; /* signal handler */ + uint32 sa_mask; /* signal mask to apply */ + int32 sa_flags; /* see signal options below */ +} SigactionT; + +void +runtime·setsig(int32 i, GoSighandler *fn, bool restart) +{ + SigactionT sa; + + runtime·memclr((byte*)&sa, sizeof sa); + sa.sa_flags = SA_SIGINFO|SA_ONSTACK; + if(restart) + sa.sa_flags |= SA_RESTART; + sa.sa_mask = ~0U; + if(fn == runtime·sighandler) + fn = (void*)runtime·sigtramp; + sa.__sigaction_u.__sa_sigaction = (void*)fn; + runtime·sigaction(i, &sa, nil); +} + +GoSighandler* +runtime·getsig(int32 i) +{ + SigactionT sa; + + runtime·memclr((byte*)&sa, sizeof sa); + runtime·sigaction(i, nil, &sa); + if((void*)sa.__sigaction_u.__sa_sigaction == runtime·sigtramp) + return runtime·sighandler; + return (void*)sa.__sigaction_u.__sa_sigaction; +} + +void +runtime·signalstack(byte *p, int32 n) +{ + StackT st; + + st.ss_sp = (void*)p; + st.ss_size = n; + st.ss_flags = 0; + if(p == nil) + st.ss_flags = SS_DISABLE; + runtime·sigaltstack(&st, nil); +} + +void +runtime·unblocksignals(void) +{ + runtime·sigprocmask(SIG_SETMASK, sigset_none); +} + +#pragma textflag NOSPLIT +int8* +runtime·signame(int32 sig) +{ + return runtime·sigtab[sig].name; +} diff --git a/src/runtime/os_openbsd.go b/src/runtime/os_openbsd.go new file mode 100644 index 000000000..a000f963e --- /dev/null +++ b/src/runtime/os_openbsd.go @@ -0,0 +1,17 @@ +// Copyright 2014 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package runtime + +import "unsafe" + +func setitimer(mode int32, new, old unsafe.Pointer) +func sigaction(sig int32, new, old unsafe.Pointer) +func sigaltstack(new, old unsafe.Pointer) +func sigprocmask(mode int32, new uint32) uint32 +func sysctl(mib *uint32, miblen uint32, out *byte, size *uintptr, dst *byte, ndst uintptr) int32 +func raise(sig int32) +func tfork(param unsafe.Pointer, psize uintptr, mm, gg, fn unsafe.Pointer) int32 +func thrsleep(ident unsafe.Pointer, clock_id int32, tsp, lock, abort unsafe.Pointer) int32 +func thrwakeup(ident unsafe.Pointer, n int32) int32 diff --git a/src/runtime/os_openbsd.h b/src/runtime/os_openbsd.h new file mode 100644 index 000000000..6ad98109e --- /dev/null +++ b/src/runtime/os_openbsd.h @@ -0,0 +1,26 @@ +// Copyright 2010 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + + +typedef byte* kevent_udata; + +struct sigaction; + +void runtime·sigpanic(void); + +void runtime·setitimer(int32, Itimerval*, Itimerval*); +void runtime·sigaction(int32, struct sigaction*, struct sigaction*); +void runtime·sigaltstack(SigaltstackT*, SigaltstackT*); +Sigset runtime·sigprocmask(int32, Sigset); +void runtime·unblocksignals(void); +int32 runtime·sysctl(uint32*, uint32, byte*, uintptr*, byte*, uintptr); + +enum { + SS_DISABLE = 4, + SIG_BLOCK = 1, + SIG_UNBLOCK = 2, + SIG_SETMASK = 3, + NSIG = 33, + SI_USER = 0, +}; diff --git a/src/runtime/os_plan9.c b/src/runtime/os_plan9.c new file mode 100644 index 000000000..f8c543f6f --- /dev/null +++ b/src/runtime/os_plan9.c @@ -0,0 +1,362 @@ +// Copyright 2010 The Go Authors. All rights reserved. 
+// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#include "runtime.h" +#include "os_GOOS.h" +#include "arch_GOARCH.h" +#include "textflag.h" +#include "malloc.h" + +int8 *goos = "plan9"; +extern SigTab runtime·sigtab[]; + +int32 runtime·postnote(int32, int8*); + +// Called to initialize a new m (including the bootstrap m). +// Called on the parent thread (main thread in case of bootstrap), can allocate memory. +void +runtime·mpreinit(M *mp) +{ + // Initialize stack and goroutine for note handling. + mp->gsignal = runtime·malg(32*1024); + mp->gsignal->m = mp; + mp->notesig = (int8*)runtime·mallocgc(ERRMAX*sizeof(int8), nil, FlagNoScan); + + // Initialize stack for handling strings from the + // errstr system call, as used in package syscall. + mp->errstr = (byte*)runtime·mallocgc(ERRMAX*sizeof(byte), nil, FlagNoScan); +} + +// Called to initialize a new m (including the bootstrap m). +// Called on the new thread, can not allocate memory. +void +runtime·minit(void) +{ + // Mask all SSE floating-point exceptions + // when running on the 64-bit kernel. + runtime·setfpmasks(); +} + +// Called from dropm to undo the effect of an minit. +void +runtime·unminit(void) +{ +} + + +static int32 +getproccount(void) +{ + int32 fd, i, n, ncpu; + byte buf[2048]; + + fd = runtime·open("/dev/sysstat", OREAD, 0); + if(fd < 0) + return 1; + ncpu = 0; + for(;;) { + n = runtime·read(fd, buf, sizeof buf); + if(n <= 0) + break; + for(i = 0; i < n; i++) { + if(buf[i] == '\n') + ncpu++; + } + } + runtime·close(fd); + return ncpu > 0 ? ncpu : 1; +} + +static int32 +getpid(void) +{ + byte b[20], *c; + int32 fd; + + runtime·memclr(b, sizeof(b)); + fd = runtime·open("#c/pid", 0, 0); + if(fd >= 0) { + runtime·read(fd, b, sizeof(b)); + runtime·close(fd); + } + c = b; + while(*c == ' ' || *c == '\t') + c++; + return runtime·atoi(c); +} + +void +runtime·osinit(void) +{ + runtime·ncpu = getproccount(); + g->m->procid = getpid(); + runtime·notify(runtime·sigtramp); +} + +void +runtime·crash(void) +{ + runtime·notify(nil); + *(int32*)0 = 0; +} + +#pragma textflag NOSPLIT +void +runtime·get_random_data(byte **rnd, int32 *rnd_len) +{ + static byte random_data[HashRandomBytes]; + int32 fd; + + fd = runtime·open("/dev/random", 0 /* O_RDONLY */, 0); + if(runtime·read(fd, random_data, HashRandomBytes) == HashRandomBytes) { + *rnd = random_data; + *rnd_len = HashRandomBytes; + } else { + *rnd = nil; + *rnd_len = 0; + } + runtime·close(fd); +} + +void +runtime·goenvs(void) +{ +} + +void +runtime·initsig(void) +{ +} + +#pragma textflag NOSPLIT +void +runtime·osyield(void) +{ + runtime·sleep(0); +} + +#pragma textflag NOSPLIT +void +runtime·usleep(uint32 µs) +{ + uint32 ms; + + ms = µs/1000; + if(ms == 0) + ms = 1; + runtime·sleep(ms); +} + +#pragma textflag NOSPLIT +int64 +runtime·nanotime(void) +{ + int64 ns, scratch; + + ns = runtime·nsec(&scratch); + // TODO(aram): remove hack after I fix _nsec in the pc64 kernel. 
+ if(ns == 0) + return scratch; + return ns; +} + +#pragma textflag NOSPLIT +void +runtime·itoa(int32 n, byte *p, uint32 len) +{ + byte *q, c; + uint32 i; + + if(len <= 1) + return; + + runtime·memclr(p, len); + q = p; + + if(n==0) { + *q++ = '0'; + USED(q); + return; + } + if(n < 0) { + *q++ = '-'; + p++; + n = -n; + } + for(i=0; n > 0 && i < len; i++) { + *q++ = '0' + (n%10); + n = n/10; + } + for(q--; q >= p; ) { + c = *p; + *p++ = *q; + *q-- = c; + } +} + +void +runtime·goexitsall(int8 *status) +{ + int8 buf[ERRMAX]; + M *mp; + int32 pid; + + runtime·snprintf((byte*)buf, sizeof buf, "go: exit %s", status); + pid = getpid(); + for(mp=runtime·atomicloadp(&runtime·allm); mp; mp=mp->alllink) + if(mp->procid != pid) + runtime·postnote(mp->procid, buf); +} + +int32 +runtime·postnote(int32 pid, int8* msg) +{ + int32 fd; + intgo len; + uint8 buf[128]; + uint8 tmp[16]; + uint8 *p, *q; + + runtime·memclr(buf, sizeof buf); + + /* build path string /proc/pid/note */ + q = tmp; + p = buf; + runtime·itoa(pid, tmp, sizeof tmp); + runtime·memmove((void*)p, (void*)"/proc/", 6); + for(p += 6; *p++ = *q++; ); + p--; + runtime·memmove((void*)p, (void*)"/note", 5); + + fd = runtime·open((int8*)buf, OWRITE, 0); + if(fd < 0) + return -1; + + len = runtime·findnull((byte*)msg); + if(runtime·write(fd, msg, len) != len) { + runtime·close(fd); + return -1; + } + runtime·close(fd); + return 0; +} + +static void exit(void); + +#pragma textflag NOSPLIT +void +runtime·exit(int32 e) +{ + void (*fn)(void); + + g->m->scalararg[0] = e; + fn = exit; + runtime·onM(&fn); +} + +static void +exit(void) +{ + int32 e; + byte tmp[16]; + int8 *status; + + e = g->m->scalararg[0]; + g->m->scalararg[0] = 0; + + if(e == 0) + status = ""; + else { + /* build error string */ + runtime·itoa(e, tmp, sizeof tmp); + status = (int8*)tmp; + } + + runtime·goexitsall(status); + runtime·exits(status); +} + +void +runtime·newosproc(M *mp, void *stk) +{ + int32 pid; + + if(0) + runtime·printf("newosproc mp=%p ostk=%p\n", mp, &mp); + + USED(stk); + if((pid = runtime·rfork(RFPROC|RFMEM|RFNOWAIT)) < 0) + runtime·throw("newosproc: rfork failed\n"); + if(pid == 0) + runtime·tstart_plan9(mp); +} + +#pragma textflag NOSPLIT +uintptr +runtime·semacreate(void) +{ + return 1; +} + +#pragma textflag NOSPLIT +int32 +runtime·semasleep(int64 ns) +{ + int32 ret; + int32 ms; + + if(ns >= 0) { + ms = runtime·timediv(ns, 1000000, nil); + if(ms == 0) + ms = 1; + ret = runtime·plan9_tsemacquire(&g->m->waitsemacount, ms); + if(ret == 1) + return 0; // success + return -1; // timeout or interrupted + } + + while(runtime·plan9_semacquire(&g->m->waitsemacount, 1) < 0) { + /* interrupted; try again (c.f. lock_sema.c) */ + } + return 0; // success +} + +#pragma textflag NOSPLIT +void +runtime·semawakeup(M *mp) +{ + runtime·plan9_semrelease(&mp->waitsemacount, 1); +} + +#pragma textflag NOSPLIT +int32 +runtime·read(int32 fd, void *buf, int32 nbytes) +{ + return runtime·pread(fd, buf, nbytes, -1LL); +} + +#pragma textflag NOSPLIT +int32 +runtime·write(uintptr fd, void *buf, int32 nbytes) +{ + return runtime·pwrite((int32)fd, buf, nbytes, -1LL); +} + +uintptr +runtime·memlimit(void) +{ + return 0; +} + +#pragma dataflag NOPTR +static int8 badsignal[] = "runtime: signal received on thread not created by Go.\n"; + +// This runs on a foreign stack, without an m or a g. No stack split. 
+#pragma textflag NOSPLIT +void +runtime·badsignal2(void) +{ + runtime·pwrite(2, badsignal, sizeof badsignal - 1, -1LL); + runtime·exits(badsignal); +} diff --git a/src/runtime/os_plan9.go b/src/runtime/os_plan9.go new file mode 100644 index 000000000..20e47bf42 --- /dev/null +++ b/src/runtime/os_plan9.go @@ -0,0 +1,103 @@ +// Copyright 2014 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package runtime + +import "unsafe" + +func pread(fd int32, buf unsafe.Pointer, nbytes int32, offset int64) int32 +func pwrite(fd int32, buf unsafe.Pointer, nbytes int32, offset int64) int32 +func seek(fd int32, offset int64, whence int32) int64 +func exits(msg *byte) +func brk_(addr unsafe.Pointer) uintptr +func sleep(ms int32) int32 +func rfork(flags int32) int32 +func plan9_semacquire(addr *uint32, block int32) int32 +func plan9_tsemacquire(addr *uint32, ms int32) int32 +func plan9_semrelease(addr *uint32, count int32) int32 +func notify(fn unsafe.Pointer) int32 +func noted(mode int32) int32 +func nsec(*int64) int64 +func sigtramp(ureg, msg unsafe.Pointer) +func setfpmasks() +func tstart_plan9(newm *m) +func errstr() string + +type _Plink uintptr + +func os_sigpipe() { + gothrow("too many writes on closed pipe") +} + +func sigpanic() { + g := getg() + if !canpanic(g) { + gothrow("unexpected signal during runtime execution") + } + + note := gostringnocopy((*byte)(unsafe.Pointer(g.m.notesig))) + switch g.sig { + case _SIGRFAULT, _SIGWFAULT: + addr := note[index(note, "addr=")+5:] + g.sigcode1 = uintptr(atolwhex(addr)) + if g.sigcode1 < 0x1000 || g.paniconfault { + panicmem() + } + print("unexpected fault address ", hex(g.sigcode1), "\n") + gothrow("fault") + case _SIGTRAP: + if g.paniconfault { + panicmem() + } + gothrow(note) + case _SIGINTDIV: + panicdivide() + case _SIGFLOAT: + panicfloat() + default: + panic(errorString(note)) + } +} + +func atolwhex(p string) int64 { + for hasprefix(p, " ") || hasprefix(p, "\t") { + p = p[1:] + } + neg := false + if hasprefix(p, "-") || hasprefix(p, "+") { + neg = p[0] == '-' + p = p[1:] + for hasprefix(p, " ") || hasprefix(p, "\t") { + p = p[1:] + } + } + var n int64 + switch { + case hasprefix(p, "0x"), hasprefix(p, "0X"): + p = p[2:] + for ; len(p) > 0; p = p[1:] { + if '0' <= p[0] && p[0] <= '9' { + n = n*16 + int64(p[0]-'0') + } else if 'a' <= p[0] && p[0] <= 'f' { + n = n*16 + int64(p[0]-'a'+10) + } else if 'A' <= p[0] && p[0] <= 'F' { + n = n*16 + int64(p[0]-'A'+10) + } else { + break + } + } + case hasprefix(p, "0"): + for ; len(p) > 0 && '0' <= p[0] && p[0] <= '7'; p = p[1:] { + n = n*8 + int64(p[0]-'0') + } + default: + for ; len(p) > 0 && '0' <= p[0] && p[0] <= '9'; p = p[1:] { + n = n*10 + int64(p[0]-'0') + } + } + if neg { + n = -n + } + return n +} diff --git a/src/runtime/os_plan9.h b/src/runtime/os_plan9.h new file mode 100644 index 000000000..6d1802483 --- /dev/null +++ b/src/runtime/os_plan9.h @@ -0,0 +1,93 @@ +// Copyright 2010 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. 
+ +// Plan 9-specific system calls +int32 runtime·pread(int32 fd, void *buf, int32 nbytes, int64 offset); +int32 runtime·pwrite(int32 fd, void *buf, int32 nbytes, int64 offset); +int64 runtime·seek(int32 fd, int64 offset, int32 whence); +void runtime·exits(int8* msg); +intptr runtime·brk_(void*); +int32 runtime·sleep(int32 ms); +int32 runtime·rfork(int32 flags); +int32 runtime·plan9_semacquire(uint32 *addr, int32 block); +int32 runtime·plan9_tsemacquire(uint32 *addr, int32 ms); +int32 runtime·plan9_semrelease(uint32 *addr, int32 count); +int32 runtime·notify(void (*fn)(void*, int8*)); +int32 runtime·noted(int32); +int64 runtime·nsec(int64*); +void runtime·sigtramp(void*, int8*); +void runtime·sigpanic(void); +void runtime·goexitsall(int8*); +void runtime·setfpmasks(void); +void runtime·tstart_plan9(M *newm); + +/* open */ +enum +{ + OREAD = 0, + OWRITE = 1, + ORDWR = 2, + OEXEC = 3, + OTRUNC = 16, + OCEXEC = 32, + ORCLOSE = 64, + OEXCL = 0x1000 +}; + +/* rfork */ +enum +{ + RFNAMEG = (1<<0), + RFENVG = (1<<1), + RFFDG = (1<<2), + RFNOTEG = (1<<3), + RFPROC = (1<<4), + RFMEM = (1<<5), + RFNOWAIT = (1<<6), + RFCNAMEG = (1<<10), + RFCENVG = (1<<11), + RFCFDG = (1<<12), + RFREND = (1<<13), + RFNOMNT = (1<<14) +}; + +/* notify */ +enum +{ + NCONT = 0, + NDFLT = 1 +}; + +typedef struct Tos Tos; +typedef intptr _Plink; + +struct Tos { + struct TosProf /* Per process profiling */ + { + _Plink *pp; /* known to be 0(ptr) */ + _Plink *next; /* known to be 4(ptr) */ + _Plink *last; + _Plink *first; + uint32 pid; + uint32 what; + } prof; + uint64 cyclefreq; /* cycle clock frequency if there is one, 0 otherwise */ + int64 kcycles; /* cycles spent in kernel */ + int64 pcycles; /* cycles spent in process (kernel + user) */ + uint32 pid; /* might as well put the pid here */ + uint32 clock; + /* top of stack is here */ +}; + +enum { + NSIG = 14, /* number of signals in runtime·SigTab array */ + ERRMAX = 128, /* max length of note string */ + + /* Notes in runtime·sigtab that are handled by runtime·sigpanic. */ + SIGRFAULT = 2, + SIGWFAULT = 3, + SIGINTDIV = 4, + SIGFLOAT = 5, + SIGTRAP = 6, +}; diff --git a/src/runtime/os_plan9_386.c b/src/runtime/os_plan9_386.c new file mode 100644 index 000000000..42c6d161c --- /dev/null +++ b/src/runtime/os_plan9_386.c @@ -0,0 +1,150 @@ +// Copyright 2010 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#include "runtime.h" +#include "defs_GOOS_GOARCH.h" +#include "os_GOOS.h" +#include "signals_GOOS.h" + +void +runtime·dumpregs(Ureg *u) +{ + runtime·printf("ax %x\n", u->ax); + runtime·printf("bx %x\n", u->bx); + runtime·printf("cx %x\n", u->cx); + runtime·printf("dx %x\n", u->dx); + runtime·printf("di %x\n", u->di); + runtime·printf("si %x\n", u->si); + runtime·printf("bp %x\n", u->bp); + runtime·printf("sp %x\n", u->sp); + runtime·printf("pc %x\n", u->pc); + runtime·printf("flags %x\n", u->flags); + runtime·printf("cs %x\n", u->cs); + runtime·printf("fs %x\n", u->fs); + runtime·printf("gs %x\n", u->gs); +} + +int32 +runtime·sighandler(void *v, int8 *note, G *gp) +{ + uintptr *sp; + SigTab *t; + bool crash; + Ureg *ureg; + intgo len, n; + int32 sig, flags; + + ureg = (Ureg*)v; + + // The kernel will never pass us a nil note or ureg so we probably + // made a mistake somewhere in runtime·sigtramp. 
+ if(ureg == nil || note == nil) { + runtime·printf("sighandler: ureg %p note %p\n", ureg, note); + goto Throw; + } + + // Check that the note is no more than ERRMAX bytes (including + // the trailing NUL). We should never receive a longer note. + len = runtime·findnull((byte*)note); + if(len > ERRMAX-1) { + runtime·printf("sighandler: note is longer than ERRMAX\n"); + goto Throw; + } + + // See if the note matches one of the patterns in runtime·sigtab. + // Notes that do not match any pattern can be handled at a higher + // level by the program but will otherwise be ignored. + flags = SigNotify; + for(sig = 0; sig < nelem(runtime·sigtab); sig++) { + t = &runtime·sigtab[sig]; + n = runtime·findnull((byte*)t->name); + if(len < n) + continue; + if(runtime·strncmp((byte*)note, (byte*)t->name, n) == 0) { + flags = t->flags; + break; + } + } + + if(flags & SigGoExit) + runtime·exits(note+9); // Strip "go: exit " prefix. + + if(flags & SigPanic) { + // Copy the error string from sigtramp's stack into m->notesig so + // we can reliably access it from the panic routines. + runtime·memmove(g->m->notesig, note, len+1); + + gp->sig = sig; + gp->sigpc = ureg->pc; + + // Only push runtime·sigpanic if PC != 0. + // + // If PC == 0, probably panicked because of a call to a nil func. + // Not pushing that onto SP will make the trace look like a call + // to runtime·sigpanic instead. (Otherwise the trace will end at + // runtime·sigpanic and we won't get to see who faulted). + if(ureg->pc != 0) { + sp = (uintptr*)ureg->sp; + *--sp = ureg->pc; + ureg->sp = (uint32)sp; + } + ureg->pc = (uintptr)runtime·sigpanic; + return NCONT; + } + + if(flags & SigNotify) { + // TODO(ality): See if os/signal wants it. + //if(runtime·sigsend(...)) + // return NCONT; + } + if(flags & SigKill) + goto Exit; + if(!(flags & SigThrow)) + return NCONT; + +Throw: + g->m->throwing = 1; + g->m->caughtsig = gp; + runtime·startpanic(); + + runtime·printf("%s\n", note); + runtime·printf("PC=%x\n", ureg->pc); + runtime·printf("\n"); + + if(runtime·gotraceback(&crash)) { + runtime·goroutineheader(gp); + runtime·tracebacktrap(ureg->pc, ureg->sp, 0, gp); + runtime·tracebackothers(gp); + runtime·printf("\n"); + runtime·dumpregs(ureg); + } + + if(crash) + runtime·crash(); + +Exit: + runtime·goexitsall(note); + runtime·exits(note); + return NDFLT; // not reached +} + +void +runtime·sigenable(uint32 sig) +{ + USED(sig); +} + +void +runtime·sigdisable(uint32 sig) +{ + USED(sig); +} + +void +runtime·resetcpuprofiler(int32 hz) +{ + // TODO: Enable profiling interrupts. + + g->m->profilehz = hz; +} diff --git a/src/runtime/os_plan9_amd64.c b/src/runtime/os_plan9_amd64.c new file mode 100644 index 000000000..a9dc0eb96 --- /dev/null +++ b/src/runtime/os_plan9_amd64.c @@ -0,0 +1,158 @@ +// Copyright 2010 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. 
+ +#include "runtime.h" +#include "defs_GOOS_GOARCH.h" +#include "os_GOOS.h" +#include "signals_GOOS.h" + +void +runtime·dumpregs(Ureg *u) +{ + runtime·printf("ax %X\n", u->ax); + runtime·printf("bx %X\n", u->bx); + runtime·printf("cx %X\n", u->cx); + runtime·printf("dx %X\n", u->dx); + runtime·printf("di %X\n", u->di); + runtime·printf("si %X\n", u->si); + runtime·printf("bp %X\n", u->bp); + runtime·printf("sp %X\n", u->sp); + runtime·printf("r8 %X\n", u->r8); + runtime·printf("r9 %X\n", u->r9); + runtime·printf("r10 %X\n", u->r10); + runtime·printf("r11 %X\n", u->r11); + runtime·printf("r12 %X\n", u->r12); + runtime·printf("r13 %X\n", u->r13); + runtime·printf("r14 %X\n", u->r14); + runtime·printf("r15 %X\n", u->r15); + runtime·printf("ip %X\n", u->ip); + runtime·printf("flags %X\n", u->flags); + runtime·printf("cs %X\n", (uint64)u->cs); + runtime·printf("fs %X\n", (uint64)u->fs); + runtime·printf("gs %X\n", (uint64)u->gs); +} + +int32 +runtime·sighandler(void *v, int8 *note, G *gp) +{ + uintptr *sp; + SigTab *t; + bool crash; + Ureg *ureg; + intgo len, n; + int32 sig, flags; + + ureg = (Ureg*)v; + + // The kernel will never pass us a nil note or ureg so we probably + // made a mistake somewhere in runtime·sigtramp. + if(ureg == nil || note == nil) { + runtime·printf("sighandler: ureg %p note %p\n", ureg, note); + goto Throw; + } + + // Check that the note is no more than ERRMAX bytes (including + // the trailing NUL). We should never receive a longer note. + len = runtime·findnull((byte*)note); + if(len > ERRMAX-1) { + runtime·printf("sighandler: note is longer than ERRMAX\n"); + goto Throw; + } + + // See if the note matches one of the patterns in runtime·sigtab. + // Notes that do not match any pattern can be handled at a higher + // level by the program but will otherwise be ignored. + flags = SigNotify; + for(sig = 0; sig < nelem(runtime·sigtab); sig++) { + t = &runtime·sigtab[sig]; + n = runtime·findnull((byte*)t->name); + if(len < n) + continue; + if(runtime·strncmp((byte*)note, (byte*)t->name, n) == 0) { + flags = t->flags; + break; + } + } + + if(flags & SigGoExit) + runtime·exits(note+9); // Strip "go: exit " prefix. + + if(flags & SigPanic) { + // Copy the error string from sigtramp's stack into m->notesig so + // we can reliably access it from the panic routines. + runtime·memmove(g->m->notesig, note, len+1); + + gp->sig = sig; + gp->sigpc = ureg->ip; + + // Only push runtime·sigpanic if PC != 0. + // + // If PC == 0, probably panicked because of a call to a nil func. + // Not pushing that onto SP will make the trace look like a call + // to runtime·sigpanic instead. (Otherwise the trace will end at + // runtime·sigpanic and we won't get to see who faulted). + if(ureg->ip != 0) { + sp = (uintptr*)ureg->sp; + *--sp = ureg->ip; + ureg->sp = (uint64)sp; + } + ureg->ip = (uintptr)runtime·sigpanic; + return NCONT; + } + + if(flags & SigNotify) { + // TODO(ality): See if os/signal wants it. 
+ //if(runtime·sigsend(...)) + // return NCONT; + } + if(flags & SigKill) + goto Exit; + if(!(flags & SigThrow)) + return NCONT; + +Throw: + g->m->throwing = 1; + g->m->caughtsig = gp; + runtime·startpanic(); + + runtime·printf("%s\n", note); + runtime·printf("PC=%X\n", ureg->ip); + runtime·printf("\n"); + + if(runtime·gotraceback(&crash)) { + runtime·goroutineheader(gp); + runtime·tracebacktrap(ureg->ip, ureg->sp, 0, gp); + runtime·tracebackothers(gp); + runtime·printf("\n"); + runtime·dumpregs(ureg); + } + + if(crash) + runtime·crash(); + +Exit: + runtime·goexitsall(note); + runtime·exits(note); + return NDFLT; // not reached +} + +void +runtime·sigenable(uint32 sig) +{ + USED(sig); +} + +void +runtime·sigdisable(uint32 sig) +{ + USED(sig); +} + +void +runtime·resetcpuprofiler(int32 hz) +{ + // TODO: Enable profiling interrupts. + + g->m->profilehz = hz; +} diff --git a/src/runtime/os_solaris.c b/src/runtime/os_solaris.c new file mode 100644 index 000000000..e16b8e637 --- /dev/null +++ b/src/runtime/os_solaris.c @@ -0,0 +1,557 @@ +// Copyright 2011 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#include "runtime.h" +#include "defs_GOOS_GOARCH.h" +#include "os_GOOS.h" +#include "signal_unix.h" +#include "stack.h" +#include "textflag.h" + +#pragma dynexport runtime·end _end +#pragma dynexport runtime·etext _etext +#pragma dynexport runtime·edata _edata + +#pragma dynimport libc·___errno ___errno "libc.so" +#pragma dynimport libc·clock_gettime clock_gettime "libc.so" +#pragma dynimport libc·close close "libc.so" +#pragma dynimport libc·exit exit "libc.so" +#pragma dynimport libc·fstat fstat "libc.so" +#pragma dynimport libc·getcontext getcontext "libc.so" +#pragma dynimport libc·getrlimit getrlimit "libc.so" +#pragma dynimport libc·malloc malloc "libc.so" +#pragma dynimport libc·mmap mmap "libc.so" +#pragma dynimport libc·munmap munmap "libc.so" +#pragma dynimport libc·open open "libc.so" +#pragma dynimport libc·pthread_attr_destroy pthread_attr_destroy "libc.so" +#pragma dynimport libc·pthread_attr_getstack pthread_attr_getstack "libc.so" +#pragma dynimport libc·pthread_attr_init pthread_attr_init "libc.so" +#pragma dynimport libc·pthread_attr_setdetachstate pthread_attr_setdetachstate "libc.so" +#pragma dynimport libc·pthread_attr_setstack pthread_attr_setstack "libc.so" +#pragma dynimport libc·pthread_create pthread_create "libc.so" +#pragma dynimport libc·raise raise "libc.so" +#pragma dynimport libc·read read "libc.so" +#pragma dynimport libc·select select "libc.so" +#pragma dynimport libc·sched_yield sched_yield "libc.so" +#pragma dynimport libc·sem_init sem_init "libc.so" +#pragma dynimport libc·sem_post sem_post "libc.so" +#pragma dynimport libc·sem_reltimedwait_np sem_reltimedwait_np "libc.so" +#pragma dynimport libc·sem_wait sem_wait "libc.so" +#pragma dynimport libc·setitimer setitimer "libc.so" +#pragma dynimport libc·sigaction sigaction "libc.so" +#pragma dynimport libc·sigaltstack sigaltstack "libc.so" +#pragma dynimport libc·sigprocmask sigprocmask "libc.so" +#pragma dynimport libc·sysconf sysconf "libc.so" +#pragma dynimport libc·usleep usleep "libc.so" +#pragma dynimport libc·write write "libc.so" + +extern uintptr libc·___errno; +extern uintptr libc·clock_gettime; +extern uintptr libc·close; +extern uintptr libc·exit; +extern uintptr libc·fstat; +extern uintptr libc·getcontext; +extern uintptr libc·getrlimit; +extern uintptr libc·malloc; +extern uintptr libc·mmap; +extern 
uintptr libc·munmap; +extern uintptr libc·open; +extern uintptr libc·pthread_attr_destroy; +extern uintptr libc·pthread_attr_getstack; +extern uintptr libc·pthread_attr_init; +extern uintptr libc·pthread_attr_setdetachstate; +extern uintptr libc·pthread_attr_setstack; +extern uintptr libc·pthread_create; +extern uintptr libc·raise; +extern uintptr libc·read; +extern uintptr libc·sched_yield; +extern uintptr libc·select; +extern uintptr libc·sem_init; +extern uintptr libc·sem_post; +extern uintptr libc·sem_reltimedwait_np; +extern uintptr libc·sem_wait; +extern uintptr libc·setitimer; +extern uintptr libc·sigaction; +extern uintptr libc·sigaltstack; +extern uintptr libc·sigprocmask; +extern uintptr libc·sysconf; +extern uintptr libc·usleep; +extern uintptr libc·write; + +void runtime·getcontext(Ucontext *context); +int32 runtime·pthread_attr_destroy(PthreadAttr* attr); +int32 runtime·pthread_attr_init(PthreadAttr* attr); +int32 runtime·pthread_attr_getstack(PthreadAttr* attr, void** addr, uint64* size); +int32 runtime·pthread_attr_setdetachstate(PthreadAttr* attr, int32 state); +int32 runtime·pthread_attr_setstack(PthreadAttr* attr, void* addr, uint64 size); +int32 runtime·pthread_create(Pthread* thread, PthreadAttr* attr, void(*fn)(void), void *arg); +uint32 runtime·tstart_sysvicall(M *newm); +int32 runtime·sem_init(SemT* sem, int32 pshared, uint32 value); +int32 runtime·sem_post(SemT* sem); +int32 runtime·sem_reltimedwait_np(SemT* sem, Timespec* timeout); +int32 runtime·sem_wait(SemT* sem); +int64 runtime·sysconf(int32 name); + +extern SigTab runtime·sigtab[]; +static Sigset sigset_none; +static Sigset sigset_all = { ~(uint32)0, ~(uint32)0, ~(uint32)0, ~(uint32)0, }; + +static int32 +getncpu(void) +{ + int32 n; + + n = (int32)runtime·sysconf(_SC_NPROCESSORS_ONLN); + if(n < 1) + return 1; + return n; +} + +void +runtime·osinit(void) +{ + runtime·ncpu = getncpu(); +} + +void +runtime·newosproc(M *mp, void *stk) +{ + PthreadAttr attr; + Sigset oset; + Pthread tid; + int32 ret; + uint64 size; + + USED(stk); + if(runtime·pthread_attr_init(&attr) != 0) + runtime·throw("pthread_attr_init"); + if(runtime·pthread_attr_setstack(&attr, 0, 0x200000) != 0) + runtime·throw("pthread_attr_setstack"); + size = 0; + if(runtime·pthread_attr_getstack(&attr, (void**)&mp->g0->stack.hi, &size) != 0) + runtime·throw("pthread_attr_getstack"); + mp->g0->stack.lo = mp->g0->stack.hi - size; + if(runtime·pthread_attr_setdetachstate(&attr, PTHREAD_CREATE_DETACHED) != 0) + runtime·throw("pthread_attr_setdetachstate"); + + // Disable signals during create, so that the new thread starts + // with signals disabled. It will enable them in minit. 
+ runtime·sigprocmask(SIG_SETMASK, &sigset_all, &oset); + ret = runtime·pthread_create(&tid, &attr, (void (*)(void))runtime·tstart_sysvicall, mp); + runtime·sigprocmask(SIG_SETMASK, &oset, nil); + if(ret != 0) { + runtime·printf("runtime: failed to create new OS thread (have %d already; errno=%d)\n", runtime·mcount(), ret); + runtime·throw("runtime.newosproc"); + } +} + +#pragma textflag NOSPLIT +void +runtime·get_random_data(byte **rnd, int32 *rnd_len) +{ + #pragma dataflag NOPTR + static byte urandom_data[HashRandomBytes]; + int32 fd; + fd = runtime·open("/dev/urandom", 0 /* O_RDONLY */, 0); + if(runtime·read(fd, urandom_data, HashRandomBytes) == HashRandomBytes) { + *rnd = urandom_data; + *rnd_len = HashRandomBytes; + } else { + *rnd = nil; + *rnd_len = 0; + } + runtime·close(fd); +} + +void +runtime·goenvs(void) +{ + runtime·goenvs_unix(); +} + +// Called to initialize a new m (including the bootstrap m). +// Called on the parent thread (main thread in case of bootstrap), can allocate memory. +void +runtime·mpreinit(M *mp) +{ + mp->gsignal = runtime·malg(32*1024); + mp->gsignal->m = mp; +} + +// Called to initialize a new m (including the bootstrap m). +// Called on the new thread, can not allocate memory. +void +runtime·minit(void) +{ + runtime·asmcgocall(runtime·miniterrno, (void *)libc·___errno); + // Initialize signal handling + runtime·signalstack((byte*)g->m->gsignal->stack.lo, 32*1024); + runtime·sigprocmask(SIG_SETMASK, &sigset_none, nil); +} + +// Called from dropm to undo the effect of an minit. +void +runtime·unminit(void) +{ + runtime·signalstack(nil, 0); +} + +uintptr +runtime·memlimit(void) +{ + Rlimit rl; + extern byte runtime·text[], runtime·end[]; + uintptr used; + + if(runtime·getrlimit(RLIMIT_AS, &rl) != 0) + return 0; + if(rl.rlim_cur >= 0x7fffffff) + return 0; + + // Estimate our VM footprint excluding the heap. + // Not an exact science: use size of binary plus + // some room for thread stacks. + used = runtime·end - runtime·text + (64<<20); + if(used >= rl.rlim_cur) + return 0; + + // If there's not at least 16 MB left, we're probably + // not going to be able to do much. Treat as no limit. + rl.rlim_cur -= used; + if(rl.rlim_cur < (16<<20)) + return 0; + + return rl.rlim_cur - used; +} + +void +runtime·setprof(bool on) +{ + USED(on); +} + +extern void runtime·sigtramp(void); + +void +runtime·setsig(int32 i, GoSighandler *fn, bool restart) +{ + SigactionT sa; + + runtime·memclr((byte*)&sa, sizeof sa); + sa.sa_flags = SA_SIGINFO|SA_ONSTACK; + if(restart) + sa.sa_flags |= SA_RESTART; + sa.sa_mask.__sigbits[0] = ~(uint32)0; + sa.sa_mask.__sigbits[1] = ~(uint32)0; + sa.sa_mask.__sigbits[2] = ~(uint32)0; + sa.sa_mask.__sigbits[3] = ~(uint32)0; + if(fn == runtime·sighandler) + fn = (void*)runtime·sigtramp; + *((void**)&sa._funcptr[0]) = (void*)fn; + runtime·sigaction(i, &sa, nil); +} + +GoSighandler* +runtime·getsig(int32 i) +{ + SigactionT sa; + + runtime·memclr((byte*)&sa, sizeof sa); + runtime·sigaction(i, nil, &sa); + if(*((void**)&sa._funcptr[0]) == runtime·sigtramp) + return runtime·sighandler; + return *((void**)&sa._funcptr[0]); +} + +void +runtime·signalstack(byte *p, int32 n) +{ + StackT st; + + st.ss_sp = (void*)p; + st.ss_size = n; + st.ss_flags = 0; + if(p == nil) + st.ss_flags = SS_DISABLE; + runtime·sigaltstack(&st, nil); +} + +void +runtime·unblocksignals(void) +{ + runtime·sigprocmask(SIG_SETMASK, &sigset_none, nil); +} + +#pragma textflag NOSPLIT +uintptr +runtime·semacreate(void) +{ + SemT* sem; + + // Call libc's malloc rather than runtime·malloc. 
This will + // allocate space on the C heap. We can't call runtime·malloc + // here because it could cause a deadlock. + g->m->libcall.fn = (uintptr)(void*)libc·malloc; + g->m->libcall.n = 1; + runtime·memclr((byte*)&g->m->scratch, sizeof(g->m->scratch)); + g->m->scratch.v[0] = (uintptr)sizeof(*sem); + g->m->libcall.args = (uintptr)(uintptr*)&g->m->scratch; + runtime·asmcgocall(runtime·asmsysvicall6, &g->m->libcall); + sem = (void*)g->m->libcall.r1; + if(runtime·sem_init(sem, 0, 0) != 0) + runtime·throw("sem_init"); + return (uintptr)sem; +} + +#pragma textflag NOSPLIT +int32 +runtime·semasleep(int64 ns) +{ + M *m; + + m = g->m; + if(ns >= 0) { + m->ts.tv_sec = ns / 1000000000LL; + m->ts.tv_nsec = ns % 1000000000LL; + + m->libcall.fn = (uintptr)(void*)libc·sem_reltimedwait_np; + m->libcall.n = 2; + runtime·memclr((byte*)&m->scratch, sizeof(m->scratch)); + m->scratch.v[0] = m->waitsema; + m->scratch.v[1] = (uintptr)&m->ts; + m->libcall.args = (uintptr)(uintptr*)&m->scratch; + runtime·asmcgocall(runtime·asmsysvicall6, &m->libcall); + if(*m->perrno != 0) { + if(*m->perrno == ETIMEDOUT || *m->perrno == EAGAIN || *m->perrno == EINTR) + return -1; + runtime·throw("sem_reltimedwait_np"); + } + return 0; + } + for(;;) { + m->libcall.fn = (uintptr)(void*)libc·sem_wait; + m->libcall.n = 1; + runtime·memclr((byte*)&m->scratch, sizeof(m->scratch)); + m->scratch.v[0] = m->waitsema; + m->libcall.args = (uintptr)(uintptr*)&m->scratch; + runtime·asmcgocall(runtime·asmsysvicall6, &m->libcall); + if(m->libcall.r1 == 0) + break; + if(*m->perrno == EINTR) + continue; + runtime·throw("sem_wait"); + } + return 0; +} + +#pragma textflag NOSPLIT +void +runtime·semawakeup(M *mp) +{ + SemT* sem = (SemT*)mp->waitsema; + if(runtime·sem_post(sem) != 0) + runtime·throw("sem_post"); +} + +#pragma textflag NOSPLIT +int32 +runtime·close(int32 fd) +{ + return runtime·sysvicall1(libc·close, (uintptr)fd); +} + +#pragma textflag NOSPLIT +void +runtime·exit(int32 r) +{ + runtime·sysvicall1(libc·exit, (uintptr)r); +} + +#pragma textflag NOSPLIT +/* int32 */ void +runtime·getcontext(Ucontext* context) +{ + runtime·sysvicall1(libc·getcontext, (uintptr)context); +} + +#pragma textflag NOSPLIT +int32 +runtime·getrlimit(int32 res, Rlimit* rlp) +{ + return runtime·sysvicall2(libc·getrlimit, (uintptr)res, (uintptr)rlp); +} + +#pragma textflag NOSPLIT +uint8* +runtime·mmap(byte* addr, uintptr len, int32 prot, int32 flags, int32 fildes, uint32 off) +{ + return (uint8*)runtime·sysvicall6(libc·mmap, (uintptr)addr, (uintptr)len, (uintptr)prot, (uintptr)flags, (uintptr)fildes, (uintptr)off); +} + +#pragma textflag NOSPLIT +void +runtime·munmap(byte* addr, uintptr len) +{ + runtime·sysvicall2(libc·munmap, (uintptr)addr, (uintptr)len); +} + +extern int64 runtime·nanotime1(void); +#pragma textflag NOSPLIT +int64 +runtime·nanotime(void) +{ + return runtime·sysvicall0((uintptr)runtime·nanotime1); +} + +#pragma textflag NOSPLIT +int32 +runtime·open(int8* path, int32 oflag, int32 mode) +{ + return runtime·sysvicall3(libc·open, (uintptr)path, (uintptr)oflag, (uintptr)mode); +} + +int32 +runtime·pthread_attr_destroy(PthreadAttr* attr) +{ + return runtime·sysvicall1(libc·pthread_attr_destroy, (uintptr)attr); +} + +int32 +runtime·pthread_attr_getstack(PthreadAttr* attr, void** addr, uint64* size) +{ + return runtime·sysvicall3(libc·pthread_attr_getstack, (uintptr)attr, (uintptr)addr, (uintptr)size); +} + +int32 +runtime·pthread_attr_init(PthreadAttr* attr) +{ + return runtime·sysvicall1(libc·pthread_attr_init, (uintptr)attr); +} + +int32 
+runtime·pthread_attr_setdetachstate(PthreadAttr* attr, int32 state) +{ + return runtime·sysvicall2(libc·pthread_attr_setdetachstate, (uintptr)attr, (uintptr)state); +} + +int32 +runtime·pthread_attr_setstack(PthreadAttr* attr, void* addr, uint64 size) +{ + return runtime·sysvicall3(libc·pthread_attr_setstack, (uintptr)attr, (uintptr)addr, (uintptr)size); +} + +int32 +runtime·pthread_create(Pthread* thread, PthreadAttr* attr, void(*fn)(void), void *arg) +{ + return runtime·sysvicall4(libc·pthread_create, (uintptr)thread, (uintptr)attr, (uintptr)fn, (uintptr)arg); +} + +/* int32 */ void +runtime·raise(int32 sig) +{ + runtime·sysvicall1(libc·raise, (uintptr)sig); +} + +#pragma textflag NOSPLIT +int32 +runtime·read(int32 fd, void* buf, int32 nbyte) +{ + return runtime·sysvicall3(libc·read, (uintptr)fd, (uintptr)buf, (uintptr)nbyte); +} + +#pragma textflag NOSPLIT +int32 +runtime·sem_init(SemT* sem, int32 pshared, uint32 value) +{ + return runtime·sysvicall3(libc·sem_init, (uintptr)sem, (uintptr)pshared, (uintptr)value); +} + +#pragma textflag NOSPLIT +int32 +runtime·sem_post(SemT* sem) +{ + return runtime·sysvicall1(libc·sem_post, (uintptr)sem); +} + +#pragma textflag NOSPLIT +int32 +runtime·sem_reltimedwait_np(SemT* sem, Timespec* timeout) +{ + return runtime·sysvicall2(libc·sem_reltimedwait_np, (uintptr)sem, (uintptr)timeout); +} + +#pragma textflag NOSPLIT +int32 +runtime·sem_wait(SemT* sem) +{ + return runtime·sysvicall1(libc·sem_wait, (uintptr)sem); +} + +/* int32 */ void +runtime·setitimer(int32 which, Itimerval* value, Itimerval* ovalue) +{ + runtime·sysvicall3(libc·setitimer, (uintptr)which, (uintptr)value, (uintptr)ovalue); +} + +/* int32 */ void +runtime·sigaction(int32 sig, struct SigactionT* act, struct SigactionT* oact) +{ + runtime·sysvicall3(libc·sigaction, (uintptr)sig, (uintptr)act, (uintptr)oact); +} + +/* int32 */ void +runtime·sigaltstack(SigaltstackT* ss, SigaltstackT* oss) +{ + runtime·sysvicall2(libc·sigaltstack, (uintptr)ss, (uintptr)oss); +} + +/* int32 */ void +runtime·sigprocmask(int32 how, Sigset* set, Sigset* oset) +{ + runtime·sysvicall3(libc·sigprocmask, (uintptr)how, (uintptr)set, (uintptr)oset); +} + +int64 +runtime·sysconf(int32 name) +{ + return runtime·sysvicall1(libc·sysconf, (uintptr)name); +} + +extern void runtime·usleep1(uint32); + +#pragma textflag NOSPLIT +void +runtime·usleep(uint32 µs) +{ + runtime·usleep1(µs); +} + +#pragma textflag NOSPLIT +int32 +runtime·write(uintptr fd, void* buf, int32 nbyte) +{ + return runtime·sysvicall3(libc·write, (uintptr)fd, (uintptr)buf, (uintptr)nbyte); +} + +extern void runtime·osyield1(void); + +#pragma textflag NOSPLIT +void +runtime·osyield(void) +{ + // Check the validity of m because we might be called in cgo callback + // path early enough where there isn't a m available yet. + if(g && g->m != nil) { + runtime·sysvicall0(libc·sched_yield); + return; + } + runtime·osyield1(); +} + +#pragma textflag NOSPLIT +int8* +runtime·signame(int32 sig) +{ + return runtime·sigtab[sig].name; +} diff --git a/src/runtime/os_solaris.go b/src/runtime/os_solaris.go new file mode 100644 index 000000000..ca1315120 --- /dev/null +++ b/src/runtime/os_solaris.go @@ -0,0 +1,100 @@ +// Copyright 2014 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. 
+ +package runtime + +import "unsafe" + +func setitimer(mode int32, new, old unsafe.Pointer) +func sigaction(sig int32, new, old unsafe.Pointer) +func sigaltstack(new, old unsafe.Pointer) +func sigprocmask(mode int32, new, old unsafe.Pointer) +func sysctl(mib *uint32, miblen uint32, out *byte, size *uintptr, dst *byte, ndst uintptr) int32 +func getrlimit(kind int32, limit unsafe.Pointer) +func miniterrno(fn unsafe.Pointer) +func raise(sig int32) +func getcontext(ctxt unsafe.Pointer) +func tstart_sysvicall(mm unsafe.Pointer) uint32 +func nanotime1() int64 +func usleep1(usec uint32) +func osyield1() +func netpollinit() +func netpollopen(fd uintptr, pd *pollDesc) int32 +func netpollclose(fd uintptr) int32 +func netpollarm(pd *pollDesc, mode int) + +type libcFunc byte + +var asmsysvicall6 libcFunc + +//go:nosplit +func sysvicall0(fn *libcFunc) uintptr { + libcall := &getg().m.libcall + libcall.fn = uintptr(unsafe.Pointer(fn)) + libcall.n = 0 + // TODO(rsc): Why is noescape necessary here and below? + libcall.args = uintptr(noescape(unsafe.Pointer(&fn))) // it's unused but must be non-nil, otherwise crashes + asmcgocall(unsafe.Pointer(&asmsysvicall6), unsafe.Pointer(libcall)) + return libcall.r1 +} + +//go:nosplit +func sysvicall1(fn *libcFunc, a1 uintptr) uintptr { + libcall := &getg().m.libcall + libcall.fn = uintptr(unsafe.Pointer(fn)) + libcall.n = 1 + libcall.args = uintptr(noescape(unsafe.Pointer(&a1))) + asmcgocall(unsafe.Pointer(&asmsysvicall6), unsafe.Pointer(libcall)) + return libcall.r1 +} + +//go:nosplit +func sysvicall2(fn *libcFunc, a1, a2 uintptr) uintptr { + libcall := &getg().m.libcall + libcall.fn = uintptr(unsafe.Pointer(fn)) + libcall.n = 2 + libcall.args = uintptr(noescape(unsafe.Pointer(&a1))) + asmcgocall(unsafe.Pointer(&asmsysvicall6), unsafe.Pointer(libcall)) + return libcall.r1 +} + +//go:nosplit +func sysvicall3(fn *libcFunc, a1, a2, a3 uintptr) uintptr { + libcall := &getg().m.libcall + libcall.fn = uintptr(unsafe.Pointer(fn)) + libcall.n = 3 + libcall.args = uintptr(noescape(unsafe.Pointer(&a1))) + asmcgocall(unsafe.Pointer(&asmsysvicall6), unsafe.Pointer(libcall)) + return libcall.r1 +} + +//go:nosplit +func sysvicall4(fn *libcFunc, a1, a2, a3, a4 uintptr) uintptr { + libcall := &getg().m.libcall + libcall.fn = uintptr(unsafe.Pointer(fn)) + libcall.n = 4 + libcall.args = uintptr(noescape(unsafe.Pointer(&a1))) + asmcgocall(unsafe.Pointer(&asmsysvicall6), unsafe.Pointer(libcall)) + return libcall.r1 +} + +//go:nosplit +func sysvicall5(fn *libcFunc, a1, a2, a3, a4, a5 uintptr) uintptr { + libcall := &getg().m.libcall + libcall.fn = uintptr(unsafe.Pointer(fn)) + libcall.n = 5 + libcall.args = uintptr(noescape(unsafe.Pointer(&a1))) + asmcgocall(unsafe.Pointer(&asmsysvicall6), unsafe.Pointer(libcall)) + return libcall.r1 +} + +//go:nosplit +func sysvicall6(fn *libcFunc, a1, a2, a3, a4, a5, a6 uintptr) uintptr { + libcall := &getg().m.libcall + libcall.fn = uintptr(unsafe.Pointer(fn)) + libcall.n = 6 + libcall.args = uintptr(noescape(unsafe.Pointer(&a1))) + asmcgocall(unsafe.Pointer(&asmsysvicall6), unsafe.Pointer(libcall)) + return libcall.r1 +} diff --git a/src/runtime/os_solaris.h b/src/runtime/os_solaris.h new file mode 100644 index 000000000..3d9e1a240 --- /dev/null +++ b/src/runtime/os_solaris.h @@ -0,0 +1,55 @@ +// Copyright 2014 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. 
+ + +typedef uintptr kevent_udata; + +struct sigaction; + +void runtime·sigpanic(void); + +void runtime·setitimer(int32, Itimerval*, Itimerval*); +void runtime·sigaction(int32, struct SigactionT*, struct SigactionT*); +void runtime·sigaltstack(SigaltstackT*, SigaltstackT*); +void runtime·sigprocmask(int32, Sigset*, Sigset*); +void runtime·unblocksignals(void); +int32 runtime·sysctl(uint32*, uint32, byte*, uintptr*, byte*, uintptr); + + +void runtime·raisesigpipe(void); +void runtime·setsig(int32, void(*)(int32, Siginfo*, void*, G*), bool); +void runtime·sighandler(int32 sig, Siginfo *info, void *context, G *gp); +void runtime·sigpanic(void); + +enum { + SS_DISABLE = 2, + SIG_BLOCK = 1, + SIG_UNBLOCK = 2, + SIG_SETMASK = 3, + NSIG = 73, /* number of signals in runtime·SigTab array */ + SI_USER = 0, + _UC_SIGMASK = 0x01, + _UC_CPU = 0x04, + RLIMIT_AS = 10, +}; + +typedef struct Rlimit Rlimit; +struct Rlimit { + int64 rlim_cur; + int64 rlim_max; +}; +int32 runtime·getrlimit(int32, Rlimit*); + +// Call an external library function described by {fn, a0, ..., an}, with +// SysV conventions, switching to os stack during the call, if necessary. +uintptr runtime·sysvicall0(uintptr fn); +uintptr runtime·sysvicall1(uintptr fn, uintptr a1); +uintptr runtime·sysvicall2(uintptr fn, uintptr a1, uintptr a2); +uintptr runtime·sysvicall3(uintptr fn, uintptr a1, uintptr a2, uintptr a3); +uintptr runtime·sysvicall4(uintptr fn, uintptr a1, uintptr a2, uintptr a3, uintptr a4); +uintptr runtime·sysvicall5(uintptr fn, uintptr a1, uintptr a2, uintptr a3, uintptr a4, uintptr a5); +uintptr runtime·sysvicall6(uintptr fn, uintptr a1, uintptr a2, uintptr a3, uintptr a4, uintptr a5, uintptr a6); +void runtime·asmsysvicall6(void *c); + +void runtime·miniterrno(void *fn); diff --git a/src/runtime/os_windows.c b/src/runtime/os_windows.c new file mode 100644 index 000000000..b8b8eda5f --- /dev/null +++ b/src/runtime/os_windows.c @@ -0,0 +1,636 @@ +// Copyright 2009 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. 
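+// Each #pragma dynimport below binds a runtime·Xxx pointer to the named DLL export;
+// calls go through the stdcall helpers further down, which switch to the OS stack.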
+ +#include "runtime.h" +#include "type.h" +#include "defs_GOOS_GOARCH.h" +#include "os_GOOS.h" +#include "textflag.h" +#include "arch_GOARCH.h" +#include "malloc.h" + +#pragma dynimport runtime·AddVectoredExceptionHandler AddVectoredExceptionHandler "kernel32.dll" +#pragma dynimport runtime·CloseHandle CloseHandle "kernel32.dll" +#pragma dynimport runtime·CreateEvent CreateEventA "kernel32.dll" +#pragma dynimport runtime·CreateThread CreateThread "kernel32.dll" +#pragma dynimport runtime·CreateWaitableTimer CreateWaitableTimerA "kernel32.dll" +#pragma dynimport runtime·CryptAcquireContextW CryptAcquireContextW "advapi32.dll" +#pragma dynimport runtime·CryptGenRandom CryptGenRandom "advapi32.dll" +#pragma dynimport runtime·CryptReleaseContext CryptReleaseContext "advapi32.dll" +#pragma dynimport runtime·DuplicateHandle DuplicateHandle "kernel32.dll" +#pragma dynimport runtime·ExitProcess ExitProcess "kernel32.dll" +#pragma dynimport runtime·FreeEnvironmentStringsW FreeEnvironmentStringsW "kernel32.dll" +#pragma dynimport runtime·GetEnvironmentStringsW GetEnvironmentStringsW "kernel32.dll" +#pragma dynimport runtime·GetProcAddress GetProcAddress "kernel32.dll" +#pragma dynimport runtime·GetStdHandle GetStdHandle "kernel32.dll" +#pragma dynimport runtime·GetSystemInfo GetSystemInfo "kernel32.dll" +#pragma dynimport runtime·GetThreadContext GetThreadContext "kernel32.dll" +#pragma dynimport runtime·LoadLibrary LoadLibraryW "kernel32.dll" +#pragma dynimport runtime·LoadLibraryA LoadLibraryA "kernel32.dll" +#pragma dynimport runtime·NtWaitForSingleObject NtWaitForSingleObject "ntdll.dll" +#pragma dynimport runtime·ResumeThread ResumeThread "kernel32.dll" +#pragma dynimport runtime·SetConsoleCtrlHandler SetConsoleCtrlHandler "kernel32.dll" +#pragma dynimport runtime·SetEvent SetEvent "kernel32.dll" +#pragma dynimport runtime·SetProcessPriorityBoost SetProcessPriorityBoost "kernel32.dll" +#pragma dynimport runtime·SetThreadPriority SetThreadPriority "kernel32.dll" +#pragma dynimport runtime·SetUnhandledExceptionFilter SetUnhandledExceptionFilter "kernel32.dll" +#pragma dynimport runtime·SetWaitableTimer SetWaitableTimer "kernel32.dll" +#pragma dynimport runtime·Sleep Sleep "kernel32.dll" +#pragma dynimport runtime·SuspendThread SuspendThread "kernel32.dll" +#pragma dynimport runtime·WaitForSingleObject WaitForSingleObject "kernel32.dll" +#pragma dynimport runtime·WriteFile WriteFile "kernel32.dll" +#pragma dynimport runtime·timeBeginPeriod timeBeginPeriod "winmm.dll" + +extern void *runtime·AddVectoredExceptionHandler; +extern void *runtime·CloseHandle; +extern void *runtime·CreateEvent; +extern void *runtime·CreateThread; +extern void *runtime·CreateWaitableTimer; +extern void *runtime·CryptAcquireContextW; +extern void *runtime·CryptGenRandom; +extern void *runtime·CryptReleaseContext; +extern void *runtime·DuplicateHandle; +extern void *runtime·ExitProcess; +extern void *runtime·FreeEnvironmentStringsW; +extern void *runtime·GetEnvironmentStringsW; +extern void *runtime·GetProcAddress; +extern void *runtime·GetStdHandle; +extern void *runtime·GetSystemInfo; +extern void *runtime·GetThreadContext; +extern void *runtime·LoadLibrary; +extern void *runtime·LoadLibraryA; +extern void *runtime·NtWaitForSingleObject; +extern void *runtime·ResumeThread; +extern void *runtime·SetConsoleCtrlHandler; +extern void *runtime·SetEvent; +extern void *runtime·SetProcessPriorityBoost; +extern void *runtime·SetThreadPriority; +extern void *runtime·SetUnhandledExceptionFilter; +extern void 
*runtime·SetWaitableTimer; +extern void *runtime·Sleep; +extern void *runtime·SuspendThread; +extern void *runtime·WaitForSingleObject; +extern void *runtime·WriteFile; +extern void *runtime·timeBeginPeriod; + +#pragma dataflag NOPTR +void *runtime·GetQueuedCompletionStatusEx; + +extern uintptr runtime·externalthreadhandlerp; +void runtime·externalthreadhandler(void); +void runtime·exceptiontramp(void); +void runtime·firstcontinuetramp(void); +void runtime·lastcontinuetramp(void); + +#pragma textflag NOSPLIT +uintptr +runtime·getLoadLibrary(void) +{ + return (uintptr)runtime·LoadLibrary; +} + +#pragma textflag NOSPLIT +uintptr +runtime·getGetProcAddress(void) +{ + return (uintptr)runtime·GetProcAddress; +} + +static int32 +getproccount(void) +{ + SystemInfo info; + + runtime·stdcall1(runtime·GetSystemInfo, (uintptr)&info); + return info.dwNumberOfProcessors; +} + +void +runtime·osinit(void) +{ + void *kernel32; + void *addVectoredContinueHandler; + + kernel32 = runtime·stdcall1(runtime·LoadLibraryA, (uintptr)"kernel32.dll"); + + runtime·externalthreadhandlerp = (uintptr)runtime·externalthreadhandler; + + runtime·stdcall2(runtime·AddVectoredExceptionHandler, 1, (uintptr)runtime·exceptiontramp); + addVectoredContinueHandler = nil; + if(kernel32 != nil) + addVectoredContinueHandler = runtime·stdcall2(runtime·GetProcAddress, (uintptr)kernel32, (uintptr)"AddVectoredContinueHandler"); + if(addVectoredContinueHandler == nil || sizeof(void*) == 4) { + // use SetUnhandledExceptionFilter for windows-386 or + // if VectoredContinueHandler is unavailable. + // note: SetUnhandledExceptionFilter handler won't be called, if debugging. + runtime·stdcall1(runtime·SetUnhandledExceptionFilter, (uintptr)runtime·lastcontinuetramp); + } else { + runtime·stdcall2(addVectoredContinueHandler, 1, (uintptr)runtime·firstcontinuetramp); + runtime·stdcall2(addVectoredContinueHandler, 0, (uintptr)runtime·lastcontinuetramp); + } + + runtime·stdcall2(runtime·SetConsoleCtrlHandler, (uintptr)runtime·ctrlhandler, 1); + + runtime·stdcall1(runtime·timeBeginPeriod, 1); + + runtime·ncpu = getproccount(); + + // Windows dynamic priority boosting assumes that a process has different types + // of dedicated threads -- GUI, IO, computational, etc. Go processes use + // equivalent threads that all do a mix of GUI, IO, computations, etc. + // In such context dynamic priority boosting does nothing but harm, so we turn it off. 
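+	// -1 is the current-process pseudo handle; the second argument (TRUE) disables boosting.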
+ runtime·stdcall2(runtime·SetProcessPriorityBoost, -1, 1); + + if(kernel32 != nil) { + runtime·GetQueuedCompletionStatusEx = runtime·stdcall2(runtime·GetProcAddress, (uintptr)kernel32, (uintptr)"GetQueuedCompletionStatusEx"); + } +} + +#pragma textflag NOSPLIT +void +runtime·get_random_data(byte **rnd, int32 *rnd_len) +{ + uintptr handle; + *rnd = nil; + *rnd_len = 0; + if(runtime·stdcall5(runtime·CryptAcquireContextW, (uintptr)&handle, (uintptr)nil, (uintptr)nil, + 1 /* PROV_RSA_FULL */, + 0xf0000000U /* CRYPT_VERIFYCONTEXT */) != 0) { + static byte random_data[HashRandomBytes]; + if(runtime·stdcall3(runtime·CryptGenRandom, handle, HashRandomBytes, (uintptr)&random_data[0])) { + *rnd = random_data; + *rnd_len = HashRandomBytes; + } + runtime·stdcall2(runtime·CryptReleaseContext, handle, 0); + } +} + +void +runtime·goenvs(void) +{ + extern Slice runtime·envs; + + uint16 *env; + String *s; + int32 i, n; + uint16 *p; + + env = runtime·stdcall0(runtime·GetEnvironmentStringsW); + + n = 0; + for(p=env; *p; n++) + p += runtime·findnullw(p)+1; + + runtime·envs = runtime·makeStringSlice(n); + s = (String*)runtime·envs.array; + + p = env; + for(i=0; i<n; i++) { + s[i] = runtime·gostringw(p); + p += runtime·findnullw(p)+1; + } + + runtime·stdcall1(runtime·FreeEnvironmentStringsW, (uintptr)env); +} + +#pragma textflag NOSPLIT +void +runtime·exit(int32 code) +{ + runtime·stdcall1(runtime·ExitProcess, code); +} + +#pragma textflag NOSPLIT +int32 +runtime·write(uintptr fd, void *buf, int32 n) +{ + void *handle; + uint32 written; + + written = 0; + switch(fd) { + case 1: + handle = runtime·stdcall1(runtime·GetStdHandle, -11); + break; + case 2: + handle = runtime·stdcall1(runtime·GetStdHandle, -12); + break; + default: + // assume fd is real windows handle. + handle = (void*)fd; + break; + } + runtime·stdcall5(runtime·WriteFile, (uintptr)handle, (uintptr)buf, n, (uintptr)&written, 0); + return written; +} + +#define INFINITE ((uintptr)0xFFFFFFFF) + +#pragma textflag NOSPLIT +int32 +runtime·semasleep(int64 ns) +{ + // store ms in ns to save stack space + if(ns < 0) + ns = INFINITE; + else { + ns = runtime·timediv(ns, 1000000, nil); + if(ns == 0) + ns = 1; + } + if(runtime·stdcall2(runtime·WaitForSingleObject, (uintptr)g->m->waitsema, ns) != 0) + return -1; // timeout + return 0; +} + +#pragma textflag NOSPLIT +void +runtime·semawakeup(M *mp) +{ + runtime·stdcall1(runtime·SetEvent, mp->waitsema); +} + +#pragma textflag NOSPLIT +uintptr +runtime·semacreate(void) +{ + return (uintptr)runtime·stdcall4(runtime·CreateEvent, 0, 0, 0, 0); +} + +#define STACK_SIZE_PARAM_IS_A_RESERVATION ((uintptr)0x00010000) + +void +runtime·newosproc(M *mp, void *stk) +{ + void *thandle; + + USED(stk); + + thandle = runtime·stdcall6(runtime·CreateThread, + (uintptr)nil, 0x20000, (uintptr)runtime·tstart_stdcall, (uintptr)mp, + STACK_SIZE_PARAM_IS_A_RESERVATION, (uintptr)nil); + if(thandle == nil) { + runtime·printf("runtime: failed to create new OS thread (have %d already; errno=%d)\n", runtime·mcount(), runtime·getlasterror()); + runtime·throw("runtime.newosproc"); + } +} + +// Called to initialize a new m (including the bootstrap m). +// Called on the parent thread (main thread in case of bootstrap), can allocate memory. +void +runtime·mpreinit(M *mp) +{ + USED(mp); +} + +// Called to initialize a new m (including the bootstrap m). +// Called on the new thread, can not allocate memory. 
+void +runtime·minit(void) +{ + uintptr thandle; + + // -1 = current process, -2 = current thread + runtime·stdcall7(runtime·DuplicateHandle, -1, -2, -1, (uintptr)&thandle, 0, 0, DUPLICATE_SAME_ACCESS); + runtime·atomicstoreuintptr(&g->m->thread, thandle); +} + +// Called from dropm to undo the effect of an minit. +void +runtime·unminit(void) +{ + runtime·stdcall1(runtime·CloseHandle, g->m->thread); + g->m->thread = 0; +} + +// Described in http://www.dcl.hpi.uni-potsdam.de/research/WRK/2007/08/getting-os-information-the-kuser_shared_data-structure/ +typedef struct KSYSTEM_TIME { + uint32 LowPart; + int32 High1Time; + int32 High2Time; +} KSYSTEM_TIME; + +#pragma dataflag NOPTR +const KSYSTEM_TIME* INTERRUPT_TIME = (KSYSTEM_TIME*)0x7ffe0008; +#pragma dataflag NOPTR +const KSYSTEM_TIME* SYSTEM_TIME = (KSYSTEM_TIME*)0x7ffe0014; + +static void badsystime(void); + +#pragma textflag NOSPLIT +int64 +runtime·systime(KSYSTEM_TIME *timeaddr) +{ + KSYSTEM_TIME t; + int32 i; + void (*fn)(void); + + for(i = 1; i < 10000; i++) { + // these fields must be read in that order (see URL above) + t.High1Time = timeaddr->High1Time; + t.LowPart = timeaddr->LowPart; + t.High2Time = timeaddr->High2Time; + if(t.High1Time == t.High2Time) + return (int64)t.High1Time<<32 | t.LowPart; + if((i%100) == 0) + runtime·osyield(); + } + fn = badsystime; + runtime·onM(&fn); + return 0; +} + +#pragma textflag NOSPLIT +int64 +runtime·unixnano(void) +{ + return (runtime·systime(SYSTEM_TIME) - 116444736000000000LL) * 100LL; +} + +static void +badsystime(void) +{ + runtime·throw("interrupt/system time is changing too fast"); +} + +#pragma textflag NOSPLIT +int64 +runtime·nanotime(void) +{ + return runtime·systime(INTERRUPT_TIME) * 100LL; +} + +// Calling stdcall on os stack. +#pragma textflag NOSPLIT +static void* +stdcall(void *fn) +{ + g->m->libcall.fn = (uintptr)fn; + if(g->m->profilehz != 0) { + // leave pc/sp for cpu profiler + g->m->libcallg = g; + g->m->libcallpc = (uintptr)runtime·getcallerpc(&fn); + // sp must be the last, because once async cpu profiler finds + // all three values to be non-zero, it will use them + g->m->libcallsp = (uintptr)runtime·getcallersp(&fn); + } + runtime·asmcgocall(runtime·asmstdcall, &g->m->libcall); + g->m->libcallsp = 0; + return (void*)g->m->libcall.r1; +} + +#pragma textflag NOSPLIT +void* +runtime·stdcall0(void *fn) +{ + g->m->libcall.n = 0; + g->m->libcall.args = (uintptr)&fn; // it's unused but must be non-nil, otherwise crashes + return stdcall(fn); +} + +#pragma textflag NOSPLIT +void* +runtime·stdcall1(void *fn, uintptr a0) +{ + USED(a0); + g->m->libcall.n = 1; + g->m->libcall.args = (uintptr)&a0; + return stdcall(fn); +} + +#pragma textflag NOSPLIT +void* +runtime·stdcall2(void *fn, uintptr a0, uintptr a1) +{ + USED(a0, a1); + g->m->libcall.n = 2; + g->m->libcall.args = (uintptr)&a0; + return stdcall(fn); +} + +#pragma textflag NOSPLIT +void* +runtime·stdcall3(void *fn, uintptr a0, uintptr a1, uintptr a2) +{ + USED(a0, a1, a2); + g->m->libcall.n = 3; + g->m->libcall.args = (uintptr)&a0; + return stdcall(fn); +} + +#pragma textflag NOSPLIT +void* +runtime·stdcall4(void *fn, uintptr a0, uintptr a1, uintptr a2, uintptr a3) +{ + USED(a0, a1, a2, a3); + g->m->libcall.n = 4; + g->m->libcall.args = (uintptr)&a0; + return stdcall(fn); +} + +#pragma textflag NOSPLIT +void* +runtime·stdcall5(void *fn, uintptr a0, uintptr a1, uintptr a2, uintptr a3, uintptr a4) +{ + USED(a0, a1, a2, a3, a4); + g->m->libcall.n = 5; + g->m->libcall.args = (uintptr)&a0; + return stdcall(fn); +} + +#pragma 
textflag NOSPLIT +void* +runtime·stdcall6(void *fn, uintptr a0, uintptr a1, uintptr a2, uintptr a3, uintptr a4, uintptr a5) +{ + USED(a0, a1, a2, a3, a4, a5); + g->m->libcall.n = 6; + g->m->libcall.args = (uintptr)&a0; + return stdcall(fn); +} + +#pragma textflag NOSPLIT +void* +runtime·stdcall7(void *fn, uintptr a0, uintptr a1, uintptr a2, uintptr a3, uintptr a4, uintptr a5, uintptr a6) +{ + USED(a0, a1, a2, a3, a4, a5, a6); + g->m->libcall.n = 7; + g->m->libcall.args = (uintptr)&a0; + return stdcall(fn); +} + +extern void runtime·usleep1(uint32); + +#pragma textflag NOSPLIT +void +runtime·osyield(void) +{ + runtime·usleep1(1); +} + +#pragma textflag NOSPLIT +void +runtime·usleep(uint32 us) +{ + // Have 1us units; want 100ns units. + runtime·usleep1(10*us); +} + +uint32 +runtime·issigpanic(uint32 code) +{ + switch(code) { + case EXCEPTION_ACCESS_VIOLATION: + case EXCEPTION_INT_DIVIDE_BY_ZERO: + case EXCEPTION_INT_OVERFLOW: + case EXCEPTION_FLT_DENORMAL_OPERAND: + case EXCEPTION_FLT_DIVIDE_BY_ZERO: + case EXCEPTION_FLT_INEXACT_RESULT: + case EXCEPTION_FLT_OVERFLOW: + case EXCEPTION_FLT_UNDERFLOW: + case EXCEPTION_BREAKPOINT: + return 1; + } + return 0; +} + +void +runtime·initsig(void) +{ + // following line keeps these functions alive at link stage + // if there's a better way please write it here + void *e = runtime·exceptiontramp; + void *f = runtime·firstcontinuetramp; + void *l = runtime·lastcontinuetramp; + USED(e); + USED(f); + USED(l); +} + +uint32 +runtime·ctrlhandler1(uint32 type) +{ + int32 s; + + switch(type) { + case CTRL_C_EVENT: + case CTRL_BREAK_EVENT: + s = SIGINT; + break; + default: + return 0; + } + + if(runtime·sigsend(s)) + return 1; + runtime·exit(2); // SIGINT, SIGTERM, etc + return 0; +} + +extern void runtime·dosigprof(Context *r, G *gp, M *mp); +extern void runtime·profileloop(void); +#pragma dataflag NOPTR +static void *profiletimer; + +static void +profilem(M *mp) +{ + extern M runtime·m0; + extern uint32 runtime·tls0[]; + byte rbuf[sizeof(Context)+15]; + Context *r; + void *tls; + G *gp; + + tls = mp->tls; + if(mp == &runtime·m0) + tls = runtime·tls0; + gp = *(G**)tls; + + // align Context to 16 bytes + r = (Context*)((uintptr)(&rbuf[15]) & ~15); + r->ContextFlags = CONTEXT_CONTROL; + runtime·stdcall2(runtime·GetThreadContext, (uintptr)mp->thread, (uintptr)r); + runtime·dosigprof(r, gp, mp); +} + +void +runtime·profileloop1(void) +{ + M *mp, *allm; + uintptr thread; + + runtime·stdcall2(runtime·SetThreadPriority, -2, THREAD_PRIORITY_HIGHEST); + + for(;;) { + runtime·stdcall2(runtime·WaitForSingleObject, (uintptr)profiletimer, -1); + allm = runtime·atomicloadp(&runtime·allm); + for(mp = allm; mp != nil; mp = mp->alllink) { + thread = runtime·atomicloaduintptr(&mp->thread); + // Do not profile threads blocked on Notes, + // this includes idle worker threads, + // idle timer thread, idle heap scavenger, etc. 
+ if(thread == 0 || mp->profilehz == 0 || mp->blocked) + continue; + runtime·stdcall1(runtime·SuspendThread, (uintptr)thread); + if(mp->profilehz != 0 && !mp->blocked) + profilem(mp); + runtime·stdcall1(runtime·ResumeThread, (uintptr)thread); + } + } +} + +void +runtime·resetcpuprofiler(int32 hz) +{ + static Mutex lock; + void *timer, *thread; + int32 ms; + int64 due; + + runtime·lock(&lock); + if(profiletimer == nil) { + timer = runtime·stdcall3(runtime·CreateWaitableTimer, (uintptr)nil, (uintptr)nil, (uintptr)nil); + runtime·atomicstorep(&profiletimer, timer); + thread = runtime·stdcall6(runtime·CreateThread, + (uintptr)nil, (uintptr)nil, (uintptr)runtime·profileloop, (uintptr)nil, (uintptr)nil, (uintptr)nil); + runtime·stdcall2(runtime·SetThreadPriority, (uintptr)thread, THREAD_PRIORITY_HIGHEST); + runtime·stdcall1(runtime·CloseHandle, (uintptr)thread); + } + runtime·unlock(&lock); + + ms = 0; + due = 1LL<<63; + if(hz > 0) { + ms = 1000 / hz; + if(ms == 0) + ms = 1; + due = ms * -10000; + } + runtime·stdcall6(runtime·SetWaitableTimer, + (uintptr)profiletimer, (uintptr)&due, ms, (uintptr)nil, (uintptr)nil, (uintptr)nil); + runtime·atomicstore((uint32*)&g->m->profilehz, hz); +} + +uintptr +runtime·memlimit(void) +{ + return 0; +} + +#pragma dataflag NOPTR +int8 runtime·badsignalmsg[] = "runtime: signal received on thread not created by Go.\n"; +int32 runtime·badsignallen = sizeof runtime·badsignalmsg - 1; + +void +runtime·crash(void) +{ + // TODO: This routine should do whatever is needed + // to make the Windows program abort/crash as it + // would if Go was not intercepting signals. + // On Unix the routine would remove the custom signal + // handler and then raise a signal (like SIGABRT). + // Something like that should happen here. + // It's okay to leave this empty for now: if crash returns + // the ordinary exit-after-panic happens. +} diff --git a/src/runtime/os_windows.go b/src/runtime/os_windows.go new file mode 100644 index 000000000..1528d2fd1 --- /dev/null +++ b/src/runtime/os_windows.go @@ -0,0 +1,58 @@ +// Copyright 2014 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. 
+ +package runtime + +import "unsafe" + +type stdFunction *byte + +func stdcall0(fn stdFunction) uintptr +func stdcall1(fn stdFunction, a0 uintptr) uintptr +func stdcall2(fn stdFunction, a0, a1 uintptr) uintptr +func stdcall3(fn stdFunction, a0, a1, a2 uintptr) uintptr +func stdcall4(fn stdFunction, a0, a1, a2, a3 uintptr) uintptr +func stdcall5(fn stdFunction, a0, a1, a2, a3, a4 uintptr) uintptr +func stdcall6(fn stdFunction, a0, a1, a2, a3, a4, a5 uintptr) uintptr +func stdcall7(fn stdFunction, a0, a1, a2, a3, a4, a5, a6 uintptr) uintptr + +func asmstdcall(fn unsafe.Pointer) +func getlasterror() uint32 +func setlasterror(err uint32) +func usleep1(usec uint32) +func netpollinit() +func netpollopen(fd uintptr, pd *pollDesc) int32 +func netpollclose(fd uintptr) int32 +func netpollarm(pd *pollDesc, mode int) + +func os_sigpipe() { + gothrow("too many writes on closed pipe") +} + +func sigpanic() { + g := getg() + if !canpanic(g) { + gothrow("unexpected signal during runtime execution") + } + + switch uint32(g.sig) { + case _EXCEPTION_ACCESS_VIOLATION: + if g.sigcode1 < 0x1000 || g.paniconfault { + panicmem() + } + print("unexpected fault address ", hex(g.sigcode1), "\n") + gothrow("fault") + case _EXCEPTION_INT_DIVIDE_BY_ZERO: + panicdivide() + case _EXCEPTION_INT_OVERFLOW: + panicoverflow() + case _EXCEPTION_FLT_DENORMAL_OPERAND, + _EXCEPTION_FLT_DIVIDE_BY_ZERO, + _EXCEPTION_FLT_INEXACT_RESULT, + _EXCEPTION_FLT_OVERFLOW, + _EXCEPTION_FLT_UNDERFLOW: + panicfloat() + } + gothrow("fault") +} diff --git a/src/runtime/os_windows.h b/src/runtime/os_windows.h new file mode 100644 index 000000000..d5d168d77 --- /dev/null +++ b/src/runtime/os_windows.h @@ -0,0 +1,42 @@ +// Copyright 2009 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +extern void *runtime·LoadLibrary; +extern void *runtime·GetProcAddress; +extern void *runtime·GetQueuedCompletionStatusEx; + +// Call a Windows function with stdcall conventions, +// and switch to os stack during the call. +void runtime·asmstdcall(void *c); +void *runtime·stdcall0(void *fn); +void *runtime·stdcall1(void *fn, uintptr a0); +void *runtime·stdcall2(void *fn, uintptr a0, uintptr a1); +void *runtime·stdcall3(void *fn, uintptr a0, uintptr a1, uintptr a2); +void *runtime·stdcall4(void *fn, uintptr a0, uintptr a1, uintptr a2, uintptr a3); +void *runtime·stdcall5(void *fn, uintptr a0, uintptr a1, uintptr a2, uintptr a3, uintptr a4); +void *runtime·stdcall6(void *fn, uintptr a0, uintptr a1, uintptr a2, uintptr a3, uintptr a4, uintptr a5); +void *runtime·stdcall7(void *fn, uintptr a0, uintptr a1, uintptr a2, uintptr a3, uintptr a4, uintptr a5, uintptr a6); + +uint32 runtime·getlasterror(void); +void runtime·setlasterror(uint32 err); + +// Function to be called by windows CreateThread +// to start new os thread. +uint32 runtime·tstart_stdcall(M *newm); + +uint32 runtime·issigpanic(uint32); +void runtime·sigpanic(void); +uint32 runtime·ctrlhandler(uint32 type); + +// Windows dll function to go callback entry. 
+byte *runtime·compilecallback(Eface fn, bool cleanstack); +void *runtime·callbackasm(void); + +void runtime·install_exception_handler(void); +void runtime·remove_exception_handler(void); + +// TODO(brainman): should not need those +enum { + NSIG = 65, +}; diff --git a/src/runtime/os_windows_386.c b/src/runtime/os_windows_386.c new file mode 100644 index 000000000..9962f0dc2 --- /dev/null +++ b/src/runtime/os_windows_386.c @@ -0,0 +1,128 @@ +// Copyright 2009 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#include "runtime.h" +#include "defs_GOOS_GOARCH.h" +#include "os_GOOS.h" + +void +runtime·dumpregs(Context *r) +{ + runtime·printf("eax %x\n", r->Eax); + runtime·printf("ebx %x\n", r->Ebx); + runtime·printf("ecx %x\n", r->Ecx); + runtime·printf("edx %x\n", r->Edx); + runtime·printf("edi %x\n", r->Edi); + runtime·printf("esi %x\n", r->Esi); + runtime·printf("ebp %x\n", r->Ebp); + runtime·printf("esp %x\n", r->Esp); + runtime·printf("eip %x\n", r->Eip); + runtime·printf("eflags %x\n", r->EFlags); + runtime·printf("cs %x\n", r->SegCs); + runtime·printf("fs %x\n", r->SegFs); + runtime·printf("gs %x\n", r->SegGs); +} + +bool +runtime·isgoexception(ExceptionRecord *info, Context *r) +{ + extern byte runtime·text[], runtime·etext[]; + + // Only handle exception if executing instructions in Go binary + // (not Windows library code). + if(r->Eip < (uint32)runtime·text || (uint32)runtime·etext < r->Eip) + return false; + + if(!runtime·issigpanic(info->ExceptionCode)) + return false; + + return true; +} + +// Called by sigtramp from Windows VEH handler. +// Return value signals whether the exception has been handled (EXCEPTION_CONTINUE_EXECUTION) +// or should be made available to other handlers in the chain (EXCEPTION_CONTINUE_SEARCH). +uint32 +runtime·exceptionhandler(ExceptionRecord *info, Context *r, G *gp) +{ + uintptr *sp; + + if(!runtime·isgoexception(info, r)) + return EXCEPTION_CONTINUE_SEARCH; + + // Make it look like a call to the signal func. + // Have to pass arguments out of band since + // augmenting the stack frame would break + // the unwinding code. + gp->sig = info->ExceptionCode; + gp->sigcode0 = info->ExceptionInformation[0]; + gp->sigcode1 = info->ExceptionInformation[1]; + gp->sigpc = r->Eip; + + // Only push runtime·sigpanic if r->eip != 0. + // If r->eip == 0, probably panicked because of a + // call to a nil func. Not pushing that onto sp will + // make the trace look like a call to runtime·sigpanic instead. + // (Otherwise the trace will end at runtime·sigpanic and we + // won't get to see who faulted.) + if(r->Eip != 0) { + sp = (uintptr*)r->Esp; + *--sp = r->Eip; + r->Esp = (uintptr)sp; + } + r->Eip = (uintptr)runtime·sigpanic; + return EXCEPTION_CONTINUE_EXECUTION; +} + +// lastcontinuehandler is reached, because runtime cannot handle +// current exception. lastcontinuehandler will print crash info and exit. 
+uint32 +runtime·lastcontinuehandler(ExceptionRecord *info, Context *r, G *gp) +{ + bool crash; + + if(runtime·panicking) // traceback already printed + runtime·exit(2); + runtime·panicking = 1; + + runtime·printf("Exception %x %p %p %p\n", info->ExceptionCode, + (uintptr)info->ExceptionInformation[0], (uintptr)info->ExceptionInformation[1], (uintptr)r->Eip); + + runtime·printf("PC=%x\n", r->Eip); + if(g->m->lockedg != nil && g->m->ncgo > 0 && gp == g->m->g0) { + runtime·printf("signal arrived during cgo execution\n"); + gp = g->m->lockedg; + } + runtime·printf("\n"); + + if(runtime·gotraceback(&crash)){ + runtime·tracebacktrap(r->Eip, r->Esp, 0, gp); + runtime·tracebackothers(gp); + runtime·dumpregs(r); + } + + if(crash) + runtime·crash(); + + runtime·exit(2); + return 0; // not reached +} + +void +runtime·sigenable(uint32 sig) +{ + USED(sig); +} + +void +runtime·sigdisable(uint32 sig) +{ + USED(sig); +} + +void +runtime·dosigprof(Context *r, G *gp, M *mp) +{ + runtime·sigprof((uint8*)r->Eip, (uint8*)r->Esp, nil, gp, mp); +} diff --git a/src/runtime/os_windows_386.go b/src/runtime/os_windows_386.go new file mode 100644 index 000000000..86a1906c0 --- /dev/null +++ b/src/runtime/os_windows_386.go @@ -0,0 +1,11 @@ +// Copyright 2014 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package runtime + +// contextPC returns the EIP (program counter) register from the context. +func contextPC(r *context) uintptr { return uintptr(r.eip) } + +// contextSP returns the ESP (stack pointer) register from the context. +func contextSP(r *context) uintptr { return uintptr(r.esp) } diff --git a/src/runtime/os_windows_amd64.c b/src/runtime/os_windows_amd64.c new file mode 100644 index 000000000..e4617e4ce --- /dev/null +++ b/src/runtime/os_windows_amd64.c @@ -0,0 +1,150 @@ +// Copyright 2011 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#include "runtime.h" +#include "defs_GOOS_GOARCH.h" +#include "os_GOOS.h" + +void +runtime·dumpregs(Context *r) +{ + runtime·printf("rax %X\n", r->Rax); + runtime·printf("rbx %X\n", r->Rbx); + runtime·printf("rcx %X\n", r->Rcx); + runtime·printf("rdx %X\n", r->Rdx); + runtime·printf("rdi %X\n", r->Rdi); + runtime·printf("rsi %X\n", r->Rsi); + runtime·printf("rbp %X\n", r->Rbp); + runtime·printf("rsp %X\n", r->Rsp); + runtime·printf("r8 %X\n", r->R8 ); + runtime·printf("r9 %X\n", r->R9 ); + runtime·printf("r10 %X\n", r->R10); + runtime·printf("r11 %X\n", r->R11); + runtime·printf("r12 %X\n", r->R12); + runtime·printf("r13 %X\n", r->R13); + runtime·printf("r14 %X\n", r->R14); + runtime·printf("r15 %X\n", r->R15); + runtime·printf("rip %X\n", r->Rip); + runtime·printf("rflags %X\n", r->EFlags); + runtime·printf("cs %X\n", (uint64)r->SegCs); + runtime·printf("fs %X\n", (uint64)r->SegFs); + runtime·printf("gs %X\n", (uint64)r->SegGs); +} + +bool +runtime·isgoexception(ExceptionRecord *info, Context *r) +{ + extern byte runtime·text[], runtime·etext[]; + + // Only handle exception if executing instructions in Go binary + // (not Windows library code). + if(r->Rip < (uint64)runtime·text || (uint64)runtime·etext < r->Rip) + return false; + + if(!runtime·issigpanic(info->ExceptionCode)) + return false; + + return true; +} + +// Called by sigtramp from Windows VEH handler. 
+// Return value signals whether the exception has been handled (EXCEPTION_CONTINUE_EXECUTION) +// or should be made available to other handlers in the chain (EXCEPTION_CONTINUE_SEARCH). +uint32 +runtime·exceptionhandler(ExceptionRecord *info, Context *r, G *gp) +{ + uintptr *sp; + + if(!runtime·isgoexception(info, r)) + return EXCEPTION_CONTINUE_SEARCH; + + // Make it look like a call to the signal func. + // Have to pass arguments out of band since + // augmenting the stack frame would break + // the unwinding code. + gp->sig = info->ExceptionCode; + gp->sigcode0 = info->ExceptionInformation[0]; + gp->sigcode1 = info->ExceptionInformation[1]; + gp->sigpc = r->Rip; + + // Only push runtime·sigpanic if r->rip != 0. + // If r->rip == 0, probably panicked because of a + // call to a nil func. Not pushing that onto sp will + // make the trace look like a call to runtime·sigpanic instead. + // (Otherwise the trace will end at runtime·sigpanic and we + // won't get to see who faulted.) + if(r->Rip != 0) { + sp = (uintptr*)r->Rsp; + *--sp = r->Rip; + r->Rsp = (uintptr)sp; + } + r->Rip = (uintptr)runtime·sigpanic; + return EXCEPTION_CONTINUE_EXECUTION; +} + +// It seems Windows searches ContinueHandler's list even +// if ExceptionHandler returns EXCEPTION_CONTINUE_EXECUTION. +// firstcontinuehandler will stop that search, +// if exceptionhandler did the same earlier. +uint32 +runtime·firstcontinuehandler(ExceptionRecord *info, Context *r, G *gp) +{ + USED(gp); + if(!runtime·isgoexception(info, r)) + return EXCEPTION_CONTINUE_SEARCH; + return EXCEPTION_CONTINUE_EXECUTION; +} + +// lastcontinuehandler is reached, because runtime cannot handle +// current exception. lastcontinuehandler will print crash info and exit. +uint32 +runtime·lastcontinuehandler(ExceptionRecord *info, Context *r, G *gp) +{ + bool crash; + + if(runtime·panicking) // traceback already printed + runtime·exit(2); + runtime·panicking = 1; + + runtime·printf("Exception %x %p %p %p\n", info->ExceptionCode, + info->ExceptionInformation[0], info->ExceptionInformation[1], r->Rip); + + + runtime·printf("PC=%X\n", r->Rip); + if(g->m->lockedg != nil && g->m->ncgo > 0 && gp == g->m->g0) { + runtime·printf("signal arrived during cgo execution\n"); + gp = g->m->lockedg; + } + runtime·printf("\n"); + + if(runtime·gotraceback(&crash)){ + runtime·tracebacktrap(r->Rip, r->Rsp, 0, gp); + runtime·tracebackothers(gp); + runtime·dumpregs(r); + } + + if(crash) + runtime·crash(); + + runtime·exit(2); + return 0; // not reached +} + +void +runtime·sigenable(uint32 sig) +{ + USED(sig); +} + +void +runtime·sigdisable(uint32 sig) +{ + USED(sig); +} + +void +runtime·dosigprof(Context *r, G *gp, M *mp) +{ + runtime·sigprof((uint8*)r->Rip, (uint8*)r->Rsp, nil, gp, mp); +} diff --git a/src/runtime/os_windows_amd64.go b/src/runtime/os_windows_amd64.go new file mode 100644 index 000000000..3f4d4d07c --- /dev/null +++ b/src/runtime/os_windows_amd64.go @@ -0,0 +1,11 @@ +// Copyright 2014 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package runtime + +// contextPC returns the RIP (program counter) register from the context. +func contextPC(r *context) uintptr { return uintptr(r.rip) } + +// contextSP returns the RSP (stack pointer) register from the context. 
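For reference, the practical effect of routing hardware exceptions through sigpanic is that a memory fault in Go code surfaces as an ordinary, recoverable runtime panic. A minimal user-level sketch of that behavior (plain Go, not part of the imported runtime source; the function names are made up):

package main

import "fmt"

// deref faults on a nil pointer. On Windows the access violation is
// routed through exceptionhandler above, which rewrites the context so
// the goroutine appears to call sigpanic; sigpanic then turns the fault
// into a normal Go panic that recover can catch.
func deref(p *int) (v int, err error) {
	defer func() {
		if r := recover(); r != nil {
			err = fmt.Errorf("recovered: %v", r)
		}
	}()
	return *p, nil
}

func main() {
	if _, err := deref(nil); err != nil {
		// Prints: recovered: runtime error: invalid memory address
		// or nil pointer dereference
		fmt.Println(err)
	}
}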
+func contextSP(r *context) uintptr { return uintptr(r.rsp) } diff --git a/src/runtime/panic.c b/src/runtime/panic.c new file mode 100644 index 000000000..24eb6dbfe --- /dev/null +++ b/src/runtime/panic.c @@ -0,0 +1,200 @@ +// Copyright 2012 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#include "runtime.h" +#include "arch_GOARCH.h" +#include "stack.h" +#include "malloc.h" +#include "textflag.h" + +// Code related to defer, panic and recover. + +// TODO: remove once code is moved to Go +extern Defer* runtime·newdefer(int32 siz); +extern runtime·freedefer(Defer *d); + +uint32 runtime·panicking; +static Mutex paniclk; + +void +runtime·deferproc_m(void) +{ + int32 siz; + FuncVal *fn; + uintptr argp; + uintptr callerpc; + Defer *d; + + siz = g->m->scalararg[0]; + fn = g->m->ptrarg[0]; + argp = g->m->scalararg[1]; + callerpc = g->m->scalararg[2]; + g->m->ptrarg[0] = nil; + g->m->scalararg[1] = 0; + + d = runtime·newdefer(siz); + if(d->panic != nil) + runtime·throw("deferproc: d->panic != nil after newdefer"); + d->fn = fn; + d->pc = callerpc; + d->argp = argp; + runtime·memmove(d+1, (void*)argp, siz); +} + +// Unwind the stack after a deferred function calls recover +// after a panic. Then arrange to continue running as though +// the caller of the deferred function returned normally. +void +runtime·recovery_m(G *gp) +{ + void *argp; + uintptr pc; + + // Info about defer passed in G struct. + argp = (void*)gp->sigcode0; + pc = (uintptr)gp->sigcode1; + + // d's arguments need to be in the stack. + if(argp != nil && ((uintptr)argp < gp->stack.lo || gp->stack.hi < (uintptr)argp)) { + runtime·printf("recover: %p not in [%p, %p]\n", argp, gp->stack.lo, gp->stack.hi); + runtime·throw("bad recovery"); + } + + // Make the deferproc for this d return again, + // this time returning 1. The calling function will + // jump to the standard return epilogue. + // The -2*sizeof(uintptr) makes up for the + // two extra words that are on the stack at + // each call to deferproc. + // (The pc we're returning to does pop pop + // before it tests the return value.) + // On the arm there are 2 saved LRs mixed in too. + if(thechar == '5') + gp->sched.sp = (uintptr)argp - 4*sizeof(uintptr); + else + gp->sched.sp = (uintptr)argp - 2*sizeof(uintptr); + gp->sched.pc = pc; + gp->sched.lr = 0; + gp->sched.ret = 1; + runtime·gogo(&gp->sched); +} + +void +runtime·startpanic_m(void) +{ + if(runtime·mheap.cachealloc.size == 0) { // very early + runtime·printf("runtime: panic before malloc heap initialized\n"); + g->m->mallocing = 1; // tell rest of panic not to try to malloc + } else if(g->m->mcache == nil) // can happen if called from signal handler or throw + g->m->mcache = runtime·allocmcache(); + switch(g->m->dying) { + case 0: + g->m->dying = 1; + if(g != nil) { + g->writebuf.array = nil; + g->writebuf.len = 0; + g->writebuf.cap = 0; + } + runtime·xadd(&runtime·panicking, 1); + runtime·lock(&paniclk); + if(runtime·debug.schedtrace > 0 || runtime·debug.scheddetail > 0) + runtime·schedtrace(true); + runtime·freezetheworld(); + return; + case 1: + // Something failed while panicing, probably the print of the + // argument to panic(). Just print a stack trace and exit. + g->m->dying = 2; + runtime·printf("panic during panic\n"); + runtime·dopanic(0); + runtime·exit(3); + case 2: + // This is a genuine bug in the runtime, we couldn't even + // print the stack trace successfully. 
+ g->m->dying = 3; + runtime·printf("stack trace unavailable\n"); + runtime·exit(4); + default: + // Can't even print! Just exit. + runtime·exit(5); + } +} + +void +runtime·dopanic_m(void) +{ + G *gp; + uintptr sp, pc; + static bool didothers; + bool crash; + int32 t; + + gp = g->m->ptrarg[0]; + g->m->ptrarg[0] = nil; + pc = g->m->scalararg[0]; + sp = g->m->scalararg[1]; + g->m->scalararg[1] = 0; + if(gp->sig != 0) + runtime·printf("[signal %x code=%p addr=%p pc=%p]\n", + gp->sig, gp->sigcode0, gp->sigcode1, gp->sigpc); + + if((t = runtime·gotraceback(&crash)) > 0){ + if(gp != gp->m->g0) { + runtime·printf("\n"); + runtime·goroutineheader(gp); + runtime·traceback(pc, sp, 0, gp); + } else if(t >= 2 || g->m->throwing > 0) { + runtime·printf("\nruntime stack:\n"); + runtime·traceback(pc, sp, 0, gp); + } + if(!didothers) { + didothers = true; + runtime·tracebackothers(gp); + } + } + runtime·unlock(&paniclk); + if(runtime·xadd(&runtime·panicking, -1) != 0) { + // Some other m is panicking too. + // Let it print what it needs to print. + // Wait forever without chewing up cpu. + // It will exit when it's done. + static Mutex deadlock; + runtime·lock(&deadlock); + runtime·lock(&deadlock); + } + + if(crash) + runtime·crash(); + + runtime·exit(2); +} + +#pragma textflag NOSPLIT +bool +runtime·canpanic(G *gp) +{ + M *m; + uint32 status; + + // Note that g is m->gsignal, different from gp. + // Note also that g->m can change at preemption, so m can go stale + // if this function ever makes a function call. + m = g->m; + + // Is it okay for gp to panic instead of crashing the program? + // Yes, as long as it is running Go code, not runtime code, + // and not stuck in a system call. + if(gp == nil || gp != m->curg) + return false; + if(m->locks-m->softfloat != 0 || m->mallocing != 0 || m->throwing != 0 || m->gcing != 0 || m->dying != 0) + return false; + status = runtime·readgstatus(gp); + if((status&~Gscan) != Grunning || gp->syscallsp != 0) + return false; +#ifdef GOOS_windows + if(m->libcallsp != 0) + return false; +#endif + return true; +} diff --git a/src/runtime/panic.go b/src/runtime/panic.go new file mode 100644 index 000000000..685ff5ca0 --- /dev/null +++ b/src/runtime/panic.go @@ -0,0 +1,505 @@ +// Copyright 2014 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package runtime + +import "unsafe" + +var indexError = error(errorString("index out of range")) + +func panicindex() { + panic(indexError) +} + +var sliceError = error(errorString("slice bounds out of range")) + +func panicslice() { + panic(sliceError) +} + +var divideError = error(errorString("integer divide by zero")) + +func panicdivide() { + panic(divideError) +} + +var overflowError = error(errorString("integer overflow")) + +func panicoverflow() { + panic(overflowError) +} + +var floatError = error(errorString("floating point error")) + +func panicfloat() { + panic(floatError) +} + +var memoryError = error(errorString("invalid memory address or nil pointer dereference")) + +func panicmem() { + panic(memoryError) +} + +func throwreturn() { + gothrow("no return at end of a typed function - compiler is broken") +} + +func throwinit() { + gothrow("recursive call during initialization - linker skew") +} + +// Create a new deferred function fn with siz bytes of arguments. +// The compiler turns a defer statement into a call to this. 
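deferproc, defined just below, copies the call's argument bytes into the _defer record at the point of the defer statement. A small sketch of the user-visible consequence (standard Go semantics, ordinary user code rather than runtime code):

package main

import "fmt"

func main() {
	x := 1
	// The arguments are evaluated and copied now, when the defer
	// statement executes (the memmove in deferproc_m), so the deferred
	// call still prints 1 even though x changes afterwards.
	defer fmt.Println("deferred call sees x =", x)
	x = 2
	fmt.Println("x is now", x) // x is now 2
}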
+//go:nosplit +func deferproc(siz int32, fn *funcval) { // arguments of fn follow fn + // the arguments of fn are in a perilous state. The stack map + // for deferproc does not describe them. So we can't let garbage + // collection or stack copying trigger until we've copied them out + // to somewhere safe. deferproc_m does that. Until deferproc_m, + // we can only call nosplit routines. + argp := uintptr(unsafe.Pointer(&fn)) + argp += unsafe.Sizeof(fn) + if GOARCH == "arm" { + argp += ptrSize // skip caller's saved link register + } + mp := acquirem() + mp.scalararg[0] = uintptr(siz) + mp.ptrarg[0] = unsafe.Pointer(fn) + mp.scalararg[1] = argp + mp.scalararg[2] = getcallerpc(unsafe.Pointer(&siz)) + + if mp.curg != getg() { + // go code on the m stack can't defer + gothrow("defer on m") + } + + onM(deferproc_m) + + releasem(mp) + + // deferproc returns 0 normally. + // a deferred func that stops a panic + // makes the deferproc return 1. + // the code the compiler generates always + // checks the return value and jumps to the + // end of the function if deferproc returns != 0. + return0() + // No code can go here - the C return register has + // been set and must not be clobbered. +} + +// Small malloc size classes >= 16 are the multiples of 16: 16, 32, 48, 64, 80, 96, 112, 128, 144, ... +// Each P holds a pool for defers with small arg sizes. +// Assign defer allocations to pools by rounding to 16, to match malloc size classes. + +const ( + deferHeaderSize = unsafe.Sizeof(_defer{}) + minDeferAlloc = (deferHeaderSize + 15) &^ 15 + minDeferArgs = minDeferAlloc - deferHeaderSize +) + +// defer size class for arg size sz +//go:nosplit +func deferclass(siz uintptr) uintptr { + if siz <= minDeferArgs { + return 0 + } + return (siz - minDeferArgs + 15) / 16 +} + +// total size of memory block for defer with arg size sz +func totaldefersize(siz uintptr) uintptr { + if siz <= minDeferArgs { + return minDeferAlloc + } + return deferHeaderSize + siz +} + +// Ensure that defer arg sizes that map to the same defer size class +// also map to the same malloc size class. +func testdefersizes() { + var m [len(p{}.deferpool)]int32 + + for i := range m { + m[i] = -1 + } + for i := uintptr(0); ; i++ { + defersc := deferclass(i) + if defersc >= uintptr(len(m)) { + break + } + siz := goroundupsize(totaldefersize(i)) + if m[defersc] < 0 { + m[defersc] = int32(siz) + continue + } + if m[defersc] != int32(siz) { + print("bad defer size class: i=", i, " siz=", siz, " defersc=", defersc, "\n") + gothrow("bad defer size class") + } + } +} + +// The arguments associated with a deferred call are stored +// immediately after the _defer header in memory. +//go:nosplit +func deferArgs(d *_defer) unsafe.Pointer { + return add(unsafe.Pointer(d), unsafe.Sizeof(*d)) +} + +var deferType *_type // type of _defer struct + +func init() { + var x interface{} + x = (*_defer)(nil) + deferType = (*(**ptrtype)(unsafe.Pointer(&x))).elem +} + +// Allocate a Defer, usually using per-P pool. +// Each defer must be released with freedefer. +// Note: runs on M stack +func newdefer(siz int32) *_defer { + var d *_defer + sc := deferclass(uintptr(siz)) + mp := acquirem() + if sc < uintptr(len(p{}.deferpool)) { + pp := mp.p + d = pp.deferpool[sc] + if d != nil { + pp.deferpool[sc] = d.link + } + } + if d == nil { + // Allocate new defer+args. 
+ total := goroundupsize(totaldefersize(uintptr(siz))) + d = (*_defer)(mallocgc(total, deferType, 0)) + } + d.siz = siz + gp := mp.curg + d.link = gp._defer + gp._defer = d + releasem(mp) + return d +} + +// Free the given defer. +// The defer cannot be used after this call. +//go:nosplit +func freedefer(d *_defer) { + if d._panic != nil { + freedeferpanic() + } + if d.fn != nil { + freedeferfn() + } + sc := deferclass(uintptr(d.siz)) + if sc < uintptr(len(p{}.deferpool)) { + mp := acquirem() + pp := mp.p + *d = _defer{} + d.link = pp.deferpool[sc] + pp.deferpool[sc] = d + releasem(mp) + } +} + +// Separate function so that it can split stack. +// Windows otherwise runs out of stack space. +func freedeferpanic() { + // _panic must be cleared before d is unlinked from gp. + gothrow("freedefer with d._panic != nil") +} + +func freedeferfn() { + // fn must be cleared before d is unlinked from gp. + gothrow("freedefer with d.fn != nil") +} + +// Run a deferred function if there is one. +// The compiler inserts a call to this at the end of any +// function which calls defer. +// If there is a deferred function, this will call runtime·jmpdefer, +// which will jump to the deferred function such that it appears +// to have been called by the caller of deferreturn at the point +// just before deferreturn was called. The effect is that deferreturn +// is called again and again until there are no more deferred functions. +// Cannot split the stack because we reuse the caller's frame to +// call the deferred function. + +// The single argument isn't actually used - it just has its address +// taken so it can be matched against pending defers. +//go:nosplit +func deferreturn(arg0 uintptr) { + gp := getg() + d := gp._defer + if d == nil { + return + } + argp := uintptr(unsafe.Pointer(&arg0)) + if d.argp != argp { + return + } + + // Moving arguments around. + // Do not allow preemption here, because the garbage collector + // won't know the form of the arguments until the jmpdefer can + // flip the PC over to fn. + mp := acquirem() + memmove(unsafe.Pointer(argp), deferArgs(d), uintptr(d.siz)) + fn := d.fn + d.fn = nil + gp._defer = d.link + freedefer(d) + releasem(mp) + jmpdefer(fn, argp) +} + +// Goexit terminates the goroutine that calls it. No other goroutine is affected. +// Goexit runs all deferred calls before terminating the goroutine. Because Goexit +// is not panic, however, any recover calls in those deferred functions will return nil. +// +// Calling Goexit from the main goroutine terminates that goroutine +// without func main returning. Since func main has not returned, +// the program continues execution of other goroutines. +// If all other goroutines exit, the program crashes. +func Goexit() { + // Run all deferred functions for the current goroutine. + // This code is similar to gopanic, see that implementation + // for detailed comments. + gp := getg() + for { + d := gp._defer + if d == nil { + break + } + if d.started { + if d._panic != nil { + d._panic.aborted = true + d._panic = nil + } + d.fn = nil + gp._defer = d.link + freedefer(d) + continue + } + d.started = true + reflectcall(unsafe.Pointer(d.fn), deferArgs(d), uint32(d.siz), uint32(d.siz)) + if gp._defer != d { + gothrow("bad defer entry in Goexit") + } + d._panic = nil + d.fn = nil + gp._defer = d.link + freedefer(d) + // Note: we ignore recovers here because Goexit isn't a panic + } + goexit() +} + +func canpanic(*g) bool + +// Print all currently active panics. Used when crashing. 
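The Goexit doc comment above makes two promises that are easy to check from user code: deferred calls still run, but recover returns nil because no panic is in progress. A short runnable sketch (assumes only the standard library):

package main

import (
	"fmt"
	"runtime"
)

func main() {
	done := make(chan struct{})
	go func() {
		defer close(done)
		defer func() {
			// Deferred functions run during Goexit, but since Goexit
			// is not a panic, recover has nothing to return.
			fmt.Println("recover() ==", recover()) // recover() == <nil>
		}()
		runtime.Goexit()
		fmt.Println("never reached")
	}()
	<-done
	fmt.Println("other goroutines keep running")
}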
+func printpanics(p *_panic) { + if p.link != nil { + printpanics(p.link) + print("\t") + } + print("panic: ") + printany(p.arg) + if p.recovered { + print(" [recovered]") + } + print("\n") +} + +// The implementation of the predeclared function panic. +func gopanic(e interface{}) { + gp := getg() + if gp.m.curg != gp { + gothrow("panic on m stack") + } + + // m.softfloat is set during software floating point. + // It increments m.locks to avoid preemption. + // We moved the memory loads out, so there shouldn't be + // any reason for it to panic anymore. + if gp.m.softfloat != 0 { + gp.m.locks-- + gp.m.softfloat = 0 + gothrow("panic during softfloat") + } + if gp.m.mallocing != 0 { + print("panic: ") + printany(e) + print("\n") + gothrow("panic during malloc") + } + if gp.m.gcing != 0 { + print("panic: ") + printany(e) + print("\n") + gothrow("panic during gc") + } + if gp.m.locks != 0 { + print("panic: ") + printany(e) + print("\n") + gothrow("panic holding locks") + } + + var p _panic + p.arg = e + p.link = gp._panic + gp._panic = (*_panic)(noescape(unsafe.Pointer(&p))) + + for { + d := gp._defer + if d == nil { + break + } + + // If defer was started by earlier panic or Goexit (and, since we're back here, that triggered a new panic), + // take defer off list. The earlier panic or Goexit will not continue running. + if d.started { + if d._panic != nil { + d._panic.aborted = true + } + d._panic = nil + d.fn = nil + gp._defer = d.link + freedefer(d) + continue + } + + // Mark defer as started, but keep on list, so that traceback + // can find and update the defer's argument frame if stack growth + // or a garbage collection hapens before reflectcall starts executing d.fn. + d.started = true + + // Record the panic that is running the defer. + // If there is a new panic during the deferred call, that panic + // will find d in the list and will mark d._panic (this panic) aborted. + d._panic = (*_panic)(noescape((unsafe.Pointer)(&p))) + + p.argp = unsafe.Pointer(getargp(0)) + reflectcall(unsafe.Pointer(d.fn), deferArgs(d), uint32(d.siz), uint32(d.siz)) + p.argp = nil + + // reflectcall did not panic. Remove d. + if gp._defer != d { + gothrow("bad defer entry in panic") + } + d._panic = nil + d.fn = nil + gp._defer = d.link + + // trigger shrinkage to test stack copy. See stack_test.go:TestStackPanic + //GC() + + pc := d.pc + argp := unsafe.Pointer(d.argp) // must be pointer so it gets adjusted during stack copy + freedefer(d) + if p.recovered { + gp._panic = p.link + // Aborted panics are marked but remain on the g.panic list. + // Remove them from the list. + for gp._panic != nil && gp._panic.aborted { + gp._panic = gp._panic.link + } + if gp._panic == nil { // must be done with signal + gp.sig = 0 + } + // Pass information about recovering frame to recovery. + gp.sigcode0 = uintptr(argp) + gp.sigcode1 = pc + mcall(recovery_m) + gothrow("recovery failed") // mcall should not return + } + } + + // ran out of deferred calls - old-school panic now + startpanic() + printpanics(gp._panic) + dopanic(0) // should not return + *(*int)(nil) = 0 // not reached +} + +// getargp returns the location where the caller +// writes outgoing function call arguments. +//go:nosplit +func getargp(x int) uintptr { + // x is an argument mainly so that we can return its address. + // However, we need to make the function complex enough + // that it won't be inlined. We always pass x = 0, so this code + // does nothing other than keep the compiler from thinking + // the function is simple enough to inline. 
+ if x > 0 { + return getcallersp(unsafe.Pointer(&x)) * 0 + } + return uintptr(noescape(unsafe.Pointer(&x))) +} + +// The implementation of the predeclared function recover. +// Cannot split the stack because it needs to reliably +// find the stack segment of its caller. +// +// TODO(rsc): Once we commit to CopyStackAlways, +// this doesn't need to be nosplit. +//go:nosplit +func gorecover(argp uintptr) interface{} { + // Must be in a function running as part of a deferred call during the panic. + // Must be called from the topmost function of the call + // (the function used in the defer statement). + // p.argp is the argument pointer of that topmost deferred function call. + // Compare against argp reported by caller. + // If they match, the caller is the one who can recover. + gp := getg() + p := gp._panic + if p != nil && !p.recovered && argp == uintptr(p.argp) { + p.recovered = true + return p.arg + } + return nil +} + +//go:nosplit +func startpanic() { + onM_signalok(startpanic_m) +} + +//go:nosplit +func dopanic(unused int) { + gp := getg() + mp := acquirem() + mp.ptrarg[0] = unsafe.Pointer(gp) + mp.scalararg[0] = getcallerpc((unsafe.Pointer)(&unused)) + mp.scalararg[1] = getcallersp((unsafe.Pointer)(&unused)) + onM_signalok(dopanic_m) // should never return + *(*int)(nil) = 0 +} + +//go:nosplit +func throw(s *byte) { + gp := getg() + if gp.m.throwing == 0 { + gp.m.throwing = 1 + } + startpanic() + print("fatal error: ", gostringnocopy(s), "\n") + dopanic(0) + *(*int)(nil) = 0 // not reached +} + +//go:nosplit +func gothrow(s string) { + gp := getg() + if gp.m.throwing == 0 { + gp.m.throwing = 1 + } + startpanic() + print("fatal error: ", s, "\n") + dopanic(0) + *(*int)(nil) = 0 // not reached +} diff --git a/src/runtime/parfor.c b/src/runtime/parfor.c new file mode 100644 index 000000000..e44956840 --- /dev/null +++ b/src/runtime/parfor.c @@ -0,0 +1,226 @@ +// Copyright 2012 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// Parallel for algorithm. + +#include "runtime.h" +#include "arch_GOARCH.h" +#include "malloc.h" + +struct ParForThread +{ + // the thread's iteration space [32lsb, 32msb) + uint64 pos; + // stats + uint64 nsteal; + uint64 nstealcnt; + uint64 nprocyield; + uint64 nosyield; + uint64 nsleep; + byte pad[CacheLineSize]; +}; + +void +runtime·parforsetup(ParFor *desc, uint32 nthr, uint32 n, void *ctx, bool wait, void (*body)(ParFor*, uint32)) +{ + uint32 i, begin, end; + uint64 *pos; + + if(desc == nil || nthr == 0 || nthr > desc->nthrmax || body == nil) { + runtime·printf("desc=%p nthr=%d count=%d body=%p\n", desc, nthr, n, body); + runtime·throw("parfor: invalid args"); + } + + desc->body = body; + desc->done = 0; + desc->nthr = nthr; + desc->thrseq = 0; + desc->cnt = n; + desc->ctx = ctx; + desc->wait = wait; + desc->nsteal = 0; + desc->nstealcnt = 0; + desc->nprocyield = 0; + desc->nosyield = 0; + desc->nsleep = 0; + for(i=0; i<nthr; i++) { + begin = (uint64)n*i / nthr; + end = (uint64)n*(i+1) / nthr; + pos = &desc->thr[i].pos; + if(((uintptr)pos & 7) != 0) + runtime·throw("parforsetup: pos is not aligned"); + *pos = (uint64)begin | (((uint64)end)<<32); + } +} + +void +runtime·parfordo(ParFor *desc) +{ + ParForThread *me; + uint32 tid, begin, end, begin2, try, victim, i; + uint64 *mypos, *victimpos, pos, newpos; + void (*body)(ParFor*, uint32); + bool idle; + + // Obtain 0-based thread index. 
+ tid = runtime·xadd(&desc->thrseq, 1) - 1; + if(tid >= desc->nthr) { + runtime·printf("tid=%d nthr=%d\n", tid, desc->nthr); + runtime·throw("parfor: invalid tid"); + } + + // If single-threaded, just execute the for serially. + if(desc->nthr==1) { + for(i=0; i<desc->cnt; i++) + desc->body(desc, i); + return; + } + + body = desc->body; + me = &desc->thr[tid]; + mypos = &me->pos; + for(;;) { + for(;;) { + // While there is local work, + // bump low index and execute the iteration. + pos = runtime·xadd64(mypos, 1); + begin = (uint32)pos-1; + end = (uint32)(pos>>32); + if(begin < end) { + body(desc, begin); + continue; + } + break; + } + + // Out of work, need to steal something. + idle = false; + for(try=0;; try++) { + // If we don't see any work for long enough, + // increment the done counter... + if(try > desc->nthr*4 && !idle) { + idle = true; + runtime·xadd(&desc->done, 1); + } + // ...if all threads have incremented the counter, + // we are done. + if(desc->done + !idle == desc->nthr) { + if(!idle) + runtime·xadd(&desc->done, 1); + goto exit; + } + // Choose a random victim for stealing. + victim = runtime·fastrand1() % (desc->nthr-1); + if(victim >= tid) + victim++; + victimpos = &desc->thr[victim].pos; + for(;;) { + // See if it has any work. + pos = runtime·atomicload64(victimpos); + begin = (uint32)pos; + end = (uint32)(pos>>32); + if(begin+1 >= end) { + begin = end = 0; + break; + } + if(idle) { + runtime·xadd(&desc->done, -1); + idle = false; + } + begin2 = begin + (end-begin)/2; + newpos = (uint64)begin | (uint64)begin2<<32; + if(runtime·cas64(victimpos, pos, newpos)) { + begin = begin2; + break; + } + } + if(begin < end) { + // Has successfully stolen some work. + if(idle) + runtime·throw("parfor: should not be idle"); + runtime·atomicstore64(mypos, (uint64)begin | (uint64)end<<32); + me->nsteal++; + me->nstealcnt += end-begin; + break; + } + // Backoff. + if(try < desc->nthr) { + // nothing + } else if (try < 4*desc->nthr) { + me->nprocyield++; + runtime·procyield(20); + // If a caller asked not to wait for the others, exit now + // (assume that most work is already done at this point). + } else if (!desc->wait) { + if(!idle) + runtime·xadd(&desc->done, 1); + goto exit; + } else if (try < 6*desc->nthr) { + me->nosyield++; + runtime·osyield(); + } else { + me->nsleep++; + runtime·usleep(1); + } + } + } +exit: + runtime·xadd64(&desc->nsteal, me->nsteal); + runtime·xadd64(&desc->nstealcnt, me->nstealcnt); + runtime·xadd64(&desc->nprocyield, me->nprocyield); + runtime·xadd64(&desc->nosyield, me->nosyield); + runtime·xadd64(&desc->nsleep, me->nsleep); + me->nsteal = 0; + me->nstealcnt = 0; + me->nprocyield = 0; + me->nosyield = 0; + me->nsleep = 0; +} + +// For testing from Go. 
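The stealing logic above hinges on packing a thread's remaining range [begin, end) into one uint64, with begin in the low half and end in the high half, so a single atomic operation can either claim one local iteration or split the range for a thief. A standalone Go sketch of that encoding (illustrative only; parfor itself remains the C code above):

package main

import (
	"fmt"
	"sync/atomic"
)

// pack and unpack mirror parfor's pos word: low 32 bits hold begin,
// high 32 bits hold end.
func pack(begin, end uint32) uint64       { return uint64(begin) | uint64(end)<<32 }
func unpack(p uint64) (begin, end uint32) { return uint32(p), uint32(p >> 32) }

func main() {
	mine := pack(0, 8) // this worker owns iterations [0, 8)

	// Local work: each atomic add claims the previous value of the low
	// half as the next iteration, like the xadd64(mypos, 1) in parfordo.
	for i := 0; i < 3; i++ {
		p := atomic.AddUint64(&mine, 1)
		claimed, end := unpack(p)
		claimed-- // the iteration just claimed
		if claimed >= end {
			break // range exhausted
		}
		fmt.Println("run iteration", claimed)
	}

	// Stealing: read the victim's word, CAS its end down to the
	// midpoint, and take the upper half [mid, end) for the thief.
	p := atomic.LoadUint64(&mine)
	begin, end := unpack(p)
	if begin+1 < end {
		mid := begin + (end-begin)/2
		if atomic.CompareAndSwapUint64(&mine, p, pack(begin, mid)) {
			fmt.Printf("thief takes [%d, %d), victim keeps [%d, %d)\n", mid, end, begin, mid)
		}
	}
}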
+void +runtime·newparfor_m(void) +{ + g->m->ptrarg[0] = runtime·parforalloc(g->m->scalararg[0]); +} + +void +runtime·parforsetup_m(void) +{ + ParFor *desc; + void *ctx; + void (*body)(ParFor*, uint32); + + desc = g->m->ptrarg[0]; + g->m->ptrarg[0] = nil; + ctx = g->m->ptrarg[1]; + g->m->ptrarg[1] = nil; + body = g->m->ptrarg[2]; + g->m->ptrarg[2] = nil; + + runtime·parforsetup(desc, g->m->scalararg[0], g->m->scalararg[1], ctx, g->m->scalararg[2], body); +} + +void +runtime·parfordo_m(void) +{ + ParFor *desc; + + desc = g->m->ptrarg[0]; + g->m->ptrarg[0] = nil; + runtime·parfordo(desc); +} + +void +runtime·parforiters_m(void) +{ + ParFor *desc; + uintptr tid; + + desc = g->m->ptrarg[0]; + g->m->ptrarg[0] = nil; + tid = g->m->scalararg[0]; + g->m->scalararg[0] = desc->thr[tid].pos; + g->m->scalararg[1] = desc->thr[tid].pos>>32; +} diff --git a/src/runtime/parfor_test.go b/src/runtime/parfor_test.go new file mode 100644 index 000000000..de64285b8 --- /dev/null +++ b/src/runtime/parfor_test.go @@ -0,0 +1,139 @@ +// Copyright 2012 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// The race detector does not understand ParFor synchronization. +// +build !race + +package runtime_test + +import ( + . "runtime" + "testing" + "unsafe" +) + +var gdata []uint64 + +// Simple serial sanity test for parallelfor. +func TestParFor(t *testing.T) { + const P = 1 + const N = 20 + data := make([]uint64, N) + for i := uint64(0); i < N; i++ { + data[i] = i + } + desc := NewParFor(P) + // Avoid making func a closure: parfor cannot invoke them. + // Since it doesn't happen in the C code, it's not worth doing + // just for the test. + gdata = data + ParForSetup(desc, P, N, nil, true, func(desc *ParFor, i uint32) { + data := gdata + data[i] = data[i]*data[i] + 1 + }) + ParForDo(desc) + for i := uint64(0); i < N; i++ { + if data[i] != i*i+1 { + t.Fatalf("Wrong element %d: %d", i, data[i]) + } + } +} + +// Test that nonblocking parallelfor does not block. +func TestParFor2(t *testing.T) { + const P = 7 + const N = 1003 + data := make([]uint64, N) + for i := uint64(0); i < N; i++ { + data[i] = i + } + desc := NewParFor(P) + ParForSetup(desc, P, N, (*byte)(unsafe.Pointer(&data)), false, func(desc *ParFor, i uint32) { + d := *(*[]uint64)(unsafe.Pointer(desc.Ctx)) + d[i] = d[i]*d[i] + 1 + }) + for p := 0; p < P; p++ { + ParForDo(desc) + } + for i := uint64(0); i < N; i++ { + if data[i] != i*i+1 { + t.Fatalf("Wrong element %d: %d", i, data[i]) + } + } +} + +// Test that iterations are properly distributed. +func TestParForSetup(t *testing.T) { + const P = 11 + const N = 101 + desc := NewParFor(P) + for n := uint32(0); n < N; n++ { + for p := uint32(1); p <= P; p++ { + ParForSetup(desc, p, n, nil, true, func(desc *ParFor, i uint32) {}) + sum := uint32(0) + size0 := uint32(0) + end0 := uint32(0) + for i := uint32(0); i < p; i++ { + begin, end := ParForIters(desc, i) + size := end - begin + sum += size + if i == 0 { + size0 = size + if begin != 0 { + t.Fatalf("incorrect begin: %d (n=%d, p=%d)", begin, n, p) + } + } else { + if size != size0 && size != size0+1 { + t.Fatalf("incorrect size: %d/%d (n=%d, p=%d)", size, size0, n, p) + } + if begin != end0 { + t.Fatalf("incorrect begin/end: %d/%d (n=%d, p=%d)", begin, end0, n, p) + } + } + end0 = end + } + if sum != n { + t.Fatalf("incorrect sum: %d/%d (p=%d)", sum, n, p) + } + } + } +} + +// Test parallel parallelfor. 
+func TestParForParallel(t *testing.T) { + N := uint64(1e7) + if testing.Short() { + N /= 10 + } + data := make([]uint64, N) + for i := uint64(0); i < N; i++ { + data[i] = i + } + P := GOMAXPROCS(-1) + c := make(chan bool, P) + desc := NewParFor(uint32(P)) + gdata = data + ParForSetup(desc, uint32(P), uint32(N), nil, false, func(desc *ParFor, i uint32) { + data := gdata + data[i] = data[i]*data[i] + 1 + }) + for p := 1; p < P; p++ { + go func() { + ParForDo(desc) + c <- true + }() + } + ParForDo(desc) + for p := 1; p < P; p++ { + <-c + } + for i := uint64(0); i < N; i++ { + if data[i] != i*i+1 { + t.Fatalf("Wrong element %d: %d", i, data[i]) + } + } + + data, desc = nil, nil + GC() +} diff --git a/src/runtime/pprof/mprof_test.go b/src/runtime/pprof/mprof_test.go new file mode 100644 index 000000000..ebf53dd66 --- /dev/null +++ b/src/runtime/pprof/mprof_test.go @@ -0,0 +1,99 @@ +// Copyright 2014 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package pprof_test + +import ( + "bytes" + "fmt" + "regexp" + "runtime" + . "runtime/pprof" + "testing" + "unsafe" +) + +var memSink interface{} + +func allocateTransient1M() { + for i := 0; i < 1024; i++ { + memSink = &struct{ x [1024]byte }{} + } +} + +func allocateTransient2M() { + // prevent inlining + if memSink == nil { + panic("bad") + } + memSink = make([]byte, 2<<20) +} + +type Obj32 struct { + link *Obj32 + pad [32 - unsafe.Sizeof(uintptr(0))]byte +} + +var persistentMemSink *Obj32 + +func allocatePersistent1K() { + for i := 0; i < 32; i++ { + // Can't use slice because that will introduce implicit allocations. + obj := &Obj32{link: persistentMemSink} + persistentMemSink = obj + } +} + +var memoryProfilerRun = 0 + +func TestMemoryProfiler(t *testing.T) { + // Disable sampling, otherwise it's difficult to assert anything. + oldRate := runtime.MemProfileRate + runtime.MemProfileRate = 1 + defer func() { + runtime.MemProfileRate = oldRate + }() + + // Allocate a meg to ensure that mcache.next_sample is updated to 1. + for i := 0; i < 1024; i++ { + memSink = make([]byte, 1024) + } + + // Do the interesting allocations. 
+ allocateTransient1M() + allocateTransient2M() + allocatePersistent1K() + memSink = nil + + runtime.GC() // materialize stats + var buf bytes.Buffer + if err := Lookup("heap").WriteTo(&buf, 1); err != nil { + t.Fatalf("failed to write heap profile: %v", err) + } + + memoryProfilerRun++ + + tests := []string{ + fmt.Sprintf(`%v: %v \[%v: %v\] @ 0x[0-9,a-f]+ 0x[0-9,a-f]+ 0x[0-9,a-f]+ 0x[0-9,a-f]+ +# 0x[0-9,a-f]+ runtime/pprof_test\.allocatePersistent1K\+0x[0-9,a-f]+ .*/runtime/pprof/mprof_test\.go:43 +# 0x[0-9,a-f]+ runtime/pprof_test\.TestMemoryProfiler\+0x[0-9,a-f]+ .*/runtime/pprof/mprof_test\.go:66 +`, 32*memoryProfilerRun, 1024*memoryProfilerRun, 32*memoryProfilerRun, 1024*memoryProfilerRun), + + fmt.Sprintf(`0: 0 \[%v: %v\] @ 0x[0-9,a-f]+ 0x[0-9,a-f]+ 0x[0-9,a-f]+ 0x[0-9,a-f]+ +# 0x[0-9,a-f]+ runtime/pprof_test\.allocateTransient1M\+0x[0-9,a-f]+ .*/runtime/pprof/mprof_test.go:21 +# 0x[0-9,a-f]+ runtime/pprof_test\.TestMemoryProfiler\+0x[0-9,a-f]+ .*/runtime/pprof/mprof_test.go:64 +`, (1<<10)*memoryProfilerRun, (1<<20)*memoryProfilerRun), + + fmt.Sprintf(`0: 0 \[%v: %v\] @ 0x[0-9,a-f]+ 0x[0-9,a-f]+ 0x[0-9,a-f]+ 0x[0-9,a-f]+ 0x[0-9,a-f]+ +# 0x[0-9,a-f]+ runtime/pprof_test\.allocateTransient2M\+0x[0-9,a-f]+ .*/runtime/pprof/mprof_test.go:30 +# 0x[0-9,a-f]+ runtime/pprof_test\.TestMemoryProfiler\+0x[0-9,a-f]+ .*/runtime/pprof/mprof_test.go:65 +`, memoryProfilerRun, (2<<20)*memoryProfilerRun), + } + + for _, test := range tests { + if !regexp.MustCompile(test).Match(buf.Bytes()) { + t.Fatalf("The entry did not match:\n%v\n\nProfile:\n%v\n", test, buf.String()) + } + } +} diff --git a/src/runtime/pprof/pprof.go b/src/runtime/pprof/pprof.go new file mode 100644 index 000000000..236de54f3 --- /dev/null +++ b/src/runtime/pprof/pprof.go @@ -0,0 +1,673 @@ +// Copyright 2010 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// Package pprof writes runtime profiling data in the format expected +// by the pprof visualization tool. +// For more information about pprof, see +// http://code.google.com/p/google-perftools/. +package pprof + +import ( + "bufio" + "bytes" + "fmt" + "io" + "runtime" + "sort" + "strings" + "sync" + "text/tabwriter" +) + +// BUG(rsc): Profiles are incomplete and inaccurate on NetBSD and OS X. +// See http://golang.org/issue/6047 for details. + +// A Profile is a collection of stack traces showing the call sequences +// that led to instances of a particular event, such as allocation. +// Packages can create and maintain their own profiles; the most common +// use is for tracking resources that must be explicitly closed, such as files +// or network connections. +// +// A Profile's methods can be called from multiple goroutines simultaneously. +// +// Each Profile has a unique name. A few profiles are predefined: +// +// goroutine - stack traces of all current goroutines +// heap - a sampling of all heap allocations +// threadcreate - stack traces that led to the creation of new OS threads +// block - stack traces that led to blocking on synchronization primitives +// +// These predefined profiles maintain themselves and panic on an explicit +// Add or Remove method call. +// +// The CPU profile is not available as a Profile. It has a special API, +// the StartCPUProfile and StopCPUProfile functions, because it streams +// output to a writer during profiling. 
+// +type Profile struct { + name string + mu sync.Mutex + m map[interface{}][]uintptr + count func() int + write func(io.Writer, int) error +} + +// profiles records all registered profiles. +var profiles struct { + mu sync.Mutex + m map[string]*Profile +} + +var goroutineProfile = &Profile{ + name: "goroutine", + count: countGoroutine, + write: writeGoroutine, +} + +var threadcreateProfile = &Profile{ + name: "threadcreate", + count: countThreadCreate, + write: writeThreadCreate, +} + +var heapProfile = &Profile{ + name: "heap", + count: countHeap, + write: writeHeap, +} + +var blockProfile = &Profile{ + name: "block", + count: countBlock, + write: writeBlock, +} + +func lockProfiles() { + profiles.mu.Lock() + if profiles.m == nil { + // Initial built-in profiles. + profiles.m = map[string]*Profile{ + "goroutine": goroutineProfile, + "threadcreate": threadcreateProfile, + "heap": heapProfile, + "block": blockProfile, + } + } +} + +func unlockProfiles() { + profiles.mu.Unlock() +} + +// NewProfile creates a new profile with the given name. +// If a profile with that name already exists, NewProfile panics. +// The convention is to use a 'import/path.' prefix to create +// separate name spaces for each package. +func NewProfile(name string) *Profile { + lockProfiles() + defer unlockProfiles() + if name == "" { + panic("pprof: NewProfile with empty name") + } + if profiles.m[name] != nil { + panic("pprof: NewProfile name already in use: " + name) + } + p := &Profile{ + name: name, + m: map[interface{}][]uintptr{}, + } + profiles.m[name] = p + return p +} + +// Lookup returns the profile with the given name, or nil if no such profile exists. +func Lookup(name string) *Profile { + lockProfiles() + defer unlockProfiles() + return profiles.m[name] +} + +// Profiles returns a slice of all the known profiles, sorted by name. +func Profiles() []*Profile { + lockProfiles() + defer unlockProfiles() + + var all []*Profile + for _, p := range profiles.m { + all = append(all, p) + } + + sort.Sort(byName(all)) + return all +} + +type byName []*Profile + +func (x byName) Len() int { return len(x) } +func (x byName) Swap(i, j int) { x[i], x[j] = x[j], x[i] } +func (x byName) Less(i, j int) bool { return x[i].name < x[j].name } + +// Name returns this profile's name, which can be passed to Lookup to reobtain the profile. +func (p *Profile) Name() string { + return p.name +} + +// Count returns the number of execution stacks currently in the profile. +func (p *Profile) Count() int { + p.mu.Lock() + defer p.mu.Unlock() + if p.count != nil { + return p.count() + } + return len(p.m) +} + +// Add adds the current execution stack to the profile, associated with value. +// Add stores value in an internal map, so value must be suitable for use as +// a map key and will not be garbage collected until the corresponding +// call to Remove. Add panics if the profile already contains a stack for value. +// +// The skip parameter has the same meaning as runtime.Caller's skip +// and controls where the stack trace begins. Passing skip=0 begins the +// trace in the function calling Add. For example, given this +// execution stack: +// +// Add +// called from rpc.NewClient +// called from mypkg.Run +// called from main.main +// +// Passing skip=0 begins the stack trace at the call to Add inside rpc.NewClient. +// Passing skip=1 begins the stack trace at the call to NewClient inside mypkg.Run. 
+// +func (p *Profile) Add(value interface{}, skip int) { + if p.name == "" { + panic("pprof: use of uninitialized Profile") + } + if p.write != nil { + panic("pprof: Add called on built-in Profile " + p.name) + } + + stk := make([]uintptr, 32) + n := runtime.Callers(skip+1, stk[:]) + + p.mu.Lock() + defer p.mu.Unlock() + if p.m[value] != nil { + panic("pprof: Profile.Add of duplicate value") + } + p.m[value] = stk[:n] +} + +// Remove removes the execution stack associated with value from the profile. +// It is a no-op if the value is not in the profile. +func (p *Profile) Remove(value interface{}) { + p.mu.Lock() + defer p.mu.Unlock() + delete(p.m, value) +} + +// WriteTo writes a pprof-formatted snapshot of the profile to w. +// If a write to w returns an error, WriteTo returns that error. +// Otherwise, WriteTo returns nil. +// +// The debug parameter enables additional output. +// Passing debug=0 prints only the hexadecimal addresses that pprof needs. +// Passing debug=1 adds comments translating addresses to function names +// and line numbers, so that a programmer can read the profile without tools. +// +// The predefined profiles may assign meaning to other debug values; +// for example, when printing the "goroutine" profile, debug=2 means to +// print the goroutine stacks in the same form that a Go program uses +// when dying due to an unrecovered panic. +func (p *Profile) WriteTo(w io.Writer, debug int) error { + if p.name == "" { + panic("pprof: use of zero Profile") + } + if p.write != nil { + return p.write(w, debug) + } + + // Obtain consistent snapshot under lock; then process without lock. + var all [][]uintptr + p.mu.Lock() + for _, stk := range p.m { + all = append(all, stk) + } + p.mu.Unlock() + + // Map order is non-deterministic; make output deterministic. + sort.Sort(stackProfile(all)) + + return printCountProfile(w, debug, p.name, stackProfile(all)) +} + +type stackProfile [][]uintptr + +func (x stackProfile) Len() int { return len(x) } +func (x stackProfile) Stack(i int) []uintptr { return x[i] } +func (x stackProfile) Swap(i, j int) { x[i], x[j] = x[j], x[i] } +func (x stackProfile) Less(i, j int) bool { + t, u := x[i], x[j] + for k := 0; k < len(t) && k < len(u); k++ { + if t[k] != u[k] { + return t[k] < u[k] + } + } + return len(t) < len(u) +} + +// A countProfile is a set of stack traces to be printed as counts +// grouped by stack trace. There are multiple implementations: +// all that matters is that we can find out how many traces there are +// and obtain each trace in turn. +type countProfile interface { + Len() int + Stack(i int) []uintptr +} + +// printCountProfile prints a countProfile at the specified debug level. +func printCountProfile(w io.Writer, debug int, name string, p countProfile) error { + b := bufio.NewWriter(w) + var tw *tabwriter.Writer + w = b + if debug > 0 { + tw = tabwriter.NewWriter(w, 1, 8, 1, '\t', 0) + w = tw + } + + fmt.Fprintf(w, "%s profile: total %d\n", name, p.Len()) + + // Build count of each stack. + var buf bytes.Buffer + key := func(stk []uintptr) string { + buf.Reset() + fmt.Fprintf(&buf, "@") + for _, pc := range stk { + fmt.Fprintf(&buf, " %#x", pc) + } + return buf.String() + } + m := map[string]int{} + n := p.Len() + for i := 0; i < n; i++ { + m[key(p.Stack(i))]++ + } + + // Print stacks, listing count on first occurrence of a unique stack. 
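The NewProfile/Add/Remove/WriteTo API documented above is intended for tracking resources that should eventually be released. A compact usage sketch (the profile name and the cookie type are invented for this example; the pprof calls are the real package API):

package main

import (
	"os"
	"runtime/pprof"
)

// cookieProfile tracks cookies that have been opened but not yet closed.
var cookieProfile = pprof.NewProfile("example.com/demo.cookies")

type cookie struct{ id int }

func open(id int) *cookie {
	c := &cookie{id: id}
	cookieProfile.Add(c, 1) // skip=1: start the recorded stack at open's caller
	return c
}

func (c *cookie) close() {
	cookieProfile.Remove(c)
}

func main() {
	a, b := open(1), open(2)
	b.close()
	defer a.close()
	// debug=1 annotates addresses with function names and line numbers;
	// only the still-open cookie "a" shows up in the output.
	cookieProfile.WriteTo(os.Stdout, 1)
}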
+ for i := 0; i < n; i++ { + stk := p.Stack(i) + s := key(stk) + if count := m[s]; count != 0 { + fmt.Fprintf(w, "%d %s\n", count, s) + if debug > 0 { + printStackRecord(w, stk, false) + } + delete(m, s) + } + } + + if tw != nil { + tw.Flush() + } + return b.Flush() +} + +// printStackRecord prints the function + source line information +// for a single stack trace. +func printStackRecord(w io.Writer, stk []uintptr, allFrames bool) { + show := allFrames + wasPanic := false + for i, pc := range stk { + f := runtime.FuncForPC(pc) + if f == nil { + show = true + fmt.Fprintf(w, "#\t%#x\n", pc) + wasPanic = false + } else { + tracepc := pc + // Back up to call instruction. + if i > 0 && pc > f.Entry() && !wasPanic { + if runtime.GOARCH == "386" || runtime.GOARCH == "amd64" { + tracepc-- + } else { + tracepc -= 4 // arm, etc + } + } + file, line := f.FileLine(tracepc) + name := f.Name() + // Hide runtime.goexit and any runtime functions at the beginning. + // This is useful mainly for allocation traces. + wasPanic = name == "runtime.panic" + if name == "runtime.goexit" || !show && strings.HasPrefix(name, "runtime.") { + continue + } + show = true + fmt.Fprintf(w, "#\t%#x\t%s+%#x\t%s:%d\n", pc, name, pc-f.Entry(), file, line) + } + } + if !show { + // We didn't print anything; do it again, + // and this time include runtime functions. + printStackRecord(w, stk, true) + return + } + fmt.Fprintf(w, "\n") +} + +// Interface to system profiles. + +type byInUseBytes []runtime.MemProfileRecord + +func (x byInUseBytes) Len() int { return len(x) } +func (x byInUseBytes) Swap(i, j int) { x[i], x[j] = x[j], x[i] } +func (x byInUseBytes) Less(i, j int) bool { return x[i].InUseBytes() > x[j].InUseBytes() } + +// WriteHeapProfile is shorthand for Lookup("heap").WriteTo(w, 0). +// It is preserved for backwards compatibility. +func WriteHeapProfile(w io.Writer) error { + return writeHeap(w, 0) +} + +// countHeap returns the number of records in the heap profile. +func countHeap() int { + n, _ := runtime.MemProfile(nil, true) + return n +} + +// writeHeap writes the current runtime heap profile to w. +func writeHeap(w io.Writer, debug int) error { + // Find out how many records there are (MemProfile(nil, true)), + // allocate that many records, and get the data. + // There's a race—more records might be added between + // the two calls—so allocate a few extra records for safety + // and also try again if we're very unlucky. + // The loop should only execute one iteration in the common case. + var p []runtime.MemProfileRecord + n, ok := runtime.MemProfile(nil, true) + for { + // Allocate room for a slightly bigger profile, + // in case a few more entries have been added + // since the call to MemProfile. + p = make([]runtime.MemProfileRecord, n+50) + n, ok = runtime.MemProfile(p, true) + if ok { + p = p[0:n] + break + } + // Profile grew; try again. + } + + sort.Sort(byInUseBytes(p)) + + b := bufio.NewWriter(w) + var tw *tabwriter.Writer + w = b + if debug > 0 { + tw = tabwriter.NewWriter(w, 1, 8, 1, '\t', 0) + w = tw + } + + var total runtime.MemProfileRecord + for i := range p { + r := &p[i] + total.AllocBytes += r.AllocBytes + total.AllocObjects += r.AllocObjects + total.FreeBytes += r.FreeBytes + total.FreeObjects += r.FreeObjects + } + + // Technically the rate is MemProfileRate not 2*MemProfileRate, + // but early versions of the C++ heap profiler reported 2*MemProfileRate, + // so that's what pprof has come to expect. 
+ fmt.Fprintf(w, "heap profile: %d: %d [%d: %d] @ heap/%d\n", + total.InUseObjects(), total.InUseBytes(), + total.AllocObjects, total.AllocBytes, + 2*runtime.MemProfileRate) + + for i := range p { + r := &p[i] + fmt.Fprintf(w, "%d: %d [%d: %d] @", + r.InUseObjects(), r.InUseBytes(), + r.AllocObjects, r.AllocBytes) + for _, pc := range r.Stack() { + fmt.Fprintf(w, " %#x", pc) + } + fmt.Fprintf(w, "\n") + if debug > 0 { + printStackRecord(w, r.Stack(), false) + } + } + + // Print memstats information too. + // Pprof will ignore, but useful for people + if debug > 0 { + s := new(runtime.MemStats) + runtime.ReadMemStats(s) + fmt.Fprintf(w, "\n# runtime.MemStats\n") + fmt.Fprintf(w, "# Alloc = %d\n", s.Alloc) + fmt.Fprintf(w, "# TotalAlloc = %d\n", s.TotalAlloc) + fmt.Fprintf(w, "# Sys = %d\n", s.Sys) + fmt.Fprintf(w, "# Lookups = %d\n", s.Lookups) + fmt.Fprintf(w, "# Mallocs = %d\n", s.Mallocs) + fmt.Fprintf(w, "# Frees = %d\n", s.Frees) + + fmt.Fprintf(w, "# HeapAlloc = %d\n", s.HeapAlloc) + fmt.Fprintf(w, "# HeapSys = %d\n", s.HeapSys) + fmt.Fprintf(w, "# HeapIdle = %d\n", s.HeapIdle) + fmt.Fprintf(w, "# HeapInuse = %d\n", s.HeapInuse) + fmt.Fprintf(w, "# HeapReleased = %d\n", s.HeapReleased) + fmt.Fprintf(w, "# HeapObjects = %d\n", s.HeapObjects) + + fmt.Fprintf(w, "# Stack = %d / %d\n", s.StackInuse, s.StackSys) + fmt.Fprintf(w, "# MSpan = %d / %d\n", s.MSpanInuse, s.MSpanSys) + fmt.Fprintf(w, "# MCache = %d / %d\n", s.MCacheInuse, s.MCacheSys) + fmt.Fprintf(w, "# BuckHashSys = %d\n", s.BuckHashSys) + + fmt.Fprintf(w, "# NextGC = %d\n", s.NextGC) + fmt.Fprintf(w, "# PauseNs = %d\n", s.PauseNs) + fmt.Fprintf(w, "# NumGC = %d\n", s.NumGC) + fmt.Fprintf(w, "# EnableGC = %v\n", s.EnableGC) + fmt.Fprintf(w, "# DebugGC = %v\n", s.DebugGC) + } + + if tw != nil { + tw.Flush() + } + return b.Flush() +} + +// countThreadCreate returns the size of the current ThreadCreateProfile. +func countThreadCreate() int { + n, _ := runtime.ThreadCreateProfile(nil) + return n +} + +// writeThreadCreate writes the current runtime ThreadCreateProfile to w. +func writeThreadCreate(w io.Writer, debug int) error { + return writeRuntimeProfile(w, debug, "threadcreate", runtime.ThreadCreateProfile) +} + +// countGoroutine returns the number of goroutines. +func countGoroutine() int { + return runtime.NumGoroutine() +} + +// writeGoroutine writes the current runtime GoroutineProfile to w. +func writeGoroutine(w io.Writer, debug int) error { + if debug >= 2 { + return writeGoroutineStacks(w) + } + return writeRuntimeProfile(w, debug, "goroutine", runtime.GoroutineProfile) +} + +func writeGoroutineStacks(w io.Writer) error { + // We don't know how big the buffer needs to be to collect + // all the goroutines. Start with 1 MB and try a few times, doubling each time. + // Give up and use a truncated trace if 64 MB is not enough. + buf := make([]byte, 1<<20) + for i := 0; ; i++ { + n := runtime.Stack(buf, true) + if n < len(buf) { + buf = buf[:n] + break + } + if len(buf) >= 64<<20 { + // Filled 64 MB - stop there. + break + } + buf = make([]byte, 2*len(buf)) + } + _, err := w.Write(buf) + return err +} + +func writeRuntimeProfile(w io.Writer, debug int, name string, fetch func([]runtime.StackRecord) (int, bool)) error { + // Find out how many records there are (fetch(nil)), + // allocate that many records, and get the data. + // There's a race—more records might be added between + // the two calls—so allocate a few extra records for safety + // and also try again if we're very unlucky. 
+ // The loop should only execute one iteration in the common case. + var p []runtime.StackRecord + n, ok := fetch(nil) + for { + // Allocate room for a slightly bigger profile, + // in case a few more entries have been added + // since the call to ThreadProfile. + p = make([]runtime.StackRecord, n+10) + n, ok = fetch(p) + if ok { + p = p[0:n] + break + } + // Profile grew; try again. + } + + return printCountProfile(w, debug, name, runtimeProfile(p)) +} + +type runtimeProfile []runtime.StackRecord + +func (p runtimeProfile) Len() int { return len(p) } +func (p runtimeProfile) Stack(i int) []uintptr { return p[i].Stack() } + +var cpu struct { + sync.Mutex + profiling bool + done chan bool +} + +// StartCPUProfile enables CPU profiling for the current process. +// While profiling, the profile will be buffered and written to w. +// StartCPUProfile returns an error if profiling is already enabled. +func StartCPUProfile(w io.Writer) error { + // The runtime routines allow a variable profiling rate, + // but in practice operating systems cannot trigger signals + // at more than about 500 Hz, and our processing of the + // signal is not cheap (mostly getting the stack trace). + // 100 Hz is a reasonable choice: it is frequent enough to + // produce useful data, rare enough not to bog down the + // system, and a nice round number to make it easy to + // convert sample counts to seconds. Instead of requiring + // each client to specify the frequency, we hard code it. + const hz = 100 + + cpu.Lock() + defer cpu.Unlock() + if cpu.done == nil { + cpu.done = make(chan bool) + } + // Double-check. + if cpu.profiling { + return fmt.Errorf("cpu profiling already in use") + } + cpu.profiling = true + runtime.SetCPUProfileRate(hz) + go profileWriter(w) + return nil +} + +func profileWriter(w io.Writer) { + for { + data := runtime.CPUProfile() + if data == nil { + break + } + w.Write(data) + } + cpu.done <- true +} + +// StopCPUProfile stops the current CPU profile, if any. +// StopCPUProfile only returns after all the writes for the +// profile have completed. +func StopCPUProfile() { + cpu.Lock() + defer cpu.Unlock() + + if !cpu.profiling { + return + } + cpu.profiling = false + runtime.SetCPUProfileRate(0) + <-cpu.done +} + +type byCycles []runtime.BlockProfileRecord + +func (x byCycles) Len() int { return len(x) } +func (x byCycles) Swap(i, j int) { x[i], x[j] = x[j], x[i] } +func (x byCycles) Less(i, j int) bool { return x[i].Cycles > x[j].Cycles } + +// countBlock returns the number of records in the blocking profile. +func countBlock() int { + n, _ := runtime.BlockProfile(nil) + return n +} + +// writeBlock writes the current blocking profile to w. 
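StartCPUProfile and StopCPUProfile above form the streaming API mentioned in the package comment: samples are written to w while profiling runs, and StopCPUProfile waits for the writer goroutine to flush. Typical use looks like the following sketch (the file name and workload are placeholders):

package main

import (
	"log"
	"os"
	"runtime/pprof"
)

// busyWork stands in for whatever the program actually does.
func busyWork() {
	x := 0
	for i := 0; i < 1e8; i++ {
		x += i % 7
	}
	_ = x
}

func main() {
	f, err := os.Create("cpu.prof")
	if err != nil {
		log.Fatal(err)
	}
	defer f.Close()

	if err := pprof.StartCPUProfile(f); err != nil {
		log.Fatal(err)
	}
	busyWork()
	// StopCPUProfile only returns after the profile writer has
	// finished flushing samples to f.
	pprof.StopCPUProfile()
}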
+func writeBlock(w io.Writer, debug int) error { + var p []runtime.BlockProfileRecord + n, ok := runtime.BlockProfile(nil) + for { + p = make([]runtime.BlockProfileRecord, n+50) + n, ok = runtime.BlockProfile(p) + if ok { + p = p[:n] + break + } + } + + sort.Sort(byCycles(p)) + + b := bufio.NewWriter(w) + var tw *tabwriter.Writer + w = b + if debug > 0 { + tw = tabwriter.NewWriter(w, 1, 8, 1, '\t', 0) + w = tw + } + + fmt.Fprintf(w, "--- contention:\n") + fmt.Fprintf(w, "cycles/second=%v\n", runtime_cyclesPerSecond()) + for i := range p { + r := &p[i] + fmt.Fprintf(w, "%v %v @", r.Cycles, r.Count) + for _, pc := range r.Stack() { + fmt.Fprintf(w, " %#x", pc) + } + fmt.Fprint(w, "\n") + if debug > 0 { + printStackRecord(w, r.Stack(), true) + } + } + + if tw != nil { + tw.Flush() + } + return b.Flush() +} + +func runtime_cyclesPerSecond() int64 diff --git a/src/runtime/pprof/pprof_test.go b/src/runtime/pprof/pprof_test.go new file mode 100644 index 000000000..8677cb30c --- /dev/null +++ b/src/runtime/pprof/pprof_test.go @@ -0,0 +1,452 @@ +// Copyright 2011 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// +build !nacl + +package pprof_test + +import ( + "bytes" + "fmt" + "math/big" + "os/exec" + "regexp" + "runtime" + . "runtime/pprof" + "strings" + "sync" + "testing" + "time" + "unsafe" +) + +func cpuHogger(f func()) { + // We only need to get one 100 Hz clock tick, so we've got + // a 25x safety buffer. + // But do at least 500 iterations (which should take about 100ms), + // otherwise TestCPUProfileMultithreaded can fail if only one + // thread is scheduled during the 250ms period. + t0 := time.Now() + for i := 0; i < 500 || time.Since(t0) < 250*time.Millisecond; i++ { + f() + } +} + +var ( + salt1 = 0 + salt2 = 0 +) + +// The actual CPU hogging function. +// Must not call other functions nor access heap/globals in the loop, +// otherwise under race detector the samples will be in the race runtime. +func cpuHog1() { + foo := salt1 + for i := 0; i < 1e5; i++ { + if foo > 0 { + foo *= foo + } else { + foo *= foo + 1 + } + } + salt1 = foo +} + +func cpuHog2() { + foo := salt2 + for i := 0; i < 1e5; i++ { + if foo > 0 { + foo *= foo + } else { + foo *= foo + 2 + } + } + salt2 = foo +} + +func TestCPUProfile(t *testing.T) { + testCPUProfile(t, []string{"runtime/pprof_test.cpuHog1"}, func() { + cpuHogger(cpuHog1) + }) +} + +func TestCPUProfileMultithreaded(t *testing.T) { + defer runtime.GOMAXPROCS(runtime.GOMAXPROCS(2)) + testCPUProfile(t, []string{"runtime/pprof_test.cpuHog1", "runtime/pprof_test.cpuHog2"}, func() { + c := make(chan int) + go func() { + cpuHogger(cpuHog1) + c <- 1 + }() + cpuHogger(cpuHog2) + <-c + }) +} + +func parseProfile(t *testing.T, bytes []byte, f func(uintptr, []uintptr)) { + // Convert []byte to []uintptr. + l := len(bytes) / int(unsafe.Sizeof(uintptr(0))) + val := *(*[]uintptr)(unsafe.Pointer(&bytes)) + val = val[:l] + + // 5 for the header, 2 for the per-sample header on at least one sample, 3 for the trailer. 
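StartCPUProfile and StopCPUProfile above are the pair the tests below exercise; typical client usage, sketched here with an illustrative output file name:

package main

import (
	"log"
	"os"
	"runtime/pprof"
)

func main() {
	f, err := os.Create("cpu.prof") // illustrative destination
	if err != nil {
		log.Fatal(err)
	}
	defer f.Close()
	if err := pprof.StartCPUProfile(f); err != nil { // samples at the fixed 100 Hz rate
		log.Fatal(err)
	}
	defer pprof.StopCPUProfile() // flushes buffered samples before the file is closed

	// ... workload to be profiled goes here ...
}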
+ if l < 5+2+3 { + t.Logf("profile too short: %#x", val) + if badOS[runtime.GOOS] { + t.Skipf("ignoring failure on %s; see golang.org/issue/6047", runtime.GOOS) + return + } + t.FailNow() + } + + hd, val, tl := val[:5], val[5:l-3], val[l-3:] + if hd[0] != 0 || hd[1] != 3 || hd[2] != 0 || hd[3] != 1e6/100 || hd[4] != 0 { + t.Fatalf("unexpected header %#x", hd) + } + + if tl[0] != 0 || tl[1] != 1 || tl[2] != 0 { + t.Fatalf("malformed end-of-data marker %#x", tl) + } + + for len(val) > 0 { + if len(val) < 2 || val[0] < 1 || val[1] < 1 || uintptr(len(val)) < 2+val[1] { + t.Fatalf("malformed profile. leftover: %#x", val) + } + f(val[0], val[2:2+val[1]]) + val = val[2+val[1]:] + } +} + +func testCPUProfile(t *testing.T, need []string, f func()) { + switch runtime.GOOS { + case "darwin": + out, err := exec.Command("uname", "-a").CombinedOutput() + if err != nil { + t.Fatal(err) + } + vers := string(out) + t.Logf("uname -a: %v", vers) + case "plan9": + // unimplemented + return + } + + var prof bytes.Buffer + if err := StartCPUProfile(&prof); err != nil { + t.Fatal(err) + } + f() + StopCPUProfile() + + // Check that profile is well formed and contains need. + have := make([]uintptr, len(need)) + parseProfile(t, prof.Bytes(), func(count uintptr, stk []uintptr) { + for _, pc := range stk { + f := runtime.FuncForPC(pc) + if f == nil { + continue + } + for i, name := range need { + if strings.Contains(f.Name(), name) { + have[i] += count + } + } + } + }) + + if len(need) == 0 { + return + } + + var total uintptr + for i, name := range need { + total += have[i] + t.Logf("%s: %d\n", name, have[i]) + } + ok := true + if total == 0 { + t.Logf("no CPU profile samples collected") + ok = false + } + // We'd like to check a reasonable minimum, like + // total / len(have) / smallconstant, but this test is + // pretty flaky (see bug 7095). So we'll just test to + // make sure we got at least one sample. + min := uintptr(1) + for i, name := range need { + if have[i] < min { + t.Logf("%s has %d samples out of %d, want at least %d, ideally %d", name, have[i], total, min, total/uintptr(len(have))) + ok = false + } + } + + if !ok { + if badOS[runtime.GOOS] { + t.Skipf("ignoring failure on %s; see golang.org/issue/6047", runtime.GOOS) + return + } + t.FailNow() + } +} + +func TestCPUProfileWithFork(t *testing.T) { + // Fork can hang if preempted with signals frequently enough (see issue 5517). + // Ensure that we do not do this. + heap := 1 << 30 + if testing.Short() { + heap = 100 << 20 + } + // This makes fork slower. + garbage := make([]byte, heap) + // Need to touch the slice, otherwise it won't be paged in. + done := make(chan bool) + go func() { + for i := range garbage { + garbage[i] = 42 + } + done <- true + }() + <-done + + var prof bytes.Buffer + if err := StartCPUProfile(&prof); err != nil { + t.Fatal(err) + } + defer StopCPUProfile() + + for i := 0; i < 10; i++ { + exec.Command("go").CombinedOutput() + } +} + +// Test that profiler does not observe runtime.gogo as "user" goroutine execution. +// If it did, it would see inconsistent state and would either record an incorrect stack +// or crash because the stack was malformed. +func TestGoroutineSwitch(t *testing.T) { + // How much to try. These defaults take about 1 seconds + // on a 2012 MacBook Pro. The ones in short mode take + // about 0.1 seconds. 
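parseProfile above walks the legacy machine-word CPU profile stream: a 5-word header whose word 3 is the sampling period in microseconds, repeated {count, n, pc_1..pc_n} samples, and a {0, 1, 0} end marker. A stand-alone decoding sketch of that layout; the function name and the synthetic PC value are illustrative:

package main

import "fmt"

// decodeLegacyCPUProfile walks the word stream checked by parseProfile above:
// 5-word header, {count, n, pcs...} samples, then the {0, 1, 0} end marker.
func decodeLegacyCPUProfile(words []uintptr, sample func(count uintptr, stk []uintptr)) error {
	if len(words) < 5+3 {
		return fmt.Errorf("profile too short: %#x", words)
	}
	hd, body, tl := words[:5], words[5:len(words)-3], words[len(words)-3:]
	if hd[0] != 0 || hd[1] != 3 || hd[2] != 0 || hd[4] != 0 {
		return fmt.Errorf("unexpected header %#x", hd)
	}
	if tl[0] != 0 || tl[1] != 1 || tl[2] != 0 {
		return fmt.Errorf("malformed end-of-data marker %#x", tl)
	}
	for len(body) > 0 {
		if len(body) < 2 || uintptr(len(body)) < 2+body[1] {
			return fmt.Errorf("truncated sample: %#x", body)
		}
		sample(body[0], body[2:2+body[1]])
		body = body[2+body[1]:]
	}
	return nil
}

func main() {
	// Synthetic stream: 100 Hz period (10000 us), one sample of count 1
	// with a single fake PC, then the end marker.
	words := []uintptr{0, 3, 0, 10000, 0, 1, 1, 0x1234, 0, 1, 0}
	err := decodeLegacyCPUProfile(words, func(count uintptr, stk []uintptr) {
		fmt.Printf("count=%d stack=%#x\n", count, stk)
	})
	if err != nil {
		fmt.Println(err)
	}
}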
+ tries := 10 + count := 1000000 + if testing.Short() { + tries = 1 + } + for try := 0; try < tries; try++ { + var prof bytes.Buffer + if err := StartCPUProfile(&prof); err != nil { + t.Fatal(err) + } + for i := 0; i < count; i++ { + runtime.Gosched() + } + StopCPUProfile() + + // Read profile to look for entries for runtime.gogo with an attempt at a traceback. + // The special entry + parseProfile(t, prof.Bytes(), func(count uintptr, stk []uintptr) { + // An entry with two frames with 'System' in its top frame + // exists to record a PC without a traceback. Those are okay. + if len(stk) == 2 { + f := runtime.FuncForPC(stk[1]) + if f != nil && (f.Name() == "System" || f.Name() == "ExternalCode" || f.Name() == "GC") { + return + } + } + + // Otherwise, should not see runtime.gogo. + // The place we'd see it would be the inner most frame. + f := runtime.FuncForPC(stk[0]) + if f != nil && f.Name() == "runtime.gogo" { + var buf bytes.Buffer + for _, pc := range stk { + f := runtime.FuncForPC(pc) + if f == nil { + fmt.Fprintf(&buf, "%#x ?:0\n", pc) + } else { + file, line := f.FileLine(pc) + fmt.Fprintf(&buf, "%#x %s:%d\n", pc, file, line) + } + } + t.Fatalf("found profile entry for runtime.gogo:\n%s", buf.String()) + } + }) + } +} + +// Test that profiling of division operations is okay, especially on ARM. See issue 6681. +func TestMathBigDivide(t *testing.T) { + testCPUProfile(t, nil, func() { + t := time.After(5 * time.Second) + pi := new(big.Int) + for { + for i := 0; i < 100; i++ { + n := big.NewInt(2646693125139304345) + d := big.NewInt(842468587426513207) + pi.Div(n, d) + } + select { + case <-t: + return + default: + } + } + }) +} + +// Operating systems that are expected to fail the tests. See issue 6047. +var badOS = map[string]bool{ + "darwin": true, + "netbsd": true, + "plan9": true, +} + +func TestBlockProfile(t *testing.T) { + type TestCase struct { + name string + f func() + re string + } + tests := [...]TestCase{ + {"chan recv", blockChanRecv, ` +[0-9]+ [0-9]+ @ 0x[0-9,a-f]+ 0x[0-9,a-f]+ 0x[0-9,a-f]+ 0x[0-9,a-f]+ 0x[0-9,a-f]+ +# 0x[0-9,a-f]+ runtime\.chanrecv1\+0x[0-9,a-f]+ .*/src/runtime/chan.go:[0-9]+ +# 0x[0-9,a-f]+ runtime/pprof_test\.blockChanRecv\+0x[0-9,a-f]+ .*/src/runtime/pprof/pprof_test.go:[0-9]+ +# 0x[0-9,a-f]+ runtime/pprof_test\.TestBlockProfile\+0x[0-9,a-f]+ .*/src/runtime/pprof/pprof_test.go:[0-9]+ +`}, + {"chan send", blockChanSend, ` +[0-9]+ [0-9]+ @ 0x[0-9,a-f]+ 0x[0-9,a-f]+ 0x[0-9,a-f]+ 0x[0-9,a-f]+ 0x[0-9,a-f]+ +# 0x[0-9,a-f]+ runtime\.chansend1\+0x[0-9,a-f]+ .*/src/runtime/chan.go:[0-9]+ +# 0x[0-9,a-f]+ runtime/pprof_test\.blockChanSend\+0x[0-9,a-f]+ .*/src/runtime/pprof/pprof_test.go:[0-9]+ +# 0x[0-9,a-f]+ runtime/pprof_test\.TestBlockProfile\+0x[0-9,a-f]+ .*/src/runtime/pprof/pprof_test.go:[0-9]+ +`}, + {"chan close", blockChanClose, ` +[0-9]+ [0-9]+ @ 0x[0-9,a-f]+ 0x[0-9,a-f]+ 0x[0-9,a-f]+ 0x[0-9,a-f]+ 0x[0-9,a-f]+ +# 0x[0-9,a-f]+ runtime\.chanrecv1\+0x[0-9,a-f]+ .*/src/runtime/chan.go:[0-9]+ +# 0x[0-9,a-f]+ runtime/pprof_test\.blockChanClose\+0x[0-9,a-f]+ .*/src/runtime/pprof/pprof_test.go:[0-9]+ +# 0x[0-9,a-f]+ runtime/pprof_test\.TestBlockProfile\+0x[0-9,a-f]+ .*/src/runtime/pprof/pprof_test.go:[0-9]+ +`}, + {"select recv async", blockSelectRecvAsync, ` +[0-9]+ [0-9]+ @ 0x[0-9,a-f]+ 0x[0-9,a-f]+ 0x[0-9,a-f]+ 0x[0-9,a-f]+ 0x[0-9,a-f]+ +# 0x[0-9,a-f]+ runtime\.selectgo\+0x[0-9,a-f]+ .*/src/runtime/select.go:[0-9]+ +# 0x[0-9,a-f]+ runtime/pprof_test\.blockSelectRecvAsync\+0x[0-9,a-f]+ .*/src/runtime/pprof/pprof_test.go:[0-9]+ +# 0x[0-9,a-f]+ 
runtime/pprof_test\.TestBlockProfile\+0x[0-9,a-f]+ .*/src/runtime/pprof/pprof_test.go:[0-9]+ +`}, + {"select send sync", blockSelectSendSync, ` +[0-9]+ [0-9]+ @ 0x[0-9,a-f]+ 0x[0-9,a-f]+ 0x[0-9,a-f]+ 0x[0-9,a-f]+ 0x[0-9,a-f]+ +# 0x[0-9,a-f]+ runtime\.selectgo\+0x[0-9,a-f]+ .*/src/runtime/select.go:[0-9]+ +# 0x[0-9,a-f]+ runtime/pprof_test\.blockSelectSendSync\+0x[0-9,a-f]+ .*/src/runtime/pprof/pprof_test.go:[0-9]+ +# 0x[0-9,a-f]+ runtime/pprof_test\.TestBlockProfile\+0x[0-9,a-f]+ .*/src/runtime/pprof/pprof_test.go:[0-9]+ +`}, + {"mutex", blockMutex, ` +[0-9]+ [0-9]+ @ 0x[0-9,a-f]+ 0x[0-9,a-f]+ 0x[0-9,a-f]+ 0x[0-9,a-f]+ 0x[0-9,a-f]+ +# 0x[0-9,a-f]+ sync\.\(\*Mutex\)\.Lock\+0x[0-9,a-f]+ .*/src/sync/mutex\.go:[0-9]+ +# 0x[0-9,a-f]+ runtime/pprof_test\.blockMutex\+0x[0-9,a-f]+ .*/src/runtime/pprof/pprof_test.go:[0-9]+ +# 0x[0-9,a-f]+ runtime/pprof_test\.TestBlockProfile\+0x[0-9,a-f]+ .*/src/runtime/pprof/pprof_test.go:[0-9]+ +`}, + {"cond", blockCond, ` +[0-9]+ [0-9]+ @ 0x[0-9,a-f]+ 0x[0-9,a-f]+ 0x[0-9,a-f]+ 0x[0-9,a-f]+ 0x[0-9,a-f]+ +# 0x[0-9,a-f]+ sync\.\(\*Cond\)\.Wait\+0x[0-9,a-f]+ .*/src/sync/cond\.go:[0-9]+ +# 0x[0-9,a-f]+ runtime/pprof_test\.blockCond\+0x[0-9,a-f]+ .*/src/runtime/pprof/pprof_test.go:[0-9]+ +# 0x[0-9,a-f]+ runtime/pprof_test\.TestBlockProfile\+0x[0-9,a-f]+ .*/src/runtime/pprof/pprof_test.go:[0-9]+ +`}, + } + + runtime.SetBlockProfileRate(1) + defer runtime.SetBlockProfileRate(0) + for _, test := range tests { + test.f() + } + var w bytes.Buffer + Lookup("block").WriteTo(&w, 1) + prof := w.String() + + if !strings.HasPrefix(prof, "--- contention:\ncycles/second=") { + t.Fatalf("Bad profile header:\n%v", prof) + } + + for _, test := range tests { + if !regexp.MustCompile(test.re).MatchString(prof) { + t.Fatalf("Bad %v entry, expect:\n%v\ngot:\n%v", test.name, test.re, prof) + } + } +} + +const blockDelay = 10 * time.Millisecond + +func blockChanRecv() { + c := make(chan bool) + go func() { + time.Sleep(blockDelay) + c <- true + }() + <-c +} + +func blockChanSend() { + c := make(chan bool) + go func() { + time.Sleep(blockDelay) + <-c + }() + c <- true +} + +func blockChanClose() { + c := make(chan bool) + go func() { + time.Sleep(blockDelay) + close(c) + }() + <-c +} + +func blockSelectRecvAsync() { + c := make(chan bool, 1) + c2 := make(chan bool, 1) + go func() { + time.Sleep(blockDelay) + c <- true + }() + select { + case <-c: + case <-c2: + } +} + +func blockSelectSendSync() { + c := make(chan bool) + c2 := make(chan bool) + go func() { + time.Sleep(blockDelay) + <-c + }() + select { + case c <- true: + case c2 <- true: + } +} + +func blockMutex() { + var mu sync.Mutex + mu.Lock() + go func() { + time.Sleep(blockDelay) + mu.Unlock() + }() + mu.Lock() +} + +func blockCond() { + var mu sync.Mutex + c := sync.NewCond(&mu) + mu.Lock() + go func() { + time.Sleep(blockDelay) + mu.Lock() + c.Signal() + mu.Unlock() + }() + c.Wait() + mu.Unlock() +} diff --git a/src/runtime/print1.go b/src/runtime/print1.go new file mode 100644 index 000000000..8f8268873 --- /dev/null +++ b/src/runtime/print1.go @@ -0,0 +1,323 @@ +// Copyright 2009 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package runtime + +import "unsafe" + +// The compiler knows that a print of a value of this type +// should use printhex instead of printuint (decimal). 
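TestBlockProfile above drives the contention profile end to end; outside the tests, the same two calls are all a program needs to record and dump it. A short sketch using the rate of 1 the test sets, which records every blocking event and is too costly for production use:

package main

import (
	"os"
	"runtime"
	"runtime/pprof"
)

func main() {
	runtime.SetBlockProfileRate(1) // record every blocking event (test setting)
	defer runtime.SetBlockProfileRate(0)

	// ... code that blocks on channels, mutexes, or condition variables ...

	// debug=1 prints the "--- contention:" text format matched by the
	// regular expressions in the test above.
	pprof.Lookup("block").WriteTo(os.Stderr, 1)
}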
+type hex uint64 + +func bytes(s string) (ret []byte) { + rp := (*slice)(unsafe.Pointer(&ret)) + sp := (*_string)(noescape(unsafe.Pointer(&s))) + rp.array = sp.str + rp.len = uint(sp.len) + rp.cap = uint(sp.len) + return +} + +// printf is only called from C code. It has no type information for the args, +// but C stacks are ignored by the garbage collector anyway, so having +// type information would not add anything. +//go:nosplit +func printf(s *byte) { + vprintf(gostringnocopy(s), add(unsafe.Pointer(&s), unsafe.Sizeof(s))) +} + +// sprintf is only called from C code. It has no type information for the args, +// but C stacks are ignored by the garbage collector anyway, so having +// type information would not add anything. +//go:nosplit +func snprintf(dst *byte, n int32, s *byte) { + buf := (*[1 << 30]byte)(unsafe.Pointer(dst))[0:n:n] + + gp := getg() + gp.writebuf = buf[0:0 : n-1] // leave room for NUL, this is called from C + vprintf(gostringnocopy(s), add(unsafe.Pointer(&s), unsafe.Sizeof(s))) + buf[len(gp.writebuf)] = '\x00' + gp.writebuf = nil +} + +//var debuglock mutex + +// write to goroutine-local buffer if diverting output, +// or else standard error. +func gwrite(b []byte) { + if len(b) == 0 { + return + } + gp := getg() + if gp == nil || gp.writebuf == nil { + write(2, unsafe.Pointer(&b[0]), int32(len(b))) + return + } + + n := copy(gp.writebuf[len(gp.writebuf):cap(gp.writebuf)], b) + gp.writebuf = gp.writebuf[:len(gp.writebuf)+n] +} + +func prints(s *byte) { + b := (*[1 << 30]byte)(unsafe.Pointer(s)) + for i := 0; ; i++ { + if b[i] == 0 { + gwrite(b[:i]) + return + } + } +} + +func printsp() { + print(" ") +} + +func printnl() { + print("\n") +} + +// Very simple printf. Only for debugging prints. +// Do not add to this without checking with Rob. 
+func vprintf(str string, arg unsafe.Pointer) { + //lock(&debuglock); + + s := bytes(str) + start := 0 + i := 0 + for ; i < len(s); i++ { + if s[i] != '%' { + continue + } + if i > start { + gwrite(s[start:i]) + } + if i++; i >= len(s) { + break + } + var siz uintptr + switch s[i] { + case 't', 'c': + siz = 1 + case 'd', 'x': // 32-bit + arg = roundup(arg, 4) + siz = 4 + case 'D', 'U', 'X', 'f': // 64-bit + arg = roundup(arg, unsafe.Sizeof(uintreg(0))) + siz = 8 + case 'C': + arg = roundup(arg, unsafe.Sizeof(uintreg(0))) + siz = 16 + case 'p', 's': // pointer-sized + arg = roundup(arg, unsafe.Sizeof(uintptr(0))) + siz = unsafe.Sizeof(uintptr(0)) + case 'S': // pointer-aligned but bigger + arg = roundup(arg, unsafe.Sizeof(uintptr(0))) + siz = unsafe.Sizeof(string("")) + case 'a': // pointer-aligned but bigger + arg = roundup(arg, unsafe.Sizeof(uintptr(0))) + siz = unsafe.Sizeof([]byte{}) + case 'i', 'e': // pointer-aligned but bigger + arg = roundup(arg, unsafe.Sizeof(uintptr(0))) + siz = unsafe.Sizeof(interface{}(nil)) + } + switch s[i] { + case 'a': + printslice(*(*[]byte)(arg)) + case 'c': + printbyte(*(*byte)(arg)) + case 'd': + printint(int64(*(*int32)(arg))) + case 'D': + printint(int64(*(*int64)(arg))) + case 'e': + printeface(*(*interface{})(arg)) + case 'f': + printfloat(*(*float64)(arg)) + case 'C': + printcomplex(*(*complex128)(arg)) + case 'i': + printiface(*(*fInterface)(arg)) + case 'p': + printpointer(*(*unsafe.Pointer)(arg)) + case 's': + prints(*(**byte)(arg)) + case 'S': + printstring(*(*string)(arg)) + case 't': + printbool(*(*bool)(arg)) + case 'U': + printuint(*(*uint64)(arg)) + case 'x': + printhex(uint64(*(*uint32)(arg))) + case 'X': + printhex(*(*uint64)(arg)) + } + arg = add(arg, siz) + start = i + 1 + } + if start < i { + gwrite(s[start:i]) + } + + //unlock(&debuglock); +} + +func printpc(p unsafe.Pointer) { + print("PC=", hex(uintptr(p))) +} + +func printbool(v bool) { + if v { + print("true") + } else { + print("false") + } +} + +func printbyte(c byte) { + gwrite((*[1]byte)(unsafe.Pointer(&c))[:]) +} + +func printfloat(v float64) { + switch { + case v != v: + print("NaN") + return + case v+v == v && v > 0: + print("+Inf") + return + case v+v == v && v < 0: + print("-Inf") + return + } + + const n = 7 // digits printed + var buf [n + 7]byte + buf[0] = '+' + e := 0 // exp + if v == 0 { + if 1/v < 0 { + buf[0] = '-' + } + } else { + if v < 0 { + v = -v + buf[0] = '-' + } + + // normalize + for v >= 10 { + e++ + v /= 10 + } + for v < 1 { + e-- + v *= 10 + } + + // round + h := 5.0 + for i := 0; i < n; i++ { + h /= 10 + } + v += h + if v >= 10 { + e++ + v /= 10 + } + } + + // format +d.dddd+edd + for i := 0; i < n; i++ { + s := int(v) + buf[i+2] = byte(s + '0') + v -= float64(s) + v *= 10 + } + buf[1] = buf[2] + buf[2] = '.' 
+ + buf[n+2] = 'e' + buf[n+3] = '+' + if e < 0 { + e = -e + buf[n+3] = '-' + } + + buf[n+4] = byte(e/100) + '0' + buf[n+5] = byte(e/10)%10 + '0' + buf[n+6] = byte(e%10) + '0' + gwrite(buf[:]) +} + +func printcomplex(c complex128) { + print("(", real(c), imag(c), "i)") +} + +func printuint(v uint64) { + var buf [100]byte + i := len(buf) + for i--; i > 0; i-- { + buf[i] = byte(v%10 + '0') + if v < 10 { + break + } + v /= 10 + } + gwrite(buf[i:]) +} + +func printint(v int64) { + if v < 0 { + print("-") + v = -v + } + printuint(uint64(v)) +} + +func printhex(v uint64) { + const dig = "0123456789abcdef" + var buf [100]byte + i := len(buf) + for i--; i > 0; i-- { + buf[i] = dig[v%16] + if v < 16 { + break + } + v /= 16 + } + i-- + buf[i] = 'x' + i-- + buf[i] = '0' + gwrite(buf[i:]) +} + +func printpointer(p unsafe.Pointer) { + printhex(uint64(uintptr(p))) +} + +func printstring(s string) { + if uintptr(len(s)) > maxstring { + gwrite(bytes("[string too long]")) + return + } + gwrite(bytes(s)) +} + +func printslice(s []byte) { + sp := (*slice)(unsafe.Pointer(&s)) + print("[", len(s), "/", cap(s), "]") + printpointer(unsafe.Pointer(sp.array)) +} + +func printeface(e interface{}) { + ep := (*eface)(unsafe.Pointer(&e)) + print("(", ep._type, ",", ep.data, ")") +} + +func printiface(i fInterface) { + ip := (*iface)(unsafe.Pointer(&i)) + print("(", ip.tab, ",", ip.data, ")") +} diff --git a/src/runtime/proc.c b/src/runtime/proc.c new file mode 100644 index 000000000..8462c4b1d --- /dev/null +++ b/src/runtime/proc.c @@ -0,0 +1,3521 @@ +// Copyright 2009 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#include "runtime.h" +#include "arch_GOARCH.h" +#include "zaexperiment.h" +#include "malloc.h" +#include "stack.h" +#include "race.h" +#include "type.h" +#include "mgc0.h" +#include "textflag.h" + +// Goroutine scheduler +// The scheduler's job is to distribute ready-to-run goroutines over worker threads. +// +// The main concepts are: +// G - goroutine. +// M - worker thread, or machine. +// P - processor, a resource that is required to execute Go code. +// M must have an associated P to execute Go code, however it can be +// blocked or in a syscall w/o an associated P. +// +// Design doc at http://golang.org/s/go11sched. + +enum +{ + // Number of goroutine ids to grab from runtime·sched.goidgen to local per-P cache at once. + // 16 seems to provide enough amortization, but other than that it's mostly arbitrary number. 
+ GoidCacheBatch = 16, +}; + +SchedT runtime·sched; +int32 runtime·gomaxprocs; +uint32 runtime·needextram; +bool runtime·iscgo; +M runtime·m0; +G runtime·g0; // idle goroutine for m0 +G* runtime·lastg; +M* runtime·allm; +M* runtime·extram; +P* runtime·allp[MaxGomaxprocs+1]; +int8* runtime·goos; +int32 runtime·ncpu; +int32 runtime·newprocs; + +Mutex runtime·allglock; // the following vars are protected by this lock or by stoptheworld +G** runtime·allg; +Slice runtime·allgs; +uintptr runtime·allglen; +ForceGCState runtime·forcegc; + +void runtime·mstart(void); +static void runqput(P*, G*); +static G* runqget(P*); +static bool runqputslow(P*, G*, uint32, uint32); +static G* runqsteal(P*, P*); +static void mput(M*); +static M* mget(void); +static void mcommoninit(M*); +static void schedule(void); +static void procresize(int32); +static void acquirep(P*); +static P* releasep(void); +static void newm(void(*)(void), P*); +static void stopm(void); +static void startm(P*, bool); +static void handoffp(P*); +static void wakep(void); +static void stoplockedm(void); +static void startlockedm(G*); +static void sysmon(void); +static uint32 retake(int64); +static void incidlelocked(int32); +static void checkdead(void); +static void exitsyscall0(G*); +void runtime·park_m(G*); +static void goexit0(G*); +static void gfput(P*, G*); +static G* gfget(P*); +static void gfpurge(P*); +static void globrunqput(G*); +static void globrunqputbatch(G*, G*, int32); +static G* globrunqget(P*, int32); +static P* pidleget(void); +static void pidleput(P*); +static void injectglist(G*); +static bool preemptall(void); +static bool preemptone(P*); +static bool exitsyscallfast(void); +static bool haveexperiment(int8*); +void runtime·allgadd(G*); +static void dropg(void); + +extern String runtime·buildVersion; + +// For cgo-using programs with external linking, +// export "main" (defined in assembly) so that libc can handle basic +// C runtime startup and call the Go program as if it were +// the C main function. +#pragma cgo_export_static main + +// Filled in by dynamic linker when Cgo is available. +void (*_cgo_init)(void); +void (*_cgo_malloc)(void); +void (*_cgo_free)(void); + +// Copy for Go code. +void* runtime·cgoMalloc; +void* runtime·cgoFree; + +// The bootstrap sequence is: +// +// call osinit +// call schedinit +// make & queue new G +// call runtime·mstart +// +// The new G calls runtime·main. +void +runtime·schedinit(void) +{ + int32 n, procs; + byte *p; + + // raceinit must be the first call to race detector. + // In particular, it must be done before mallocinit below calls racemapshadow. + if(raceenabled) + g->racectx = runtime·raceinit(); + + runtime·sched.maxmcount = 10000; + + runtime·tracebackinit(); + runtime·symtabinit(); + runtime·stackinit(); + runtime·mallocinit(); + mcommoninit(g->m); + + runtime·goargs(); + runtime·goenvs(); + runtime·parsedebugvars(); + runtime·gcinit(); + + runtime·sched.lastpoll = runtime·nanotime(); + procs = 1; + p = runtime·getenv("GOMAXPROCS"); + if(p != nil && (n = runtime·atoi(p)) > 0) { + if(n > MaxGomaxprocs) + n = MaxGomaxprocs; + procs = n; + } + procresize(procs); + + if(runtime·buildVersion.str == nil) { + // Condition should never trigger. This code just serves + // to ensure runtime·buildVersion is kept in the resulting binary. 
+ runtime·buildVersion.str = (uint8*)"unknown"; + runtime·buildVersion.len = 7; + } + + runtime·cgoMalloc = _cgo_malloc; + runtime·cgoFree = _cgo_free; +} + +void +runtime·newsysmon(void) +{ + newm(sysmon, nil); +} + +static void +dumpgstatus(G* gp) +{ + runtime·printf("runtime: gp: gp=%p, goid=%D, gp->atomicstatus=%x\n", gp, gp->goid, runtime·readgstatus(gp)); + runtime·printf("runtime: g: g=%p, goid=%D, g->atomicstatus=%x\n", g, g->goid, runtime·readgstatus(g)); +} + +static void +checkmcount(void) +{ + // sched lock is held + if(runtime·sched.mcount > runtime·sched.maxmcount){ + runtime·printf("runtime: program exceeds %d-thread limit\n", runtime·sched.maxmcount); + runtime·throw("thread exhaustion"); + } +} + +static void +mcommoninit(M *mp) +{ + // g0 stack won't make sense for user (and is not necessary unwindable). + if(g != g->m->g0) + runtime·callers(1, mp->createstack, nelem(mp->createstack)); + + mp->fastrand = 0x49f6428aUL + mp->id + runtime·cputicks(); + + runtime·lock(&runtime·sched.lock); + mp->id = runtime·sched.mcount++; + checkmcount(); + runtime·mpreinit(mp); + if(mp->gsignal) + mp->gsignal->stackguard1 = mp->gsignal->stack.lo + StackGuard; + + // Add to runtime·allm so garbage collector doesn't free g->m + // when it is just in a register or thread-local storage. + mp->alllink = runtime·allm; + // runtime·NumCgoCall() iterates over allm w/o schedlock, + // so we need to publish it safely. + runtime·atomicstorep(&runtime·allm, mp); + runtime·unlock(&runtime·sched.lock); +} + +// Mark gp ready to run. +void +runtime·ready(G *gp) +{ + uint32 status; + + status = runtime·readgstatus(gp); + // Mark runnable. + g->m->locks++; // disable preemption because it can be holding p in a local var + if((status&~Gscan) != Gwaiting){ + dumpgstatus(gp); + runtime·throw("bad g->status in ready"); + } + // status is Gwaiting or Gscanwaiting, make Grunnable and put on runq + runtime·casgstatus(gp, Gwaiting, Grunnable); + runqput(g->m->p, gp); + if(runtime·atomicload(&runtime·sched.npidle) != 0 && runtime·atomicload(&runtime·sched.nmspinning) == 0) // TODO: fast atomic + wakep(); + g->m->locks--; + if(g->m->locks == 0 && g->preempt) // restore the preemption request in case we've cleared it in newstack + g->stackguard0 = StackPreempt; +} + +void +runtime·ready_m(void) +{ + G *gp; + + gp = g->m->ptrarg[0]; + g->m->ptrarg[0] = nil; + runtime·ready(gp); +} + +int32 +runtime·gcprocs(void) +{ + int32 n; + + // Figure out how many CPUs to use during GC. + // Limited by gomaxprocs, number of actual CPUs, and MaxGcproc. 
+ runtime·lock(&runtime·sched.lock); + n = runtime·gomaxprocs; + if(n > runtime·ncpu) + n = runtime·ncpu; + if(n > MaxGcproc) + n = MaxGcproc; + if(n > runtime·sched.nmidle+1) // one M is currently running + n = runtime·sched.nmidle+1; + runtime·unlock(&runtime·sched.lock); + return n; +} + +static bool +needaddgcproc(void) +{ + int32 n; + + runtime·lock(&runtime·sched.lock); + n = runtime·gomaxprocs; + if(n > runtime·ncpu) + n = runtime·ncpu; + if(n > MaxGcproc) + n = MaxGcproc; + n -= runtime·sched.nmidle+1; // one M is currently running + runtime·unlock(&runtime·sched.lock); + return n > 0; +} + +void +runtime·helpgc(int32 nproc) +{ + M *mp; + int32 n, pos; + + runtime·lock(&runtime·sched.lock); + pos = 0; + for(n = 1; n < nproc; n++) { // one M is currently running + if(runtime·allp[pos]->mcache == g->m->mcache) + pos++; + mp = mget(); + if(mp == nil) + runtime·throw("runtime·gcprocs inconsistency"); + mp->helpgc = n; + mp->mcache = runtime·allp[pos]->mcache; + pos++; + runtime·notewakeup(&mp->park); + } + runtime·unlock(&runtime·sched.lock); +} + +// Similar to stoptheworld but best-effort and can be called several times. +// There is no reverse operation, used during crashing. +// This function must not lock any mutexes. +void +runtime·freezetheworld(void) +{ + int32 i; + + if(runtime·gomaxprocs == 1) + return; + // stopwait and preemption requests can be lost + // due to races with concurrently executing threads, + // so try several times + for(i = 0; i < 5; i++) { + // this should tell the scheduler to not start any new goroutines + runtime·sched.stopwait = 0x7fffffff; + runtime·atomicstore((uint32*)&runtime·sched.gcwaiting, 1); + // this should stop running goroutines + if(!preemptall()) + break; // no running goroutines + runtime·usleep(1000); + } + // to be sure + runtime·usleep(1000); + preemptall(); + runtime·usleep(1000); +} + +static bool +isscanstatus(uint32 status) +{ + if(status == Gscan) + runtime·throw("isscanstatus: Bad status Gscan"); + return (status&Gscan) == Gscan; +} + +// All reads and writes of g's status go through readgstatus, casgstatus +// castogscanstatus, casfromgscanstatus. +#pragma textflag NOSPLIT +uint32 +runtime·readgstatus(G *gp) +{ + return runtime·atomicload(&gp->atomicstatus); +} + +// The Gscanstatuses are acting like locks and this releases them. +// If it proves to be a performance hit we should be able to make these +// simple atomic stores but for now we are going to throw if +// we see an inconsistent state. +void +runtime·casfromgscanstatus(G *gp, uint32 oldval, uint32 newval) +{ + bool success = false; + + // Check that transition is valid. + switch(oldval) { + case Gscanrunnable: + case Gscanwaiting: + case Gscanrunning: + case Gscansyscall: + if(newval == (oldval&~Gscan)) + success = runtime·cas(&gp->atomicstatus, oldval, newval); + break; + case Gscanenqueue: + if(newval == Gwaiting) + success = runtime·cas(&gp->atomicstatus, oldval, newval); + break; + } + if(!success){ + runtime·printf("runtime: casfromgscanstatus failed gp=%p, oldval=%d, newval=%d\n", + gp, oldval, newval); + dumpgstatus(gp); + runtime·throw("casfromgscanstatus: gp->status is not in scan state"); + } +} + +// This will return false if the gp is not in the expected status and the cas fails. +// This acts like a lock acquire while the casfromgstatus acts like a lock release. 
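The Gscan bit described above behaves like a per-goroutine lock: castogscanstatus is the acquire and casfromgscanstatus the release. A stand-alone sketch of the same bit-as-lock idea with sync/atomic, where the bit value and the status constant are illustrative rather than the runtime's real values:

package main

import "sync/atomic"

const scanBit = 0x1000 // illustrative stand-in for the Gscan bit

// tryClaim is the acquire side: it succeeds only if the status still holds
// the expected value and the scan bit is not yet set.
func tryClaim(status *uint32, old uint32) bool {
	return atomic.CompareAndSwapUint32(status, old, old|scanBit)
}

// release is the matching release side: it clears the scan bit, restoring
// the pre-claim status, and must only be called by the claimer.
func release(status *uint32, old uint32) {
	if !atomic.CompareAndSwapUint32(status, old|scanBit, old) {
		panic("release: status changed while claimed")
	}
}

func main() {
	var status uint32 = 4 // some runnable-like state, illustrative
	if tryClaim(&status, 4) {
		// ... scan work happens here while other claimers spin or back off ...
		release(&status, 4)
	}
}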
+bool +runtime·castogscanstatus(G *gp, uint32 oldval, uint32 newval) +{ + switch(oldval) { + case Grunnable: + case Gwaiting: + case Gsyscall: + if(newval == (oldval|Gscan)) + return runtime·cas(&gp->atomicstatus, oldval, newval); + break; + case Grunning: + if(newval == Gscanrunning || newval == Gscanenqueue) + return runtime·cas(&gp->atomicstatus, oldval, newval); + break; + } + + runtime·printf("runtime: castogscanstatus oldval=%d newval=%d\n", oldval, newval); + runtime·throw("castogscanstatus"); + return false; // not reached +} + +static void badcasgstatus(void); +static void helpcasgstatus(void); +static void badgstatusrunnable(void); + +// If asked to move to or from a Gscanstatus this will throw. Use the castogscanstatus +// and casfromgscanstatus instead. +// casgstatus will loop if the g->atomicstatus is in a Gscan status until the routine that +// put it in the Gscan state is finished. +#pragma textflag NOSPLIT +void +runtime·casgstatus(G *gp, uint32 oldval, uint32 newval) +{ + void (*fn)(void); + + if((oldval&Gscan) || (newval&Gscan) || oldval == newval) { + g->m->scalararg[0] = oldval; + g->m->scalararg[1] = newval; + fn = badcasgstatus; + runtime·onM(&fn); + } + + // loop if gp->atomicstatus is in a scan state giving + // GC time to finish and change the state to oldval. + while(!runtime·cas(&gp->atomicstatus, oldval, newval)) { + if(oldval == Gwaiting && gp->atomicstatus == Grunnable) { + fn = badgstatusrunnable; + runtime·onM(&fn); + } + // Help GC if needed. + if(gp->preemptscan && !gp->gcworkdone && (oldval == Grunning || oldval == Gsyscall)) { + gp->preemptscan = false; + g->m->ptrarg[0] = gp; + fn = helpcasgstatus; + runtime·onM(&fn); + } + } +} + +static void +badgstatusrunnable(void) +{ + runtime·throw("casgstatus: waiting for Gwaiting but is Grunnable"); +} + +// casgstatus(gp, oldstatus, Gcopystack), assuming oldstatus is Gwaiting or Grunnable. +// Returns old status. Cannot call casgstatus directly, because we are racing with an +// async wakeup that might come in from netpoll. If we see Gwaiting from the readgstatus, +// it might have become Grunnable by the time we get to the cas. If we called casgstatus, +// it would loop waiting for the status to go back to Gwaiting, which it never will. +#pragma textflag NOSPLIT +uint32 +runtime·casgcopystack(G *gp) +{ + uint32 oldstatus; + + for(;;) { + oldstatus = runtime·readgstatus(gp) & ~Gscan; + if(oldstatus != Gwaiting && oldstatus != Grunnable) + runtime·throw("copystack: bad status, not Gwaiting or Grunnable"); + if(runtime·cas(&gp->atomicstatus, oldstatus, Gcopystack)) + break; + } + return oldstatus; +} + +static void +badcasgstatus(void) +{ + uint32 oldval, newval; + + oldval = g->m->scalararg[0]; + newval = g->m->scalararg[1]; + g->m->scalararg[0] = 0; + g->m->scalararg[1] = 0; + + runtime·printf("casgstatus: oldval=%d, newval=%d\n", oldval, newval); + runtime·throw("casgstatus: bad incoming values"); +} + +static void +helpcasgstatus(void) +{ + G *gp; + + gp = g->m->ptrarg[0]; + g->m->ptrarg[0] = 0; + runtime·gcphasework(gp); +} + +// stopg ensures that gp is stopped at a GC safe point where its stack can be scanned +// or in the context of a moving collector the pointers can be flipped from pointing +// to old object to pointing to new objects. +// If stopg returns true, the caller knows gp is at a GC safe point and will remain there until +// the caller calls restartg. +// If stopg returns false, the caller is not responsible for calling restartg. 
This can happen +// if another thread, either the gp itself or another GC thread is taking the responsibility +// to do the GC work related to this thread. +bool +runtime·stopg(G *gp) +{ + uint32 s; + + for(;;) { + if(gp->gcworkdone) + return false; + + s = runtime·readgstatus(gp); + switch(s) { + default: + dumpgstatus(gp); + runtime·throw("stopg: gp->atomicstatus is not valid"); + + case Gdead: + return false; + + case Gcopystack: + // Loop until a new stack is in place. + break; + + case Grunnable: + case Gsyscall: + case Gwaiting: + // Claim goroutine by setting scan bit. + if(!runtime·castogscanstatus(gp, s, s|Gscan)) + break; + // In scan state, do work. + runtime·gcphasework(gp); + return true; + + case Gscanrunnable: + case Gscanwaiting: + case Gscansyscall: + // Goroutine already claimed by another GC helper. + return false; + + case Grunning: + // Claim goroutine, so we aren't racing with a status + // transition away from Grunning. + if(!runtime·castogscanstatus(gp, Grunning, Gscanrunning)) + break; + + // Mark gp for preemption. + if(!gp->gcworkdone) { + gp->preemptscan = true; + gp->preempt = true; + gp->stackguard0 = StackPreempt; + } + + // Unclaim. + runtime·casfromgscanstatus(gp, Gscanrunning, Grunning); + return false; + } + } + // Should not be here.... +} + +// The GC requests that this routine be moved from a scanmumble state to a mumble state. +void +runtime·restartg (G *gp) +{ + uint32 s; + + s = runtime·readgstatus(gp); + switch(s) { + default: + dumpgstatus(gp); + runtime·throw("restartg: unexpected status"); + + case Gdead: + break; + + case Gscanrunnable: + case Gscanwaiting: + case Gscansyscall: + runtime·casfromgscanstatus(gp, s, s&~Gscan); + break; + + case Gscanenqueue: + // Scan is now completed. + // Goroutine now needs to be made runnable. + // We put it on the global run queue; ready blocks on the global scheduler lock. + runtime·casfromgscanstatus(gp, Gscanenqueue, Gwaiting); + if(gp != g->m->curg) + runtime·throw("processing Gscanenqueue on wrong m"); + dropg(); + runtime·ready(gp); + break; + } +} + +static void +stopscanstart(G* gp) +{ + if(g == gp) + runtime·throw("GC not moved to G0"); + if(runtime·stopg(gp)) { + if(!isscanstatus(runtime·readgstatus(gp))) { + dumpgstatus(gp); + runtime·throw("GC not in scan state"); + } + runtime·restartg(gp); + } +} + +// Runs on g0 and does the actual work after putting the g back on the run queue. +static void +mquiesce(G *gpmaster) +{ + G* gp; + uint32 i; + uint32 status; + uint32 activeglen; + + activeglen = runtime·allglen; + // enqueue the calling goroutine. + runtime·restartg(gpmaster); + for(i = 0; i < activeglen; i++) { + gp = runtime·allg[i]; + if(runtime·readgstatus(gp) == Gdead) + gp->gcworkdone = true; // noop scan. + else + gp->gcworkdone = false; + stopscanstart(gp); + } + + // Check that the G's gcwork (such as scanning) has been done. If not do it now. + // You can end up doing work here if the page trap on a Grunning Goroutine has + // not been sprung or in some race situations. For example a runnable goes dead + // and is started up again with a gp->gcworkdone set to false. + for(i = 0; i < activeglen; i++) { + gp = runtime·allg[i]; + while (!gp->gcworkdone) { + status = runtime·readgstatus(gp); + if(status == Gdead) { + gp->gcworkdone = true; // scan is a noop + break; + //do nothing, scan not needed. 
+ } + if(status == Grunning && gp->stackguard0 == (uintptr)StackPreempt && runtime·notetsleep(&runtime·sched.stopnote, 100*1000)) // nanosecond arg + runtime·noteclear(&runtime·sched.stopnote); + else + stopscanstart(gp); + } + } + + for(i = 0; i < activeglen; i++) { + gp = runtime·allg[i]; + status = runtime·readgstatus(gp); + if(isscanstatus(status)) { + runtime·printf("mstopandscang:bottom: post scan bad status gp=%p has status %x\n", gp, status); + dumpgstatus(gp); + } + if(!gp->gcworkdone && status != Gdead) { + runtime·printf("mstopandscang:bottom: post scan gp=%p->gcworkdone still false\n", gp); + dumpgstatus(gp); + } + } + + schedule(); // Never returns. +} + +// quiesce moves all the goroutines to a GC safepoint which for now is a at preemption point. +// If the global runtime·gcphase is GCmark quiesce will ensure that all of the goroutine's stacks +// have been scanned before it returns. +void +runtime·quiesce(G* mastergp) +{ + void (*fn)(G*); + + runtime·castogscanstatus(mastergp, Grunning, Gscanenqueue); + // Now move this to the g0 (aka m) stack. + // g0 will potentially scan this thread and put mastergp on the runqueue + fn = mquiesce; + runtime·mcall(&fn); +} + +// This is used by the GC as well as the routines that do stack dumps. In the case +// of GC all the routines can be reliably stopped. This is not always the case +// when the system is in panic or being exited. +void +runtime·stoptheworld(void) +{ + int32 i; + uint32 s; + P *p; + bool wait; + + // If we hold a lock, then we won't be able to stop another M + // that is blocked trying to acquire the lock. + if(g->m->locks > 0) + runtime·throw("stoptheworld: holding locks"); + + runtime·lock(&runtime·sched.lock); + runtime·sched.stopwait = runtime·gomaxprocs; + runtime·atomicstore((uint32*)&runtime·sched.gcwaiting, 1); + preemptall(); + // stop current P + g->m->p->status = Pgcstop; // Pgcstop is only diagnostic. + runtime·sched.stopwait--; + // try to retake all P's in Psyscall status + for(i = 0; i < runtime·gomaxprocs; i++) { + p = runtime·allp[i]; + s = p->status; + if(s == Psyscall && runtime·cas(&p->status, s, Pgcstop)) + runtime·sched.stopwait--; + } + // stop idle P's + while(p = pidleget()) { + p->status = Pgcstop; + runtime·sched.stopwait--; + } + wait = runtime·sched.stopwait > 0; + runtime·unlock(&runtime·sched.lock); + + // wait for remaining P's to stop voluntarily + if(wait) { + for(;;) { + // wait for 100us, then try to re-preempt in case of any races + if(runtime·notetsleep(&runtime·sched.stopnote, 100*1000)) { + runtime·noteclear(&runtime·sched.stopnote); + break; + } + preemptall(); + } + } + if(runtime·sched.stopwait) + runtime·throw("stoptheworld: not stopped"); + for(i = 0; i < runtime·gomaxprocs; i++) { + p = runtime·allp[i]; + if(p->status != Pgcstop) + runtime·throw("stoptheworld: not stopped"); + } +} + +static void +mhelpgc(void) +{ + g->m->helpgc = -1; +} + +void +runtime·starttheworld(void) +{ + P *p, *p1; + M *mp; + G *gp; + bool add; + + g->m->locks++; // disable preemption because it can be holding p in a local var + gp = runtime·netpoll(false); // non-blocking + injectglist(gp); + add = needaddgcproc(); + runtime·lock(&runtime·sched.lock); + if(runtime·newprocs) { + procresize(runtime·newprocs); + runtime·newprocs = 0; + } else + procresize(runtime·gomaxprocs); + runtime·sched.gcwaiting = 0; + + p1 = nil; + while(p = pidleget()) { + // procresize() puts p's with work at the beginning of the list. + // Once we reach a p without a run queue, the rest don't have one either. 
+ if(p->runqhead == p->runqtail) { + pidleput(p); + break; + } + p->m = mget(); + p->link = p1; + p1 = p; + } + if(runtime·sched.sysmonwait) { + runtime·sched.sysmonwait = false; + runtime·notewakeup(&runtime·sched.sysmonnote); + } + runtime·unlock(&runtime·sched.lock); + + while(p1) { + p = p1; + p1 = p1->link; + if(p->m) { + mp = p->m; + p->m = nil; + if(mp->nextp) + runtime·throw("starttheworld: inconsistent mp->nextp"); + mp->nextp = p; + runtime·notewakeup(&mp->park); + } else { + // Start M to run P. Do not start another M below. + newm(nil, p); + add = false; + } + } + + if(add) { + // If GC could have used another helper proc, start one now, + // in the hope that it will be available next time. + // It would have been even better to start it before the collection, + // but doing so requires allocating memory, so it's tricky to + // coordinate. This lazy approach works out in practice: + // we don't mind if the first couple gc rounds don't have quite + // the maximum number of procs. + newm(mhelpgc, nil); + } + g->m->locks--; + if(g->m->locks == 0 && g->preempt) // restore the preemption request in case we've cleared it in newstack + g->stackguard0 = StackPreempt; +} + +static void mstart(void); + +// Called to start an M. +#pragma textflag NOSPLIT +void +runtime·mstart(void) +{ + uintptr x, size; + + if(g->stack.lo == 0) { + // Initialize stack bounds from system stack. + // Cgo may have left stack size in stack.hi. + size = g->stack.hi; + if(size == 0) + size = 8192; + g->stack.hi = (uintptr)&x; + g->stack.lo = g->stack.hi - size + 1024; + } + + // Initialize stack guards so that we can start calling + // both Go and C functions with stack growth prologues. + g->stackguard0 = g->stack.lo + StackGuard; + g->stackguard1 = g->stackguard0; + mstart(); +} + +static void +mstart(void) +{ + if(g != g->m->g0) + runtime·throw("bad runtime·mstart"); + + // Record top of stack for use by mcall. + // Once we call schedule we're never coming back, + // so other calls can reuse this stack space. + runtime·gosave(&g->m->g0->sched); + g->m->g0->sched.pc = (uintptr)-1; // make sure it is never used + runtime·asminit(); + runtime·minit(); + + // Install signal handlers; after minit so that minit can + // prepare the thread to be able to handle the signals. + if(g->m == &runtime·m0) + runtime·initsig(); + + if(g->m->mstartfn) + g->m->mstartfn(); + + if(g->m->helpgc) { + g->m->helpgc = 0; + stopm(); + } else if(g->m != &runtime·m0) { + acquirep(g->m->nextp); + g->m->nextp = nil; + } + schedule(); + + // TODO(brainman): This point is never reached, because scheduler + // does not release os threads at the moment. But once this path + // is enabled, we must remove our seh here. +} + +// When running with cgo, we call _cgo_thread_start +// to start threads for us so that we can play nicely with +// foreign code. +void (*_cgo_thread_start)(void*); + +typedef struct CgoThreadStart CgoThreadStart; +struct CgoThreadStart +{ + G *g; + uintptr *tls; + void (*fn)(void); +}; + +M *runtime·newM(void); // in proc.go + +// Allocate a new m unassociated with any thread. +// Can use p for allocation context if needed. +M* +runtime·allocm(P *p) +{ + M *mp; + + g->m->locks++; // disable GC because it can be called from sysmon + if(g->m->p == nil) + acquirep(p); // temporarily borrow p for mallocs in this function + mp = runtime·newM(); + mcommoninit(mp); + + // In case of cgo or Solaris, pthread_create will make us a stack. + // Windows and Plan 9 will layout sched stack on OS stack. 
+ if(runtime·iscgo || Solaris || Windows || Plan9) + mp->g0 = runtime·malg(-1); + else + mp->g0 = runtime·malg(8192); + mp->g0->m = mp; + + if(p == g->m->p) + releasep(); + g->m->locks--; + if(g->m->locks == 0 && g->preempt) // restore the preemption request in case we've cleared it in newstack + g->stackguard0 = StackPreempt; + + return mp; +} + +G *runtime·newG(void); // in proc.go + +static G* +allocg(void) +{ + return runtime·newG(); +} + +static M* lockextra(bool nilokay); +static void unlockextra(M*); + +// needm is called when a cgo callback happens on a +// thread without an m (a thread not created by Go). +// In this case, needm is expected to find an m to use +// and return with m, g initialized correctly. +// Since m and g are not set now (likely nil, but see below) +// needm is limited in what routines it can call. In particular +// it can only call nosplit functions (textflag 7) and cannot +// do any scheduling that requires an m. +// +// In order to avoid needing heavy lifting here, we adopt +// the following strategy: there is a stack of available m's +// that can be stolen. Using compare-and-swap +// to pop from the stack has ABA races, so we simulate +// a lock by doing an exchange (via casp) to steal the stack +// head and replace the top pointer with MLOCKED (1). +// This serves as a simple spin lock that we can use even +// without an m. The thread that locks the stack in this way +// unlocks the stack by storing a valid stack head pointer. +// +// In order to make sure that there is always an m structure +// available to be stolen, we maintain the invariant that there +// is always one more than needed. At the beginning of the +// program (if cgo is in use) the list is seeded with a single m. +// If needm finds that it has taken the last m off the list, its job +// is - once it has installed its own m so that it can do things like +// allocate memory - to create a spare m and put it on the list. +// +// Each of these extra m's also has a g0 and a curg that are +// pressed into service as the scheduling stack and current +// goroutine for the duration of the cgo callback. +// +// When the callback is done with the m, it calls dropm to +// put the m back on the list. +#pragma textflag NOSPLIT +void +runtime·needm(byte x) +{ + M *mp; + + if(runtime·needextram) { + // Can happen if C/C++ code calls Go from a global ctor. + // Can not throw, because scheduler is not initialized yet. + runtime·write(2, "fatal error: cgo callback before cgo call\n", + sizeof("fatal error: cgo callback before cgo call\n")-1); + runtime·exit(1); + } + + // Lock extra list, take head, unlock popped list. + // nilokay=false is safe here because of the invariant above, + // that the extra list always contains or will soon contain + // at least one m. + mp = lockextra(false); + + // Set needextram when we've just emptied the list, + // so that the eventual call into cgocallbackg will + // allocate a new m for the extra list. We delay the + // allocation until then so that it can be done + // after exitsyscall makes sure it is okay to be + // running at all (that is, there's no garbage collection + // running right now). + mp->needextram = mp->schedlink == nil; + unlockextra(mp->schedlink); + + // Install g (= m->g0) and set the stack bounds + // to match the current stack. We don't actually know + // how big the stack is, like we don't know how big any + // scheduling stack is, but we assume there's at least 32 kB, + // which is more than enough for us. 
+ runtime·setg(mp->g0); + g->stack.hi = (uintptr)(&x + 1024); + g->stack.lo = (uintptr)(&x - 32*1024); + g->stackguard0 = g->stack.lo + StackGuard; + + // Initialize this thread to use the m. + runtime·asminit(); + runtime·minit(); +} + +// newextram allocates an m and puts it on the extra list. +// It is called with a working local m, so that it can do things +// like call schedlock and allocate. +void +runtime·newextram(void) +{ + M *mp, *mnext; + G *gp; + + // Create extra goroutine locked to extra m. + // The goroutine is the context in which the cgo callback will run. + // The sched.pc will never be returned to, but setting it to + // runtime.goexit makes clear to the traceback routines where + // the goroutine stack ends. + mp = runtime·allocm(nil); + gp = runtime·malg(4096); + gp->sched.pc = (uintptr)runtime·goexit + PCQuantum; + gp->sched.sp = gp->stack.hi; + gp->sched.sp -= 4*sizeof(uintreg); // extra space in case of reads slightly beyond frame + gp->sched.lr = 0; + gp->sched.g = gp; + gp->syscallpc = gp->sched.pc; + gp->syscallsp = gp->sched.sp; + // malg returns status as Gidle, change to Gsyscall before adding to allg + // where GC will see it. + runtime·casgstatus(gp, Gidle, Gsyscall); + gp->m = mp; + mp->curg = gp; + mp->locked = LockInternal; + mp->lockedg = gp; + gp->lockedm = mp; + gp->goid = runtime·xadd64(&runtime·sched.goidgen, 1); + if(raceenabled) + gp->racectx = runtime·racegostart(runtime·newextram); + // put on allg for garbage collector + runtime·allgadd(gp); + + // Add m to the extra list. + mnext = lockextra(true); + mp->schedlink = mnext; + unlockextra(mp); +} + +// dropm is called when a cgo callback has called needm but is now +// done with the callback and returning back into the non-Go thread. +// It puts the current m back onto the extra list. +// +// The main expense here is the call to signalstack to release the +// m's signal stack, and then the call to needm on the next callback +// from this thread. It is tempting to try to save the m for next time, +// which would eliminate both these costs, but there might not be +// a next time: the current thread (which Go does not control) might exit. +// If we saved the m for that thread, there would be an m leak each time +// such a thread exited. Instead, we acquire and release an m on each +// call. These should typically not be scheduling operations, just a few +// atomics, so the cost should be small. +// +// TODO(rsc): An alternative would be to allocate a dummy pthread per-thread +// variable using pthread_key_create. Unlike the pthread keys we already use +// on OS X, this dummy key would never be read by Go code. It would exist +// only so that we could register at thread-exit-time destructor. +// That destructor would put the m back onto the extra list. +// This is purely a performance optimization. The current version, +// in which dropm happens on each cgo call, is still correct too. +// We may have to keep the current version on systems with cgo +// but without pthreads, like Windows. +void +runtime·dropm(void) +{ + M *mp, *mnext; + + // Undo whatever initialization minit did during needm. + runtime·unminit(); + + // Clear m and g, and return m to the extra list. + // After the call to setmg we can only call nosplit functions. + mp = g->m; + runtime·setg(nil); + + mnext = lockextra(true); + mp->schedlink = mnext; + unlockextra(mp); +} + +#define MLOCKED ((M*)1) + +// lockextra locks the extra list and returns the list head. 
+// The caller must unlock the list by storing a new list head +// to runtime.extram. If nilokay is true, then lockextra will +// return a nil list head if that's what it finds. If nilokay is false, +// lockextra will keep waiting until the list head is no longer nil. +#pragma textflag NOSPLIT +static M* +lockextra(bool nilokay) +{ + M *mp; + void (*yield)(void); + + for(;;) { + mp = runtime·atomicloadp(&runtime·extram); + if(mp == MLOCKED) { + yield = runtime·osyield; + yield(); + continue; + } + if(mp == nil && !nilokay) { + runtime·usleep(1); + continue; + } + if(!runtime·casp(&runtime·extram, mp, MLOCKED)) { + yield = runtime·osyield; + yield(); + continue; + } + break; + } + return mp; +} + +#pragma textflag NOSPLIT +static void +unlockextra(M *mp) +{ + runtime·atomicstorep(&runtime·extram, mp); +} + + +// Create a new m. It will start off with a call to fn, or else the scheduler. +static void +newm(void(*fn)(void), P *p) +{ + M *mp; + + mp = runtime·allocm(p); + mp->nextp = p; + mp->mstartfn = fn; + + if(runtime·iscgo) { + CgoThreadStart ts; + + if(_cgo_thread_start == nil) + runtime·throw("_cgo_thread_start missing"); + ts.g = mp->g0; + ts.tls = mp->tls; + ts.fn = runtime·mstart; + runtime·asmcgocall(_cgo_thread_start, &ts); + return; + } + runtime·newosproc(mp, (byte*)mp->g0->stack.hi); +} + +// Stops execution of the current m until new work is available. +// Returns with acquired P. +static void +stopm(void) +{ + if(g->m->locks) + runtime·throw("stopm holding locks"); + if(g->m->p) + runtime·throw("stopm holding p"); + if(g->m->spinning) { + g->m->spinning = false; + runtime·xadd(&runtime·sched.nmspinning, -1); + } + +retry: + runtime·lock(&runtime·sched.lock); + mput(g->m); + runtime·unlock(&runtime·sched.lock); + runtime·notesleep(&g->m->park); + runtime·noteclear(&g->m->park); + if(g->m->helpgc) { + runtime·gchelper(); + g->m->helpgc = 0; + g->m->mcache = nil; + goto retry; + } + acquirep(g->m->nextp); + g->m->nextp = nil; +} + +static void +mspinning(void) +{ + g->m->spinning = true; +} + +// Schedules some M to run the p (creates an M if necessary). +// If p==nil, tries to get an idle P, if no idle P's does nothing. +static void +startm(P *p, bool spinning) +{ + M *mp; + void (*fn)(void); + + runtime·lock(&runtime·sched.lock); + if(p == nil) { + p = pidleget(); + if(p == nil) { + runtime·unlock(&runtime·sched.lock); + if(spinning) + runtime·xadd(&runtime·sched.nmspinning, -1); + return; + } + } + mp = mget(); + runtime·unlock(&runtime·sched.lock); + if(mp == nil) { + fn = nil; + if(spinning) + fn = mspinning; + newm(fn, p); + return; + } + if(mp->spinning) + runtime·throw("startm: m is spinning"); + if(mp->nextp) + runtime·throw("startm: m has p"); + mp->spinning = spinning; + mp->nextp = p; + runtime·notewakeup(&mp->park); +} + +// Hands off P from syscall or locked M. 
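lockextra and unlockextra above form a tiny spin lock over the extra-M list: swapping the MLOCKED sentinel in for the head takes the whole list at once, which avoids the ABA hazard of popping single nodes with a bare compare-and-swap. A stand-alone sketch of that sentinel trick on an ordinary linked stack; the types, values, and yielding policy are illustrative:

package main

import (
	"fmt"
	"runtime"
	"sync/atomic"
	"unsafe"
)

// node is an intrusive singly linked stack element, standing in for the extra M's.
type node struct {
	next *node
	val  int
}

var head unsafe.Pointer                  // current list head (*node), nil when empty
var sentinel = unsafe.Pointer(new(node)) // plays the role of MLOCKED

// lockList spins until it has swapped the sentinel in for the real head,
// handing the whole list to the caller.
func lockList() *node {
	for {
		h := atomic.LoadPointer(&head)
		if h == sentinel {
			runtime.Gosched() // someone else holds the list; yield and retry
			continue
		}
		if atomic.CompareAndSwapPointer(&head, h, sentinel) {
			return (*node)(h)
		}
	}
}

// unlockList publishes a new head, releasing the pseudo-lock.
func unlockList(n *node) {
	atomic.StorePointer(&head, unsafe.Pointer(n))
}

func push(v int) {
	l := lockList()
	unlockList(&node{next: l, val: v})
}

func pop() (int, bool) {
	l := lockList()
	if l == nil {
		unlockList(nil)
		return 0, false
	}
	unlockList(l.next)
	return l.val, true
}

func main() {
	push(1)
	push(2)
	v, ok := pop()
	fmt.Println(v, ok) // 2 true
}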
+static void +handoffp(P *p) +{ + // if it has local work, start it straight away + if(p->runqhead != p->runqtail || runtime·sched.runqsize) { + startm(p, false); + return; + } + // no local work, check that there are no spinning/idle M's, + // otherwise our help is not required + if(runtime·atomicload(&runtime·sched.nmspinning) + runtime·atomicload(&runtime·sched.npidle) == 0 && // TODO: fast atomic + runtime·cas(&runtime·sched.nmspinning, 0, 1)){ + startm(p, true); + return; + } + runtime·lock(&runtime·sched.lock); + if(runtime·sched.gcwaiting) { + p->status = Pgcstop; + if(--runtime·sched.stopwait == 0) + runtime·notewakeup(&runtime·sched.stopnote); + runtime·unlock(&runtime·sched.lock); + return; + } + if(runtime·sched.runqsize) { + runtime·unlock(&runtime·sched.lock); + startm(p, false); + return; + } + // If this is the last running P and nobody is polling network, + // need to wakeup another M to poll network. + if(runtime·sched.npidle == runtime·gomaxprocs-1 && runtime·atomicload64(&runtime·sched.lastpoll) != 0) { + runtime·unlock(&runtime·sched.lock); + startm(p, false); + return; + } + pidleput(p); + runtime·unlock(&runtime·sched.lock); +} + +// Tries to add one more P to execute G's. +// Called when a G is made runnable (newproc, ready). +static void +wakep(void) +{ + // be conservative about spinning threads + if(!runtime·cas(&runtime·sched.nmspinning, 0, 1)) + return; + startm(nil, true); +} + +// Stops execution of the current m that is locked to a g until the g is runnable again. +// Returns with acquired P. +static void +stoplockedm(void) +{ + P *p; + uint32 status; + + if(g->m->lockedg == nil || g->m->lockedg->lockedm != g->m) + runtime·throw("stoplockedm: inconsistent locking"); + if(g->m->p) { + // Schedule another M to run this p. + p = releasep(); + handoffp(p); + } + incidlelocked(1); + // Wait until another thread schedules lockedg again. + runtime·notesleep(&g->m->park); + runtime·noteclear(&g->m->park); + status = runtime·readgstatus(g->m->lockedg); + if((status&~Gscan) != Grunnable){ + runtime·printf("runtime:stoplockedm: g is not Grunnable or Gscanrunnable"); + dumpgstatus(g); + runtime·throw("stoplockedm: not runnable"); + } + acquirep(g->m->nextp); + g->m->nextp = nil; +} + +// Schedules the locked m to run the locked gp. +static void +startlockedm(G *gp) +{ + M *mp; + P *p; + + mp = gp->lockedm; + if(mp == g->m) + runtime·throw("startlockedm: locked to me"); + if(mp->nextp) + runtime·throw("startlockedm: m has p"); + // directly handoff current P to the locked m + incidlelocked(-1); + p = releasep(); + mp->nextp = p; + runtime·notewakeup(&mp->park); + stopm(); +} + +// Stops the current m for stoptheworld. +// Returns when the world is restarted. +static void +gcstopm(void) +{ + P *p; + + if(!runtime·sched.gcwaiting) + runtime·throw("gcstopm: not waiting for gc"); + if(g->m->spinning) { + g->m->spinning = false; + runtime·xadd(&runtime·sched.nmspinning, -1); + } + p = releasep(); + runtime·lock(&runtime·sched.lock); + p->status = Pgcstop; + if(--runtime·sched.stopwait == 0) + runtime·notewakeup(&runtime·sched.stopnote); + runtime·unlock(&runtime·sched.lock); + stopm(); +} + +// Schedules gp to run on the current M. +// Never returns. +static void +execute(G *gp) +{ + int32 hz; + + runtime·casgstatus(gp, Grunnable, Grunning); + gp->waitsince = 0; + gp->preempt = false; + gp->stackguard0 = gp->stack.lo + StackGuard; + g->m->p->schedtick++; + g->m->curg = gp; + gp->m = g->m; + + // Check whether the profiler needs to be turned on or off. 
+ hz = runtime·sched.profilehz; + if(g->m->profilehz != hz) + runtime·resetcpuprofiler(hz); + + runtime·gogo(&gp->sched); +} + +// Finds a runnable goroutine to execute. +// Tries to steal from other P's, get g from global queue, poll network. +static G* +findrunnable(void) +{ + G *gp; + P *p; + int32 i; + +top: + if(runtime·sched.gcwaiting) { + gcstopm(); + goto top; + } + if(runtime·fingwait && runtime·fingwake && (gp = runtime·wakefing()) != nil) + runtime·ready(gp); + // local runq + gp = runqget(g->m->p); + if(gp) + return gp; + // global runq + if(runtime·sched.runqsize) { + runtime·lock(&runtime·sched.lock); + gp = globrunqget(g->m->p, 0); + runtime·unlock(&runtime·sched.lock); + if(gp) + return gp; + } + // poll network + gp = runtime·netpoll(false); // non-blocking + if(gp) { + injectglist(gp->schedlink); + runtime·casgstatus(gp, Gwaiting, Grunnable); + return gp; + } + // If number of spinning M's >= number of busy P's, block. + // This is necessary to prevent excessive CPU consumption + // when GOMAXPROCS>>1 but the program parallelism is low. + if(!g->m->spinning && 2 * runtime·atomicload(&runtime·sched.nmspinning) >= runtime·gomaxprocs - runtime·atomicload(&runtime·sched.npidle)) // TODO: fast atomic + goto stop; + if(!g->m->spinning) { + g->m->spinning = true; + runtime·xadd(&runtime·sched.nmspinning, 1); + } + // random steal from other P's + for(i = 0; i < 2*runtime·gomaxprocs; i++) { + if(runtime·sched.gcwaiting) + goto top; + p = runtime·allp[runtime·fastrand1()%runtime·gomaxprocs]; + if(p == g->m->p) + gp = runqget(p); + else + gp = runqsteal(g->m->p, p); + if(gp) + return gp; + } +stop: + // return P and block + runtime·lock(&runtime·sched.lock); + if(runtime·sched.gcwaiting) { + runtime·unlock(&runtime·sched.lock); + goto top; + } + if(runtime·sched.runqsize) { + gp = globrunqget(g->m->p, 0); + runtime·unlock(&runtime·sched.lock); + return gp; + } + p = releasep(); + pidleput(p); + runtime·unlock(&runtime·sched.lock); + if(g->m->spinning) { + g->m->spinning = false; + runtime·xadd(&runtime·sched.nmspinning, -1); + } + // check all runqueues once again + for(i = 0; i < runtime·gomaxprocs; i++) { + p = runtime·allp[i]; + if(p && p->runqhead != p->runqtail) { + runtime·lock(&runtime·sched.lock); + p = pidleget(); + runtime·unlock(&runtime·sched.lock); + if(p) { + acquirep(p); + goto top; + } + break; + } + } + // poll network + if(runtime·xchg64(&runtime·sched.lastpoll, 0) != 0) { + if(g->m->p) + runtime·throw("findrunnable: netpoll with p"); + if(g->m->spinning) + runtime·throw("findrunnable: netpoll with spinning"); + gp = runtime·netpoll(true); // block until new work is available + runtime·atomicstore64(&runtime·sched.lastpoll, runtime·nanotime()); + if(gp) { + runtime·lock(&runtime·sched.lock); + p = pidleget(); + runtime·unlock(&runtime·sched.lock); + if(p) { + acquirep(p); + injectglist(gp->schedlink); + runtime·casgstatus(gp, Gwaiting, Grunnable); + return gp; + } + injectglist(gp); + } + } + stopm(); + goto top; +} + +static void +resetspinning(void) +{ + int32 nmspinning; + + if(g->m->spinning) { + g->m->spinning = false; + nmspinning = runtime·xadd(&runtime·sched.nmspinning, -1); + if(nmspinning < 0) + runtime·throw("findrunnable: negative nmspinning"); + } else + nmspinning = runtime·atomicload(&runtime·sched.nmspinning); + + // M wakeup policy is deliberately somewhat conservative (see nmspinning handling), + // so see if we need to wakeup another P here. 
+ if (nmspinning == 0 && runtime·atomicload(&runtime·sched.npidle) > 0) + wakep(); +} + +// Injects the list of runnable G's into the scheduler. +// Can run concurrently with GC. +static void +injectglist(G *glist) +{ + int32 n; + G *gp; + + if(glist == nil) + return; + runtime·lock(&runtime·sched.lock); + for(n = 0; glist; n++) { + gp = glist; + glist = gp->schedlink; + runtime·casgstatus(gp, Gwaiting, Grunnable); + globrunqput(gp); + } + runtime·unlock(&runtime·sched.lock); + + for(; n && runtime·sched.npidle; n--) + startm(nil, false); +} + +// One round of scheduler: find a runnable goroutine and execute it. +// Never returns. +static void +schedule(void) +{ + G *gp; + uint32 tick; + + if(g->m->locks) + runtime·throw("schedule: holding locks"); + + if(g->m->lockedg) { + stoplockedm(); + execute(g->m->lockedg); // Never returns. + } + +top: + if(runtime·sched.gcwaiting) { + gcstopm(); + goto top; + } + + gp = nil; + // Check the global runnable queue once in a while to ensure fairness. + // Otherwise two goroutines can completely occupy the local runqueue + // by constantly respawning each other. + tick = g->m->p->schedtick; + // This is a fancy way to say tick%61==0, + // it uses 2 MUL instructions instead of a single DIV and so is faster on modern processors. + if(tick - (((uint64)tick*0x4325c53fu)>>36)*61 == 0 && runtime·sched.runqsize > 0) { + runtime·lock(&runtime·sched.lock); + gp = globrunqget(g->m->p, 1); + runtime·unlock(&runtime·sched.lock); + if(gp) + resetspinning(); + } + if(gp == nil) { + gp = runqget(g->m->p); + if(gp && g->m->spinning) + runtime·throw("schedule: spinning with local work"); + } + if(gp == nil) { + gp = findrunnable(); // blocks until work is available + resetspinning(); + } + + if(gp->lockedm) { + // Hands off own p to the locked m, + // then blocks waiting for a new p. + startlockedm(gp); + goto top; + } + + execute(gp); +} + +// dropg removes the association between m and the current goroutine m->curg (gp for short). +// Typically a caller sets gp's status away from Grunning and then +// immediately calls dropg to finish the job. The caller is also responsible +// for arranging that gp will be restarted using runtime·ready at an +// appropriate time. After calling dropg and arranging for gp to be +// readied later, the caller can do other work but eventually should +// call schedule to restart the scheduling of goroutines on this m. +static void +dropg(void) +{ + if(g->m->lockedg == nil) { + g->m->curg->m = nil; + g->m->curg = nil; + } +} + +// Puts the current goroutine into a waiting state and calls unlockf. +// If unlockf returns false, the goroutine is resumed. +void +runtime·park(bool(*unlockf)(G*, void*), void *lock, String reason) +{ + void (*fn)(G*); + + g->m->waitlock = lock; + g->m->waitunlockf = unlockf; + g->waitreason = reason; + fn = runtime·park_m; + runtime·mcall(&fn); +} + +bool +runtime·parkunlock_c(G *gp, void *lock) +{ + USED(gp); + runtime·unlock(lock); + return true; +} + +// Puts the current goroutine into a waiting state and unlocks the lock. +// The goroutine can be made runnable again by calling runtime·ready(gp). +void +runtime·parkunlock(Mutex *lock, String reason) +{ + runtime·park(runtime·parkunlock_c, lock, reason); +} + +// runtime·park continuation on g0. 
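Editorial aside: schedule() above checks the global run queue roughly once every 61 ticks, and its comment notes that tick - (((uint64)tick*0x4325c53f)>>36)*61 == 0 is just a multiply-and-shift rendering of tick%61 == 0 (0x4325c53f is ceil(2^36/61)). The throwaway Go check below verifies that identity over a sample of 32-bit tick values; it is a verification sketch, not runtime code. The park_m continuation announced just above follows after it.

package main

import "fmt"

// mod61 mirrors the scheduler's trick: floor(tick/61) is computed as
// (tick*0x4325c53f)>>36, and the remainder falls out as tick - quotient*61,
// avoiding a hardware divide on the hot path.
func mod61(tick uint32) uint32 {
	q := uint32((uint64(tick) * 0x4325c53f) >> 36)
	return tick - q*61
}

func main() {
	// Spot-check the identity against the obvious tick%61.
	for _, tick := range []uint32{0, 1, 60, 61, 62, 1<<16 - 1, 1<<31 - 1, 1<<32 - 1} {
		if mod61(tick) != tick%61 {
			fmt.Println("mismatch at", tick)
			return
		}
	}
	for tick := uint32(0); tick < 10_000_000; tick++ {
		if mod61(tick) != tick%61 {
			fmt.Println("mismatch at", tick)
			return
		}
	}
	fmt.Println("tick - ((tick*0x4325c53f)>>36)*61 == tick%61 for all sampled ticks")
}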
+void +runtime·park_m(G *gp) +{ + bool ok; + + runtime·casgstatus(gp, Grunning, Gwaiting); + dropg(); + + if(g->m->waitunlockf) { + ok = g->m->waitunlockf(gp, g->m->waitlock); + g->m->waitunlockf = nil; + g->m->waitlock = nil; + if(!ok) { + runtime·casgstatus(gp, Gwaiting, Grunnable); + execute(gp); // Schedule it back, never returns. + } + } + + schedule(); +} + +// Gosched continuation on g0. +void +runtime·gosched_m(G *gp) +{ + uint32 status; + + status = runtime·readgstatus(gp); + if((status&~Gscan) != Grunning){ + dumpgstatus(gp); + runtime·throw("bad g status"); + } + runtime·casgstatus(gp, Grunning, Grunnable); + dropg(); + runtime·lock(&runtime·sched.lock); + globrunqput(gp); + runtime·unlock(&runtime·sched.lock); + + schedule(); +} + +// Finishes execution of the current goroutine. +// Must be NOSPLIT because it is called from Go. +#pragma textflag NOSPLIT +void +runtime·goexit1(void) +{ + void (*fn)(G*); + + if(raceenabled) + runtime·racegoend(); + fn = goexit0; + runtime·mcall(&fn); +} + +// runtime·goexit continuation on g0. +static void +goexit0(G *gp) +{ + runtime·casgstatus(gp, Grunning, Gdead); + gp->m = nil; + gp->lockedm = nil; + g->m->lockedg = nil; + gp->paniconfault = 0; + gp->defer = nil; // should be true already but just in case. + gp->panic = nil; // non-nil for Goexit during panic. points at stack-allocated data. + gp->writebuf.array = nil; + gp->writebuf.len = 0; + gp->writebuf.cap = 0; + gp->waitreason.str = nil; + gp->waitreason.len = 0; + gp->param = nil; + + dropg(); + + if(g->m->locked & ~LockExternal) { + runtime·printf("invalid m->locked = %d\n", g->m->locked); + runtime·throw("internal lockOSThread error"); + } + g->m->locked = 0; + gfput(g->m->p, gp); + schedule(); +} + +#pragma textflag NOSPLIT +static void +save(uintptr pc, uintptr sp) +{ + g->sched.pc = pc; + g->sched.sp = sp; + g->sched.lr = 0; + g->sched.ret = 0; + g->sched.ctxt = 0; + g->sched.g = g; +} + +static void entersyscall_bad(void); +static void entersyscall_sysmon(void); +static void entersyscall_gcwait(void); + +// The goroutine g is about to enter a system call. +// Record that it's not using the cpu anymore. +// This is called only from the go syscall library and cgocall, +// not from the low-level system calls used by the runtime. +// +// Entersyscall cannot split the stack: the runtime·gosave must +// make g->sched refer to the caller's stack segment, because +// entersyscall is going to return immediately after. +// +// Nothing entersyscall calls can split the stack either. +// We cannot safely move the stack during an active call to syscall, +// because we do not know which of the uintptr arguments are +// really pointers (back into the stack). +// In practice, this means that we make the fast path run through +// entersyscall doing no-split things, and the slow path has to use onM +// to run bigger things on the m stack. +// +// reentersyscall is the entry point used by cgo callbacks, where explicitly +// saved SP and PC are restored. This is needed when exitsyscall will be called +// from a function further up in the call stack than the parent, as g->syscallsp +// must always point to a valid stack frame. entersyscall below is the normal +// entry point for syscalls, which obtains the SP and PC from the caller. +#pragma textflag NOSPLIT +void +runtime·reentersyscall(uintptr pc, uintptr sp) +{ + void (*fn)(void); + + // Disable preemption because during this function g is in Gsyscall status, + // but can have inconsistent g->sched, do not let GC observe it. 
+ g->m->locks++; + + // Entersyscall must not call any function that might split/grow the stack. + // (See details in comment above.) + // Catch calls that might, by replacing the stack guard with something that + // will trip any stack check and leaving a flag to tell newstack to die. + g->stackguard0 = StackPreempt; + g->throwsplit = 1; + + // Leave SP around for GC and traceback. + save(pc, sp); + g->syscallsp = sp; + g->syscallpc = pc; + runtime·casgstatus(g, Grunning, Gsyscall); + if(g->syscallsp < g->stack.lo || g->stack.hi < g->syscallsp) { + fn = entersyscall_bad; + runtime·onM(&fn); + } + + if(runtime·atomicload(&runtime·sched.sysmonwait)) { // TODO: fast atomic + fn = entersyscall_sysmon; + runtime·onM(&fn); + save(pc, sp); + } + + g->m->mcache = nil; + g->m->p->m = nil; + runtime·atomicstore(&g->m->p->status, Psyscall); + if(runtime·sched.gcwaiting) { + fn = entersyscall_gcwait; + runtime·onM(&fn); + save(pc, sp); + } + + // Goroutines must not split stacks in Gsyscall status (it would corrupt g->sched). + // We set stackguard to StackPreempt so that first split stack check calls morestack. + // Morestack detects this case and throws. + g->stackguard0 = StackPreempt; + g->m->locks--; +} + +// Standard syscall entry used by the go syscall library and normal cgo calls. +#pragma textflag NOSPLIT +void +·entersyscall(int32 dummy) +{ + runtime·reentersyscall((uintptr)runtime·getcallerpc(&dummy), runtime·getcallersp(&dummy)); +} + +static void +entersyscall_bad(void) +{ + G *gp; + + gp = g->m->curg; + runtime·printf("entersyscall inconsistent %p [%p,%p]\n", + gp->syscallsp, gp->stack.lo, gp->stack.hi); + runtime·throw("entersyscall"); +} + +static void +entersyscall_sysmon(void) +{ + runtime·lock(&runtime·sched.lock); + if(runtime·atomicload(&runtime·sched.sysmonwait)) { + runtime·atomicstore(&runtime·sched.sysmonwait, 0); + runtime·notewakeup(&runtime·sched.sysmonnote); + } + runtime·unlock(&runtime·sched.lock); +} + +static void +entersyscall_gcwait(void) +{ + runtime·lock(&runtime·sched.lock); + if (runtime·sched.stopwait > 0 && runtime·cas(&g->m->p->status, Psyscall, Pgcstop)) { + if(--runtime·sched.stopwait == 0) + runtime·notewakeup(&runtime·sched.stopnote); + } + runtime·unlock(&runtime·sched.lock); +} + +static void entersyscallblock_handoff(void); + +// The same as runtime·entersyscall(), but with a hint that the syscall is blocking. +#pragma textflag NOSPLIT +void +·entersyscallblock(int32 dummy) +{ + void (*fn)(void); + + g->m->locks++; // see comment in entersyscall + g->throwsplit = 1; + g->stackguard0 = StackPreempt; // see comment in entersyscall + + // Leave SP around for GC and traceback. + save((uintptr)runtime·getcallerpc(&dummy), runtime·getcallersp(&dummy)); + g->syscallsp = g->sched.sp; + g->syscallpc = g->sched.pc; + runtime·casgstatus(g, Grunning, Gsyscall); + if(g->syscallsp < g->stack.lo || g->stack.hi < g->syscallsp) { + fn = entersyscall_bad; + runtime·onM(&fn); + } + + fn = entersyscallblock_handoff; + runtime·onM(&fn); + + // Resave for traceback during blocked call. + save((uintptr)runtime·getcallerpc(&dummy), runtime·getcallersp(&dummy)); + + g->m->locks--; +} + +static void +entersyscallblock_handoff(void) +{ + handoffp(releasep()); +} + +// The goroutine g exited its system call. +// Arrange for it to run on a cpu again. +// This is called only from the go syscall library, not +// from the low-level system calls used by the runtime. 
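Editorial aside: the fast path of the exit, exitsyscallfast further below, tries to win the old P back with a single CAS of its status from Psyscall to Prunning; only if someone (typically sysmon's retake) already took the P does the goroutine fall back to the scheduler. The stand-alone Go model below shows just that decision; the constants and the tryReacquire helper are invented for illustration and are not runtime API. The ·exitsyscall body introduced by the comment above follows after it.

package main

import (
	"fmt"
	"sync/atomic"
)

// Toy P statuses, mirroring Pidle/Prunning/Psyscall in spirit only.
const (
	pIdle uint32 = iota
	pRunning
	pSyscall
)

type proc struct{ status atomic.Uint32 }

// enterSyscall marks the P as sitting in a syscall; it stays attached to
// the M, but another thread may retake it if the call blocks for long.
func enterSyscall(p *proc) { p.status.Store(pSyscall) }

// tryReacquire is the exitsyscallfast idea: one CAS wins the P back if
// nobody retook it while we were in the kernel.
func tryReacquire(p *proc) bool {
	return p.status.CompareAndSwap(pSyscall, pRunning)
}

func main() {
	p := &proc{}
	p.status.Store(pRunning)

	enterSyscall(p)
	if tryReacquire(p) {
		fmt.Println("fast path: kept the same P")
	}

	enterSyscall(p)
	p.status.Store(pIdle) // pretend the P was retaken meanwhile
	if !tryReacquire(p) {
		fmt.Println("slow path: P was retaken, must ask the scheduler for another")
	}
}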
+#pragma textflag NOSPLIT +void +·exitsyscall(int32 dummy) +{ + void (*fn)(G*); + + g->m->locks++; // see comment in entersyscall + + if(runtime·getcallersp(&dummy) > g->syscallsp) + runtime·throw("exitsyscall: syscall frame is no longer valid"); + + g->waitsince = 0; + if(exitsyscallfast()) { + // There's a cpu for us, so we can run. + g->m->p->syscalltick++; + // We need to cas the status and scan before resuming... + runtime·casgstatus(g, Gsyscall, Grunning); + + // Garbage collector isn't running (since we are), + // so okay to clear syscallsp. + g->syscallsp = (uintptr)nil; + g->m->locks--; + if(g->preempt) { + // restore the preemption request in case we've cleared it in newstack + g->stackguard0 = StackPreempt; + } else { + // otherwise restore the real stackguard, we've spoiled it in entersyscall/entersyscallblock + g->stackguard0 = g->stack.lo + StackGuard; + } + g->throwsplit = 0; + return; + } + + g->m->locks--; + + // Call the scheduler. + fn = exitsyscall0; + runtime·mcall(&fn); + + // Scheduler returned, so we're allowed to run now. + // Delete the syscallsp information that we left for + // the garbage collector during the system call. + // Must wait until now because until gosched returns + // we don't know for sure that the garbage collector + // is not running. + g->syscallsp = (uintptr)nil; + g->m->p->syscalltick++; + g->throwsplit = 0; +} + +static void exitsyscallfast_pidle(void); + +#pragma textflag NOSPLIT +static bool +exitsyscallfast(void) +{ + void (*fn)(void); + + // Freezetheworld sets stopwait but does not retake P's. + if(runtime·sched.stopwait) { + g->m->p = nil; + return false; + } + + // Try to re-acquire the last P. + if(g->m->p && g->m->p->status == Psyscall && runtime·cas(&g->m->p->status, Psyscall, Prunning)) { + // There's a cpu for us, so we can run. + g->m->mcache = g->m->p->mcache; + g->m->p->m = g->m; + return true; + } + // Try to get any other idle P. + g->m->p = nil; + if(runtime·sched.pidle) { + fn = exitsyscallfast_pidle; + runtime·onM(&fn); + if(g->m->scalararg[0]) { + g->m->scalararg[0] = 0; + return true; + } + } + return false; +} + +static void +exitsyscallfast_pidle(void) +{ + P *p; + + runtime·lock(&runtime·sched.lock); + p = pidleget(); + if(p && runtime·atomicload(&runtime·sched.sysmonwait)) { + runtime·atomicstore(&runtime·sched.sysmonwait, 0); + runtime·notewakeup(&runtime·sched.sysmonnote); + } + runtime·unlock(&runtime·sched.lock); + if(p) { + acquirep(p); + g->m->scalararg[0] = 1; + } else + g->m->scalararg[0] = 0; +} + +// runtime·exitsyscall slow path on g0. +// Failed to acquire P, enqueue gp as runnable. +static void +exitsyscall0(G *gp) +{ + P *p; + + runtime·casgstatus(gp, Gsyscall, Grunnable); + dropg(); + runtime·lock(&runtime·sched.lock); + p = pidleget(); + if(p == nil) + globrunqput(gp); + else if(runtime·atomicload(&runtime·sched.sysmonwait)) { + runtime·atomicstore(&runtime·sched.sysmonwait, 0); + runtime·notewakeup(&runtime·sched.sysmonnote); + } + runtime·unlock(&runtime·sched.lock); + if(p) { + acquirep(p); + execute(gp); // Never returns. + } + if(g->m->lockedg) { + // Wait until another thread schedules gp and so m again. + stoplockedm(); + execute(gp); // Never returns. + } + stopm(); + schedule(); // Never returns. +} + +static void +beforefork(void) +{ + G *gp; + + gp = g->m->curg; + // Fork can hang if preempted with signals frequently enough (see issue 5517). + // Ensure that we stay on the same M where we disable profiling. 
+ gp->m->locks++; + if(gp->m->profilehz != 0) + runtime·resetcpuprofiler(0); + + // This function is called before fork in syscall package. + // Code between fork and exec must not allocate memory nor even try to grow stack. + // Here we spoil g->stackguard to reliably detect any attempts to grow stack. + // runtime_AfterFork will undo this in parent process, but not in child. + gp->stackguard0 = StackFork; +} + +// Called from syscall package before fork. +#pragma textflag NOSPLIT +void +syscall·runtime_BeforeFork(void) +{ + void (*fn)(void); + + fn = beforefork; + runtime·onM(&fn); +} + +static void +afterfork(void) +{ + int32 hz; + G *gp; + + gp = g->m->curg; + // See the comment in runtime_BeforeFork. + gp->stackguard0 = gp->stack.lo + StackGuard; + + hz = runtime·sched.profilehz; + if(hz != 0) + runtime·resetcpuprofiler(hz); + gp->m->locks--; +} + +// Called from syscall package after fork in parent. +#pragma textflag NOSPLIT +void +syscall·runtime_AfterFork(void) +{ + void (*fn)(void); + + fn = afterfork; + runtime·onM(&fn); +} + +// Hook used by runtime·malg to call runtime·stackalloc on the +// scheduler stack. This exists because runtime·stackalloc insists +// on being called on the scheduler stack, to avoid trying to grow +// the stack while allocating a new stack segment. +static void +mstackalloc(G *gp) +{ + G *newg; + uintptr size; + + newg = g->m->ptrarg[0]; + size = g->m->scalararg[0]; + + newg->stack = runtime·stackalloc(size); + + runtime·gogo(&gp->sched); +} + +// Allocate a new g, with a stack big enough for stacksize bytes. +G* +runtime·malg(int32 stacksize) +{ + G *newg; + void (*fn)(G*); + + newg = allocg(); + if(stacksize >= 0) { + stacksize = runtime·round2(StackSystem + stacksize); + if(g == g->m->g0) { + // running on scheduler stack already. + newg->stack = runtime·stackalloc(stacksize); + } else { + // have to call stackalloc on scheduler stack. + g->m->scalararg[0] = stacksize; + g->m->ptrarg[0] = newg; + fn = mstackalloc; + runtime·mcall(&fn); + g->m->ptrarg[0] = nil; + } + newg->stackguard0 = newg->stack.lo + StackGuard; + newg->stackguard1 = ~(uintptr)0; + } + return newg; +} + +static void +newproc_m(void) +{ + byte *argp; + void *callerpc; + FuncVal *fn; + int32 siz; + + siz = g->m->scalararg[0]; + callerpc = (void*)g->m->scalararg[1]; + argp = g->m->ptrarg[0]; + fn = (FuncVal*)g->m->ptrarg[1]; + + runtime·newproc1(fn, argp, siz, 0, callerpc); + g->m->ptrarg[0] = nil; + g->m->ptrarg[1] = nil; +} + +// Create a new g running fn with siz bytes of arguments. +// Put it on the queue of g's waiting to run. +// The compiler turns a go statement into a call to this. +// Cannot split the stack because it assumes that the arguments +// are available sequentially after &fn; they would not be +// copied if a stack split occurred. +#pragma textflag NOSPLIT +void +runtime·newproc(int32 siz, FuncVal* fn, ...) +{ + byte *argp; + void (*mfn)(void); + + if(thechar == '5') + argp = (byte*)(&fn+2); // skip caller's saved LR + else + argp = (byte*)(&fn+1); + + g->m->locks++; + g->m->scalararg[0] = siz; + g->m->scalararg[1] = (uintptr)runtime·getcallerpc(&siz); + g->m->ptrarg[0] = argp; + g->m->ptrarg[1] = fn; + mfn = newproc_m; + runtime·onM(&mfn); + g->m->locks--; +} + +void runtime·main(void); + +// Create a new g running fn with narg bytes of arguments starting +// at argp and returning nret bytes of results. callerpc is the +// address of the go statement that created this. The new g is put +// on the queue of g's waiting to run. 
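Editorial aside: one detail of newproc1 below that is easy to miss is how goroutine ids are handed out. Each P grabs a batch of GoidCacheBatch ids from the shared sched.goidgen counter with a single atomic add, then serves ids locally until the batch runs dry, so the common case needs no shared-memory traffic at all. The stand-alone sketch below shows the same batching idea; idCache, nextID, and the batch size are illustrative names, not the runtime's.

package main

import (
	"fmt"
	"sync/atomic"
)

const batch = 16 // the runtime uses GoidCacheBatch in this role

// goidgen plays the part of sched.goidgen: the last id handed out globally.
var goidgen atomic.Uint64

// idCache is the per-P cache: ids in [next, end) are ours to assign cheaply.
type idCache struct{ next, end uint64 }

func (c *idCache) nextID() uint64 {
	if c.next == c.end {
		// Refill: goidgen jumps by a whole batch and we own (top-batch, top];
		// like the runtime, bias so the very first id handed out is 1.
		top := goidgen.Add(batch)
		c.next = top - batch + 1
		c.end = top + 1
	}
	id := c.next
	c.next++
	return id
}

func main() {
	var a, b idCache
	fmt.Println(a.nextID(), a.nextID(), a.nextID()) // 1 2 3
	fmt.Println(b.nextID(), b.nextID())             // 17 18 (a fresh batch)
	// Ids stay unique across caches without taking a lock per allocation.
}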
+G* +runtime·newproc1(FuncVal *fn, byte *argp, int32 narg, int32 nret, void *callerpc) +{ + byte *sp; + G *newg; + P *p; + int32 siz; + + if(fn == nil) { + g->m->throwing = -1; // do not dump full stacks + runtime·throw("go of nil func value"); + } + g->m->locks++; // disable preemption because it can be holding p in a local var + siz = narg + nret; + siz = (siz+7) & ~7; + + // We could allocate a larger initial stack if necessary. + // Not worth it: this is almost always an error. + // 4*sizeof(uintreg): extra space added below + // sizeof(uintreg): caller's LR (arm) or return address (x86, in gostartcall). + if(siz >= StackMin - 4*sizeof(uintreg) - sizeof(uintreg)) + runtime·throw("runtime.newproc: function arguments too large for new goroutine"); + + p = g->m->p; + if((newg = gfget(p)) == nil) { + newg = runtime·malg(StackMin); + runtime·casgstatus(newg, Gidle, Gdead); + runtime·allgadd(newg); // publishes with a g->status of Gdead so GC scanner doesn't look at uninitialized stack. + } + if(newg->stack.hi == 0) + runtime·throw("newproc1: newg missing stack"); + + if(runtime·readgstatus(newg) != Gdead) + runtime·throw("newproc1: new g is not Gdead"); + + sp = (byte*)newg->stack.hi; + sp -= 4*sizeof(uintreg); // extra space in case of reads slightly beyond frame + sp -= siz; + runtime·memmove(sp, argp, narg); + if(thechar == '5') { + // caller's LR + sp -= sizeof(void*); + *(void**)sp = nil; + } + + runtime·memclr((byte*)&newg->sched, sizeof newg->sched); + newg->sched.sp = (uintptr)sp; + newg->sched.pc = (uintptr)runtime·goexit + PCQuantum; // +PCQuantum so that previous instruction is in same function + newg->sched.g = newg; + runtime·gostartcallfn(&newg->sched, fn); + newg->gopc = (uintptr)callerpc; + runtime·casgstatus(newg, Gdead, Grunnable); + + if(p->goidcache == p->goidcacheend) { + // Sched.goidgen is the last allocated id, + // this batch must be [sched.goidgen+1, sched.goidgen+GoidCacheBatch]. + // At startup sched.goidgen=0, so main goroutine receives goid=1. + p->goidcache = runtime·xadd64(&runtime·sched.goidgen, GoidCacheBatch); + p->goidcache -= GoidCacheBatch - 1; + p->goidcacheend = p->goidcache + GoidCacheBatch; + } + newg->goid = p->goidcache++; + if(raceenabled) + newg->racectx = runtime·racegostart((void*)callerpc); + runqput(p, newg); + + if(runtime·atomicload(&runtime·sched.npidle) != 0 && runtime·atomicload(&runtime·sched.nmspinning) == 0 && fn->fn != runtime·main) // TODO: fast atomic + wakep(); + g->m->locks--; + if(g->m->locks == 0 && g->preempt) // restore the preemption request in case we've cleared it in newstack + g->stackguard0 = StackPreempt; + return newg; +} + +// Put on gfree list. +// If local list is too long, transfer a batch to the global list. +static void +gfput(P *p, G *gp) +{ + uintptr stksize; + + if(runtime·readgstatus(gp) != Gdead) + runtime·throw("gfput: bad status (not Gdead)"); + + stksize = gp->stack.hi - gp->stack.lo; + + if(stksize != FixedStack) { + // non-standard stack size - free it. + runtime·stackfree(gp->stack); + gp->stack.lo = 0; + gp->stack.hi = 0; + gp->stackguard0 = 0; + } + gp->schedlink = p->gfree; + p->gfree = gp; + p->gfreecnt++; + if(p->gfreecnt >= 64) { + runtime·lock(&runtime·sched.gflock); + while(p->gfreecnt >= 32) { + p->gfreecnt--; + gp = p->gfree; + p->gfree = gp->schedlink; + gp->schedlink = runtime·sched.gfree; + runtime·sched.gfree = gp; + runtime·sched.ngfree++; + } + runtime·unlock(&runtime·sched.gflock); + } +} + +// Get from gfree list. +// If local list is empty, grab a batch from global list. 
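Editorial aside: gfput above and gfget below form a two-level free list for dead Gs. Each P keeps its own list, spills it down to 32 entries under the global gflock once it reaches 64, and an empty local list refills from the global one in batches of up to 32. The generic Go rendering below shows that spill/refill shape; freeItem, spillAt, refillTo, and the rest are invented names for illustration, not runtime code. The gfget implementation introduced by the comment above follows after it.

package main

import (
	"fmt"
	"sync"
)

type freeItem struct{ next *freeItem }

const (
	spillAt  = 64 // a local list this long spills to the global list...
	spillTo  = 32 // ...until only this many remain locally
	refillTo = 32 // an empty local list pulls back at most this many
)

var global struct {
	sync.Mutex
	head *freeItem
	n    int
}

type local struct {
	head *freeItem
	n    int
}

func (l *local) put(it *freeItem) {
	it.next = l.head
	l.head = it
	l.n++
	if l.n >= spillAt {
		global.Lock()
		for l.n >= spillTo {
			x := l.head
			l.head = x.next
			l.n--
			x.next = global.head
			global.head = x
			global.n++
		}
		global.Unlock()
	}
}

func (l *local) get() *freeItem {
	if l.head == nil {
		global.Lock()
		for l.n < refillTo && global.head != nil {
			x := global.head
			global.head = x.next
			global.n--
			x.next = l.head
			l.head = x
			l.n++
		}
		global.Unlock()
	}
	it := l.head
	if it != nil {
		l.head = it.next
		l.n--
	}
	return it
}

func main() {
	var l local
	for i := 0; i < 100; i++ {
		l.put(new(freeItem)) // spills once the local list reaches 64
	}
	fmt.Println("local:", l.n, "global:", global.n)
	for l.get() != nil {
	} // drains local, then refills from global in batches
	fmt.Println("after draining, global:", global.n)
}

The point of the two levels is the same as in the runtime: the common put/get touches only P-local state, and the global lock is taken only when a whole batch moves.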
+static G* +gfget(P *p) +{ + G *gp; + void (*fn)(G*); + +retry: + gp = p->gfree; + if(gp == nil && runtime·sched.gfree) { + runtime·lock(&runtime·sched.gflock); + while(p->gfreecnt < 32 && runtime·sched.gfree != nil) { + p->gfreecnt++; + gp = runtime·sched.gfree; + runtime·sched.gfree = gp->schedlink; + runtime·sched.ngfree--; + gp->schedlink = p->gfree; + p->gfree = gp; + } + runtime·unlock(&runtime·sched.gflock); + goto retry; + } + if(gp) { + p->gfree = gp->schedlink; + p->gfreecnt--; + + if(gp->stack.lo == 0) { + // Stack was deallocated in gfput. Allocate a new one. + if(g == g->m->g0) { + gp->stack = runtime·stackalloc(FixedStack); + } else { + g->m->scalararg[0] = FixedStack; + g->m->ptrarg[0] = gp; + fn = mstackalloc; + runtime·mcall(&fn); + g->m->ptrarg[0] = nil; + } + gp->stackguard0 = gp->stack.lo + StackGuard; + } else { + if(raceenabled) + runtime·racemalloc((void*)gp->stack.lo, gp->stack.hi - gp->stack.lo); + } + } + return gp; +} + +// Purge all cached G's from gfree list to the global list. +static void +gfpurge(P *p) +{ + G *gp; + + runtime·lock(&runtime·sched.gflock); + while(p->gfreecnt != 0) { + p->gfreecnt--; + gp = p->gfree; + p->gfree = gp->schedlink; + gp->schedlink = runtime·sched.gfree; + runtime·sched.gfree = gp; + runtime·sched.ngfree++; + } + runtime·unlock(&runtime·sched.gflock); +} + +#pragma textflag NOSPLIT +void +runtime·Breakpoint(void) +{ + runtime·breakpoint(); +} + +// lockOSThread is called by runtime.LockOSThread and runtime.lockOSThread below +// after they modify m->locked. Do not allow preemption during this call, +// or else the m might be different in this function than in the caller. +#pragma textflag NOSPLIT +static void +lockOSThread(void) +{ + g->m->lockedg = g; + g->lockedm = g->m; +} + +#pragma textflag NOSPLIT +void +runtime·LockOSThread(void) +{ + g->m->locked |= LockExternal; + lockOSThread(); +} + +#pragma textflag NOSPLIT +void +runtime·lockOSThread(void) +{ + g->m->locked += LockInternal; + lockOSThread(); +} + + +// unlockOSThread is called by runtime.UnlockOSThread and runtime.unlockOSThread below +// after they update m->locked. Do not allow preemption during this call, +// or else the m might be in different in this function than in the caller. +#pragma textflag NOSPLIT +static void +unlockOSThread(void) +{ + if(g->m->locked != 0) + return; + g->m->lockedg = nil; + g->lockedm = nil; +} + +#pragma textflag NOSPLIT +void +runtime·UnlockOSThread(void) +{ + g->m->locked &= ~LockExternal; + unlockOSThread(); +} + +static void badunlockOSThread(void); + +#pragma textflag NOSPLIT +void +runtime·unlockOSThread(void) +{ + void (*fn)(void); + + if(g->m->locked < LockInternal) { + fn = badunlockOSThread; + runtime·onM(&fn); + } + g->m->locked -= LockInternal; + unlockOSThread(); +} + +static void +badunlockOSThread(void) +{ + runtime·throw("runtime: internal error: misuse of lockOSThread/unlockOSThread"); +} + +#pragma textflag NOSPLIT +int32 +runtime·gcount(void) +{ + P *p, **pp; + int32 n; + + n = runtime·allglen - runtime·sched.ngfree; + for(pp=runtime·allp; p=*pp; pp++) + n -= p->gfreecnt; + // All these variables can be changed concurrently, so the result can be inconsistent. + // But at least the current goroutine is running. 
+ if(n < 1) + n = 1; + return n; +} + +int32 +runtime·mcount(void) +{ + return runtime·sched.mcount; +} + +static struct ProfState { + uint32 lock; + int32 hz; +} prof; + +static void System(void) { System(); } +static void ExternalCode(void) { ExternalCode(); } +static void GC(void) { GC(); } + +extern void runtime·cpuproftick(uintptr*, int32); +extern byte runtime·etext[]; + +// Called if we receive a SIGPROF signal. +void +runtime·sigprof(uint8 *pc, uint8 *sp, uint8 *lr, G *gp, M *mp) +{ + int32 n; + bool traceback; + // Do not use global m in this function, use mp instead. + // On windows one m is sending reports about all the g's, so m means a wrong thing. + byte m; + uintptr stk[100]; + + m = 0; + USED(m); + + if(prof.hz == 0) + return; + + // Profiling runs concurrently with GC, so it must not allocate. + mp->mallocing++; + + // Define that a "user g" is a user-created goroutine, and a "system g" + // is one that is m->g0 or m->gsignal. We've only made sure that we + // can unwind user g's, so exclude the system g's. + // + // It is not quite as easy as testing gp == m->curg (the current user g) + // because we might be interrupted for profiling halfway through a + // goroutine switch. The switch involves updating three (or four) values: + // g, PC, SP, and (on arm) LR. The PC must be the last to be updated, + // because once it gets updated the new g is running. + // + // When switching from a user g to a system g, LR is not considered live, + // so the update only affects g, SP, and PC. Since PC must be last, there + // the possible partial transitions in ordinary execution are (1) g alone is updated, + // (2) both g and SP are updated, and (3) SP alone is updated. + // If g is updated, we'll see a system g and not look closer. + // If SP alone is updated, we can detect the partial transition by checking + // whether the SP is within g's stack bounds. (We could also require that SP + // be changed only after g, but the stack bounds check is needed by other + // cases, so there is no need to impose an additional requirement.) + // + // There is one exceptional transition to a system g, not in ordinary execution. + // When a signal arrives, the operating system starts the signal handler running + // with an updated PC and SP. The g is updated last, at the beginning of the + // handler. There are two reasons this is okay. First, until g is updated the + // g and SP do not match, so the stack bounds check detects the partial transition. + // Second, signal handlers currently run with signals disabled, so a profiling + // signal cannot arrive during the handler. + // + // When switching from a system g to a user g, there are three possibilities. + // + // First, it may be that the g switch has no PC update, because the SP + // either corresponds to a user g throughout (as in runtime.asmcgocall) + // or because it has been arranged to look like a user g frame + // (as in runtime.cgocallback_gofunc). In this case, since the entire + // transition is a g+SP update, a partial transition updating just one of + // those will be detected by the stack bounds check. + // + // Second, when returning from a signal handler, the PC and SP updates + // are performed by the operating system in an atomic update, so the g + // update must be done before them. The stack bounds check detects + // the partial transition here, and (again) signal handlers run with signals + // disabled, so a profiling signal cannot arrive then anyway. 
+ // + // Third, the common case: it may be that the switch updates g, SP, and PC + // separately, as in runtime.gogo. + // + // Because runtime.gogo is the only instance, we check whether the PC lies + // within that function, and if so, not ask for a traceback. This approach + // requires knowing the size of the runtime.gogo function, which we + // record in arch_*.h and check in runtime_test.go. + // + // There is another apparently viable approach, recorded here in case + // the "PC within runtime.gogo" check turns out not to be usable. + // It would be possible to delay the update of either g or SP until immediately + // before the PC update instruction. Then, because of the stack bounds check, + // the only problematic interrupt point is just before that PC update instruction, + // and the sigprof handler can detect that instruction and simulate stepping past + // it in order to reach a consistent state. On ARM, the update of g must be made + // in two places (in R10 and also in a TLS slot), so the delayed update would + // need to be the SP update. The sigprof handler must read the instruction at + // the current PC and if it was the known instruction (for example, JMP BX or + // MOV R2, PC), use that other register in place of the PC value. + // The biggest drawback to this solution is that it requires that we can tell + // whether it's safe to read from the memory pointed at by PC. + // In a correct program, we can test PC == nil and otherwise read, + // but if a profiling signal happens at the instant that a program executes + // a bad jump (before the program manages to handle the resulting fault) + // the profiling handler could fault trying to read nonexistent memory. + // + // To recap, there are no constraints on the assembly being used for the + // transition. We simply require that g and SP match and that the PC is not + // in runtime.gogo. + traceback = true; + if(gp == nil || gp != mp->curg || + (uintptr)sp < gp->stack.lo || gp->stack.hi < (uintptr)sp || + ((uint8*)runtime·gogo <= pc && pc < (uint8*)runtime·gogo + RuntimeGogoBytes)) + traceback = false; + + n = 0; + if(traceback) + n = runtime·gentraceback((uintptr)pc, (uintptr)sp, (uintptr)lr, gp, 0, stk, nelem(stk), nil, nil, TraceTrap); + if(!traceback || n <= 0) { + // Normal traceback is impossible or has failed. + // See if it falls into several common cases. + n = 0; + if(mp->ncgo > 0 && mp->curg != nil && + mp->curg->syscallpc != 0 && mp->curg->syscallsp != 0) { + // Cgo, we can't unwind and symbolize arbitrary C code, + // so instead collect Go stack that leads to the cgo call. + // This is especially important on windows, since all syscalls are cgo calls. + n = runtime·gentraceback(mp->curg->syscallpc, mp->curg->syscallsp, 0, mp->curg, 0, stk, nelem(stk), nil, nil, 0); + } +#ifdef GOOS_windows + if(n == 0 && mp->libcallg != nil && mp->libcallpc != 0 && mp->libcallsp != 0) { + // Libcall, i.e. runtime syscall on windows. + // Collect Go stack that leads to the call. + n = runtime·gentraceback(mp->libcallpc, mp->libcallsp, 0, mp->libcallg, 0, stk, nelem(stk), nil, nil, 0); + } +#endif + if(n == 0) { + // If all of the above has failed, account it against abstract "System" or "GC". + n = 2; + // "ExternalCode" is better than "etext". 
+ if((uintptr)pc > (uintptr)runtime·etext) + pc = (byte*)ExternalCode + PCQuantum; + stk[0] = (uintptr)pc; + if(mp->gcing || mp->helpgc) + stk[1] = (uintptr)GC + PCQuantum; + else + stk[1] = (uintptr)System + PCQuantum; + } + } + + if(prof.hz != 0) { + // Simple cas-lock to coordinate with setcpuprofilerate. + while(!runtime·cas(&prof.lock, 0, 1)) + runtime·osyield(); + if(prof.hz != 0) + runtime·cpuproftick(stk, n); + runtime·atomicstore(&prof.lock, 0); + } + mp->mallocing--; +} + +// Arrange to call fn with a traceback hz times a second. +void +runtime·setcpuprofilerate_m(void) +{ + int32 hz; + + hz = g->m->scalararg[0]; + g->m->scalararg[0] = 0; + + // Force sane arguments. + if(hz < 0) + hz = 0; + + // Disable preemption, otherwise we can be rescheduled to another thread + // that has profiling enabled. + g->m->locks++; + + // Stop profiler on this thread so that it is safe to lock prof. + // if a profiling signal came in while we had prof locked, + // it would deadlock. + runtime·resetcpuprofiler(0); + + while(!runtime·cas(&prof.lock, 0, 1)) + runtime·osyield(); + prof.hz = hz; + runtime·atomicstore(&prof.lock, 0); + + runtime·lock(&runtime·sched.lock); + runtime·sched.profilehz = hz; + runtime·unlock(&runtime·sched.lock); + + if(hz != 0) + runtime·resetcpuprofiler(hz); + + g->m->locks--; +} + +P *runtime·newP(void); + +// Change number of processors. The world is stopped, sched is locked. +static void +procresize(int32 new) +{ + int32 i, old; + bool empty; + G *gp; + P *p; + + old = runtime·gomaxprocs; + if(old < 0 || old > MaxGomaxprocs || new <= 0 || new >MaxGomaxprocs) + runtime·throw("procresize: invalid arg"); + // initialize new P's + for(i = 0; i < new; i++) { + p = runtime·allp[i]; + if(p == nil) { + p = runtime·newP(); + p->id = i; + p->status = Pgcstop; + runtime·atomicstorep(&runtime·allp[i], p); + } + if(p->mcache == nil) { + if(old==0 && i==0) + p->mcache = g->m->mcache; // bootstrap + else + p->mcache = runtime·allocmcache(); + } + } + + // redistribute runnable G's evenly + // collect all runnable goroutines in global queue preserving FIFO order + // FIFO order is required to ensure fairness even during frequent GCs + // see http://golang.org/issue/7126 + empty = false; + while(!empty) { + empty = true; + for(i = 0; i < old; i++) { + p = runtime·allp[i]; + if(p->runqhead == p->runqtail) + continue; + empty = false; + // pop from tail of local queue + p->runqtail--; + gp = p->runq[p->runqtail%nelem(p->runq)]; + // push onto head of global queue + gp->schedlink = runtime·sched.runqhead; + runtime·sched.runqhead = gp; + if(runtime·sched.runqtail == nil) + runtime·sched.runqtail = gp; + runtime·sched.runqsize++; + } + } + // fill local queues with at most nelem(p->runq)/2 goroutines + // start at 1 because current M already executes some G and will acquire allp[0] below, + // so if we have a spare G we want to put it into allp[1]. 
+ for(i = 1; i < new * nelem(p->runq)/2 && runtime·sched.runqsize > 0; i++) { + gp = runtime·sched.runqhead; + runtime·sched.runqhead = gp->schedlink; + if(runtime·sched.runqhead == nil) + runtime·sched.runqtail = nil; + runtime·sched.runqsize--; + runqput(runtime·allp[i%new], gp); + } + + // free unused P's + for(i = new; i < old; i++) { + p = runtime·allp[i]; + runtime·freemcache(p->mcache); + p->mcache = nil; + gfpurge(p); + p->status = Pdead; + // can't free P itself because it can be referenced by an M in syscall + } + + if(g->m->p) + g->m->p->m = nil; + g->m->p = nil; + g->m->mcache = nil; + p = runtime·allp[0]; + p->m = nil; + p->status = Pidle; + acquirep(p); + for(i = new-1; i > 0; i--) { + p = runtime·allp[i]; + p->status = Pidle; + pidleput(p); + } + runtime·atomicstore((uint32*)&runtime·gomaxprocs, new); +} + +// Associate p and the current m. +static void +acquirep(P *p) +{ + if(g->m->p || g->m->mcache) + runtime·throw("acquirep: already in go"); + if(p->m || p->status != Pidle) { + runtime·printf("acquirep: p->m=%p(%d) p->status=%d\n", p->m, p->m ? p->m->id : 0, p->status); + runtime·throw("acquirep: invalid p state"); + } + g->m->mcache = p->mcache; + g->m->p = p; + p->m = g->m; + p->status = Prunning; +} + +// Disassociate p and the current m. +static P* +releasep(void) +{ + P *p; + + if(g->m->p == nil || g->m->mcache == nil) + runtime·throw("releasep: invalid arg"); + p = g->m->p; + if(p->m != g->m || p->mcache != g->m->mcache || p->status != Prunning) { + runtime·printf("releasep: m=%p m->p=%p p->m=%p m->mcache=%p p->mcache=%p p->status=%d\n", + g->m, g->m->p, p->m, g->m->mcache, p->mcache, p->status); + runtime·throw("releasep: invalid p state"); + } + g->m->p = nil; + g->m->mcache = nil; + p->m = nil; + p->status = Pidle; + return p; +} + +static void +incidlelocked(int32 v) +{ + runtime·lock(&runtime·sched.lock); + runtime·sched.nmidlelocked += v; + if(v > 0) + checkdead(); + runtime·unlock(&runtime·sched.lock); +} + +// Check for deadlock situation. +// The check is based on number of running M's, if 0 -> deadlock. +static void +checkdead(void) +{ + G *gp; + P *p; + M *mp; + int32 run, grunning, s; + uintptr i; + + // -1 for sysmon + run = runtime·sched.mcount - runtime·sched.nmidle - runtime·sched.nmidlelocked - 1; + if(run > 0) + return; + // If we are dying because of a signal caught on an already idle thread, + // freezetheworld will cause all running threads to block. + // And runtime will essentially enter into deadlock state, + // except that there is a thread that will call runtime·exit soon. + if(runtime·panicking > 0) + return; + if(run < 0) { + runtime·printf("runtime: checkdead: nmidle=%d nmidlelocked=%d mcount=%d\n", + runtime·sched.nmidle, runtime·sched.nmidlelocked, runtime·sched.mcount); + runtime·throw("checkdead: inconsistent counts"); + } + grunning = 0; + runtime·lock(&runtime·allglock); + for(i = 0; i < runtime·allglen; i++) { + gp = runtime·allg[i]; + if(gp->issystem) + continue; + s = runtime·readgstatus(gp); + switch(s&~Gscan) { + case Gwaiting: + grunning++; + break; + case Grunnable: + case Grunning: + case Gsyscall: + runtime·unlock(&runtime·allglock); + runtime·printf("runtime: checkdead: find g %D in status %d\n", gp->goid, s); + runtime·throw("checkdead: runnable g"); + break; + } + } + runtime·unlock(&runtime·allglock); + if(grunning == 0) // possible if main goroutine calls runtime·Goexit() + runtime·throw("no goroutines (main called runtime.Goexit) - deadlock!"); + + // Maybe jump time forward for playground. 
+ if((gp = runtime·timejump()) != nil) { + runtime·casgstatus(gp, Gwaiting, Grunnable); + globrunqput(gp); + p = pidleget(); + if(p == nil) + runtime·throw("checkdead: no p for timer"); + mp = mget(); + if(mp == nil) + newm(nil, p); + else { + mp->nextp = p; + runtime·notewakeup(&mp->park); + } + return; + } + + g->m->throwing = -1; // do not dump full stacks + runtime·throw("all goroutines are asleep - deadlock!"); +} + +static void +sysmon(void) +{ + uint32 idle, delay, nscavenge; + int64 now, unixnow, lastpoll, lasttrace, lastgc; + int64 forcegcperiod, scavengelimit, lastscavenge, maxsleep; + G *gp; + + // If we go two minutes without a garbage collection, force one to run. + forcegcperiod = 2*60*1e9; + // If a heap span goes unused for 5 minutes after a garbage collection, + // we hand it back to the operating system. + scavengelimit = 5*60*1e9; + if(runtime·debug.scavenge > 0) { + // Scavenge-a-lot for testing. + forcegcperiod = 10*1e6; + scavengelimit = 20*1e6; + } + lastscavenge = runtime·nanotime(); + nscavenge = 0; + // Make wake-up period small enough for the sampling to be correct. + maxsleep = forcegcperiod/2; + if(scavengelimit < forcegcperiod) + maxsleep = scavengelimit/2; + + lasttrace = 0; + idle = 0; // how many cycles in succession we had not wokeup somebody + delay = 0; + for(;;) { + if(idle == 0) // start with 20us sleep... + delay = 20; + else if(idle > 50) // start doubling the sleep after 1ms... + delay *= 2; + if(delay > 10*1000) // up to 10ms + delay = 10*1000; + runtime·usleep(delay); + if(runtime·debug.schedtrace <= 0 && + (runtime·sched.gcwaiting || runtime·atomicload(&runtime·sched.npidle) == runtime·gomaxprocs)) { // TODO: fast atomic + runtime·lock(&runtime·sched.lock); + if(runtime·atomicload(&runtime·sched.gcwaiting) || runtime·atomicload(&runtime·sched.npidle) == runtime·gomaxprocs) { + runtime·atomicstore(&runtime·sched.sysmonwait, 1); + runtime·unlock(&runtime·sched.lock); + runtime·notetsleep(&runtime·sched.sysmonnote, maxsleep); + runtime·lock(&runtime·sched.lock); + runtime·atomicstore(&runtime·sched.sysmonwait, 0); + runtime·noteclear(&runtime·sched.sysmonnote); + idle = 0; + delay = 20; + } + runtime·unlock(&runtime·sched.lock); + } + // poll network if not polled for more than 10ms + lastpoll = runtime·atomicload64(&runtime·sched.lastpoll); + now = runtime·nanotime(); + unixnow = runtime·unixnanotime(); + if(lastpoll != 0 && lastpoll + 10*1000*1000 < now) { + runtime·cas64(&runtime·sched.lastpoll, lastpoll, now); + gp = runtime·netpoll(false); // non-blocking + if(gp) { + // Need to decrement number of idle locked M's + // (pretending that one more is running) before injectglist. + // Otherwise it can lead to the following situation: + // injectglist grabs all P's but before it starts M's to run the P's, + // another M returns from syscall, finishes running its G, + // observes that there is no work to do and no other running M's + // and reports deadlock. 
+ incidlelocked(-1); + injectglist(gp); + incidlelocked(1); + } + } + // retake P's blocked in syscalls + // and preempt long running G's + if(retake(now)) + idle = 0; + else + idle++; + + // check if we need to force a GC + lastgc = runtime·atomicload64(&mstats.last_gc); + if(lastgc != 0 && unixnow - lastgc > forcegcperiod && runtime·atomicload(&runtime·forcegc.idle)) { + runtime·lock(&runtime·forcegc.lock); + runtime·forcegc.idle = 0; + runtime·forcegc.g->schedlink = nil; + injectglist(runtime·forcegc.g); + runtime·unlock(&runtime·forcegc.lock); + } + + // scavenge heap once in a while + if(lastscavenge + scavengelimit/2 < now) { + runtime·MHeap_Scavenge(nscavenge, now, scavengelimit); + lastscavenge = now; + nscavenge++; + } + + if(runtime·debug.schedtrace > 0 && lasttrace + runtime·debug.schedtrace*1000000ll <= now) { + lasttrace = now; + runtime·schedtrace(runtime·debug.scheddetail); + } + } +} + +typedef struct Pdesc Pdesc; +struct Pdesc +{ + uint32 schedtick; + int64 schedwhen; + uint32 syscalltick; + int64 syscallwhen; +}; +#pragma dataflag NOPTR +static Pdesc pdesc[MaxGomaxprocs]; + +static uint32 +retake(int64 now) +{ + uint32 i, s, n; + int64 t; + P *p; + Pdesc *pd; + + n = 0; + for(i = 0; i < runtime·gomaxprocs; i++) { + p = runtime·allp[i]; + if(p==nil) + continue; + pd = &pdesc[i]; + s = p->status; + if(s == Psyscall) { + // Retake P from syscall if it's there for more than 1 sysmon tick (at least 20us). + t = p->syscalltick; + if(pd->syscalltick != t) { + pd->syscalltick = t; + pd->syscallwhen = now; + continue; + } + // On the one hand we don't want to retake Ps if there is no other work to do, + // but on the other hand we want to retake them eventually + // because they can prevent the sysmon thread from deep sleep. + if(p->runqhead == p->runqtail && + runtime·atomicload(&runtime·sched.nmspinning) + runtime·atomicload(&runtime·sched.npidle) > 0 && + pd->syscallwhen + 10*1000*1000 > now) + continue; + // Need to decrement number of idle locked M's + // (pretending that one more is running) before the CAS. + // Otherwise the M from which we retake can exit the syscall, + // increment nmidle and report deadlock. + incidlelocked(-1); + if(runtime·cas(&p->status, s, Pidle)) { + n++; + handoffp(p); + } + incidlelocked(1); + } else if(s == Prunning) { + // Preempt G if it's running for more than 10ms. + t = p->schedtick; + if(pd->schedtick != t) { + pd->schedtick = t; + pd->schedwhen = now; + continue; + } + if(pd->schedwhen + 10*1000*1000 > now) + continue; + preemptone(p); + } + } + return n; +} + +// Tell all goroutines that they have been preempted and they should stop. +// This function is purely best-effort. It can fail to inform a goroutine if a +// processor just started running it. +// No locks need to be held. +// Returns true if preemption request was issued to at least one goroutine. +static bool +preemptall(void) +{ + P *p; + int32 i; + bool res; + + res = false; + for(i = 0; i < runtime·gomaxprocs; i++) { + p = runtime·allp[i]; + if(p == nil || p->status != Prunning) + continue; + res |= preemptone(p); + } + return res; +} + +// Tell the goroutine running on processor P to stop. +// This function is purely best-effort. It can incorrectly fail to inform the +// goroutine. It can send inform the wrong goroutine. Even if it informs the +// correct goroutine, that goroutine might ignore the request if it is +// simultaneously executing runtime·newstack. +// No lock needs to be held. +// Returns true if preemption request was issued. 
+// The actual preemption will happen at some point in the future +// and will be indicated by the gp->status no longer being +// Grunning +static bool +preemptone(P *p) +{ + M *mp; + G *gp; + + mp = p->m; + if(mp == nil || mp == g->m) + return false; + gp = mp->curg; + if(gp == nil || gp == mp->g0) + return false; + gp->preempt = true; + // Every call in a go routine checks for stack overflow by + // comparing the current stack pointer to gp->stackguard0. + // Setting gp->stackguard0 to StackPreempt folds + // preemption into the normal stack overflow check. + gp->stackguard0 = StackPreempt; + return true; +} + +void +runtime·schedtrace(bool detailed) +{ + static int64 starttime; + int64 now; + int64 id1, id2, id3; + int32 i, t, h; + uintptr gi; + int8 *fmt; + M *mp, *lockedm; + G *gp, *lockedg; + P *p; + + now = runtime·nanotime(); + if(starttime == 0) + starttime = now; + + runtime·lock(&runtime·sched.lock); + runtime·printf("SCHED %Dms: gomaxprocs=%d idleprocs=%d threads=%d spinningthreads=%d idlethreads=%d runqueue=%d", + (now-starttime)/1000000, runtime·gomaxprocs, runtime·sched.npidle, runtime·sched.mcount, + runtime·sched.nmspinning, runtime·sched.nmidle, runtime·sched.runqsize); + if(detailed) { + runtime·printf(" gcwaiting=%d nmidlelocked=%d stopwait=%d sysmonwait=%d\n", + runtime·sched.gcwaiting, runtime·sched.nmidlelocked, + runtime·sched.stopwait, runtime·sched.sysmonwait); + } + // We must be careful while reading data from P's, M's and G's. + // Even if we hold schedlock, most data can be changed concurrently. + // E.g. (p->m ? p->m->id : -1) can crash if p->m changes from non-nil to nil. + for(i = 0; i < runtime·gomaxprocs; i++) { + p = runtime·allp[i]; + if(p == nil) + continue; + mp = p->m; + h = runtime·atomicload(&p->runqhead); + t = runtime·atomicload(&p->runqtail); + if(detailed) + runtime·printf(" P%d: status=%d schedtick=%d syscalltick=%d m=%d runqsize=%d gfreecnt=%d\n", + i, p->status, p->schedtick, p->syscalltick, mp ? mp->id : -1, t-h, p->gfreecnt); + else { + // In non-detailed mode format lengths of per-P run queues as: + // [len1 len2 len3 len4] + fmt = " %d"; + if(runtime·gomaxprocs == 1) + fmt = " [%d]\n"; + else if(i == 0) + fmt = " [%d"; + else if(i == runtime·gomaxprocs-1) + fmt = " %d]\n"; + runtime·printf(fmt, t-h); + } + } + if(!detailed) { + runtime·unlock(&runtime·sched.lock); + return; + } + for(mp = runtime·allm; mp; mp = mp->alllink) { + p = mp->p; + gp = mp->curg; + lockedg = mp->lockedg; + id1 = -1; + if(p) + id1 = p->id; + id2 = -1; + if(gp) + id2 = gp->goid; + id3 = -1; + if(lockedg) + id3 = lockedg->goid; + runtime·printf(" M%d: p=%D curg=%D mallocing=%d throwing=%d gcing=%d" + " locks=%d dying=%d helpgc=%d spinning=%d blocked=%d lockedg=%D\n", + mp->id, id1, id2, + mp->mallocing, mp->throwing, mp->gcing, mp->locks, mp->dying, mp->helpgc, + mp->spinning, g->m->blocked, id3); + } + runtime·lock(&runtime·allglock); + for(gi = 0; gi < runtime·allglen; gi++) { + gp = runtime·allg[gi]; + mp = gp->m; + lockedm = gp->lockedm; + runtime·printf(" G%D: status=%d(%S) m=%d lockedm=%d\n", + gp->goid, runtime·readgstatus(gp), gp->waitreason, mp ? mp->id : -1, + lockedm ? lockedm->id : -1); + } + runtime·unlock(&runtime·allglock); + runtime·unlock(&runtime·sched.lock); +} + +// Put mp on midle list. +// Sched must be locked. +static void +mput(M *mp) +{ + mp->schedlink = runtime·sched.midle; + runtime·sched.midle = mp; + runtime·sched.nmidle++; + checkdead(); +} + +// Try to get an m from midle list. +// Sched must be locked. 
+static M* +mget(void) +{ + M *mp; + + if((mp = runtime·sched.midle) != nil){ + runtime·sched.midle = mp->schedlink; + runtime·sched.nmidle--; + } + return mp; +} + +// Put gp on the global runnable queue. +// Sched must be locked. +static void +globrunqput(G *gp) +{ + gp->schedlink = nil; + if(runtime·sched.runqtail) + runtime·sched.runqtail->schedlink = gp; + else + runtime·sched.runqhead = gp; + runtime·sched.runqtail = gp; + runtime·sched.runqsize++; +} + +// Put a batch of runnable goroutines on the global runnable queue. +// Sched must be locked. +static void +globrunqputbatch(G *ghead, G *gtail, int32 n) +{ + gtail->schedlink = nil; + if(runtime·sched.runqtail) + runtime·sched.runqtail->schedlink = ghead; + else + runtime·sched.runqhead = ghead; + runtime·sched.runqtail = gtail; + runtime·sched.runqsize += n; +} + +// Try get a batch of G's from the global runnable queue. +// Sched must be locked. +static G* +globrunqget(P *p, int32 max) +{ + G *gp, *gp1; + int32 n; + + if(runtime·sched.runqsize == 0) + return nil; + n = runtime·sched.runqsize/runtime·gomaxprocs+1; + if(n > runtime·sched.runqsize) + n = runtime·sched.runqsize; + if(max > 0 && n > max) + n = max; + if(n > nelem(p->runq)/2) + n = nelem(p->runq)/2; + runtime·sched.runqsize -= n; + if(runtime·sched.runqsize == 0) + runtime·sched.runqtail = nil; + gp = runtime·sched.runqhead; + runtime·sched.runqhead = gp->schedlink; + n--; + while(n--) { + gp1 = runtime·sched.runqhead; + runtime·sched.runqhead = gp1->schedlink; + runqput(p, gp1); + } + return gp; +} + +// Put p to on pidle list. +// Sched must be locked. +static void +pidleput(P *p) +{ + p->link = runtime·sched.pidle; + runtime·sched.pidle = p; + runtime·xadd(&runtime·sched.npidle, 1); // TODO: fast atomic +} + +// Try get a p from pidle list. +// Sched must be locked. +static P* +pidleget(void) +{ + P *p; + + p = runtime·sched.pidle; + if(p) { + runtime·sched.pidle = p->link; + runtime·xadd(&runtime·sched.npidle, -1); // TODO: fast atomic + } + return p; +} + +// Try to put g on local runnable queue. +// If it's full, put onto global queue. +// Executed only by the owner P. +static void +runqput(P *p, G *gp) +{ + uint32 h, t; + +retry: + h = runtime·atomicload(&p->runqhead); // load-acquire, synchronize with consumers + t = p->runqtail; + if(t - h < nelem(p->runq)) { + p->runq[t%nelem(p->runq)] = gp; + runtime·atomicstore(&p->runqtail, t+1); // store-release, makes the item available for consumption + return; + } + if(runqputslow(p, gp, h, t)) + return; + // the queue is not full, now the put above must suceed + goto retry; +} + +// Put g and a batch of work from local runnable queue on global queue. +// Executed only by the owner P. +static bool +runqputslow(P *p, G *gp, uint32 h, uint32 t) +{ + G *batch[nelem(p->runq)/2+1]; + uint32 n, i; + + // First, grab a batch from local queue. + n = t-h; + n = n/2; + if(n != nelem(p->runq)/2) + runtime·throw("runqputslow: queue is not full"); + for(i=0; i<n; i++) + batch[i] = p->runq[(h+i)%nelem(p->runq)]; + if(!runtime·cas(&p->runqhead, h, h+n)) // cas-release, commits consume + return false; + batch[n] = gp; + // Link the goroutines. + for(i=0; i<n; i++) + batch[i]->schedlink = batch[i+1]; + // Now put the batch on global queue. + runtime·lock(&runtime·sched.lock); + globrunqputbatch(batch[0], batch[n], n+1); + runtime·unlock(&runtime·sched.lock); + return true; +} + +// Get g from local runnable queue. +// Executed only by the owner P. 
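Editorial aside: runqput above and runqget/runqgrab below implement the per-P run queue as a fixed-size ring indexed by free-running 32-bit head and tail counters: the owning P is the only writer of tail, so it publishes new items with a plain atomic store, any P may consume by CAS-ing head forward, and t-h is the queue length even across counter wraparound. The sketch below reproduces that shape for plain ints; it is a simplified model with invented names (no spill to a global queue when full), not the runtime code. The runqget implementation introduced by the comment above follows after it.

package main

import (
	"fmt"
	"sync/atomic"
)

const ringSize = 8 // the runtime uses len(p.runq); any power of two behaves the same

// ring is a single-producer, multi-consumer bounded queue in the style of
// the per-P runq: head and tail are free-running counters, and counter c
// lives in slot c % ringSize.
type ring struct {
	head atomic.Uint32 // advanced by any consumer via CAS
	tail atomic.Uint32 // advanced only by the owner
	buf  [ringSize]int
}

// put is the owner-only fast path of runqput. It reports false when the
// ring is full (the runtime would then push half the queue onto the
// global run queue via runqputslow).
func (r *ring) put(v int) bool {
	h := r.head.Load() // don't overwrite slots that consumers may still claim
	t := r.tail.Load()
	if t-h >= ringSize {
		return false
	}
	r.buf[t%ringSize] = v
	r.tail.Store(t + 1) // publish: the slot is now visible to consumers
	return true
}

// get mirrors runqget: read the head slot, then try to commit by moving
// head forward. If the CAS loses, the value just read is discarded and we
// retry, which is why the racy slot read is harmless for word-sized items.
func (r *ring) get() (int, bool) {
	for {
		h := r.head.Load()
		t := r.tail.Load()
		if t == h {
			return 0, false
		}
		v := r.buf[h%ringSize]
		if r.head.CompareAndSwap(h, h+1) {
			return v, true
		}
	}
}

func main() {
	var r ring
	for i := 1; i <= 10; i++ {
		if !r.put(i) {
			fmt.Println("full at", i) // the ring holds at most ringSize items
		}
	}
	for {
		v, ok := r.get()
		if !ok {
			break
		}
		fmt.Print(v, " ")
	}
	fmt.Println()
}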
+static G* +runqget(P *p) +{ + G *gp; + uint32 t, h; + + for(;;) { + h = runtime·atomicload(&p->runqhead); // load-acquire, synchronize with other consumers + t = p->runqtail; + if(t == h) + return nil; + gp = p->runq[h%nelem(p->runq)]; + if(runtime·cas(&p->runqhead, h, h+1)) // cas-release, commits consume + return gp; + } +} + +// Grabs a batch of goroutines from local runnable queue. +// batch array must be of size nelem(p->runq)/2. Returns number of grabbed goroutines. +// Can be executed by any P. +static uint32 +runqgrab(P *p, G **batch) +{ + uint32 t, h, n, i; + + for(;;) { + h = runtime·atomicload(&p->runqhead); // load-acquire, synchronize with other consumers + t = runtime·atomicload(&p->runqtail); // load-acquire, synchronize with the producer + n = t-h; + n = n - n/2; + if(n == 0) + break; + if(n > nelem(p->runq)/2) // read inconsistent h and t + continue; + for(i=0; i<n; i++) + batch[i] = p->runq[(h+i)%nelem(p->runq)]; + if(runtime·cas(&p->runqhead, h, h+n)) // cas-release, commits consume + break; + } + return n; +} + +// Steal half of elements from local runnable queue of p2 +// and put onto local runnable queue of p. +// Returns one of the stolen elements (or nil if failed). +static G* +runqsteal(P *p, P *p2) +{ + G *gp; + G *batch[nelem(p->runq)/2]; + uint32 t, h, n, i; + + n = runqgrab(p2, batch); + if(n == 0) + return nil; + n--; + gp = batch[n]; + if(n == 0) + return gp; + h = runtime·atomicload(&p->runqhead); // load-acquire, synchronize with consumers + t = p->runqtail; + if(t - h + n >= nelem(p->runq)) + runtime·throw("runqsteal: runq overflow"); + for(i=0; i<n; i++, t++) + p->runq[t%nelem(p->runq)] = batch[i]; + runtime·atomicstore(&p->runqtail, t); // store-release, makes the item available for consumption + return gp; +} + +void +runtime·testSchedLocalQueue(void) +{ + P *p; + G *gs; + int32 i, j; + + p = (P*)runtime·mallocgc(sizeof(*p), nil, FlagNoScan); + gs = (G*)runtime·mallocgc(nelem(p->runq)*sizeof(*gs), nil, FlagNoScan); + + for(i = 0; i < nelem(p->runq); i++) { + if(runqget(p) != nil) + runtime·throw("runq is not empty initially"); + for(j = 0; j < i; j++) + runqput(p, &gs[i]); + for(j = 0; j < i; j++) { + if(runqget(p) != &gs[i]) { + runtime·printf("bad element at iter %d/%d\n", i, j); + runtime·throw("bad element"); + } + } + if(runqget(p) != nil) + runtime·throw("runq is not empty afterwards"); + } +} + +void +runtime·testSchedLocalQueueSteal(void) +{ + P *p1, *p2; + G *gs, *gp; + int32 i, j, s; + + p1 = (P*)runtime·mallocgc(sizeof(*p1), nil, FlagNoScan); + p2 = (P*)runtime·mallocgc(sizeof(*p2), nil, FlagNoScan); + gs = (G*)runtime·mallocgc(nelem(p1->runq)*sizeof(*gs), nil, FlagNoScan); + + for(i = 0; i < nelem(p1->runq); i++) { + for(j = 0; j < i; j++) { + gs[j].sig = 0; + runqput(p1, &gs[j]); + } + gp = runqsteal(p2, p1); + s = 0; + if(gp) { + s++; + gp->sig++; + } + while(gp = runqget(p2)) { + s++; + gp->sig++; + } + while(gp = runqget(p1)) + gp->sig++; + for(j = 0; j < i; j++) { + if(gs[j].sig != 1) { + runtime·printf("bad element %d(%d) at iter %d\n", j, gs[j].sig, i); + runtime·throw("bad element"); + } + } + if(s != i/2 && s != i/2+1) { + runtime·printf("bad steal %d, want %d or %d, iter %d\n", + s, i/2, i/2+1, i); + runtime·throw("bad steal"); + } + } +} + +void +runtime·setmaxthreads_m(void) +{ + int32 in; + int32 out; + + in = g->m->scalararg[0]; + + runtime·lock(&runtime·sched.lock); + out = runtime·sched.maxmcount; + runtime·sched.maxmcount = in; + checkmcount(); + runtime·unlock(&runtime·sched.lock); + + g->m->scalararg[0] = out; +} + 
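Editorial aside: the next block scans the compiled-in GOEXPERIMENT string by hand, one character at a time, because it runs in the C half of the runtime. In ordinary Go the same membership test over a comma-separated list is a few lines, as the sketch below shows; haveExperiment and the sample string are illustrative only, not the runtime's.

package main

import (
	"fmt"
	"strings"
)

// haveExperiment reports whether name appears in a comma-separated
// experiment list, which is what the C haveexperiment() computes for the
// GOEXPERIMENT value baked in at build time.
func haveExperiment(experiments, name string) bool {
	for _, e := range strings.Split(experiments, ",") {
		if e == name {
			return true
		}
	}
	return false
}

func main() {
	const goexperiment = "framepointer,fieldtrack" // illustrative value only
	fmt.Println(haveExperiment(goexperiment, "fieldtrack"))       // true
	fmt.Println(haveExperiment(goexperiment, "preemptibleloops")) // false
}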
+static int8 experiment[] = GOEXPERIMENT; // defined in zaexperiment.h + +static bool +haveexperiment(int8 *name) +{ + int32 i, j; + + for(i=0; i<sizeof(experiment); i++) { + if((i == 0 || experiment[i-1] == ',') && experiment[i] == name[0]) { + for(j=0; name[j]; j++) + if(experiment[i+j] != name[j]) + goto nomatch; + if(experiment[i+j] != '\0' && experiment[i+j] != ',') + goto nomatch; + return 1; + } + nomatch:; + } + return 0; +} + +#pragma textflag NOSPLIT +void +sync·runtime_procPin(intptr p) +{ + M *mp; + + mp = g->m; + // Disable preemption. + mp->locks++; + p = mp->p->id; + FLUSH(&p); +} + +#pragma textflag NOSPLIT +void +sync·runtime_procUnpin() +{ + g->m->locks--; +} diff --git a/src/runtime/proc.go b/src/runtime/proc.go new file mode 100644 index 000000000..517ca03df --- /dev/null +++ b/src/runtime/proc.go @@ -0,0 +1,246 @@ +// Copyright 2014 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package runtime + +import "unsafe" + +func newsysmon() + +func runtime_init() +func main_init() +func main_main() + +// The main goroutine. +func main() { + g := getg() + + // Racectx of m0->g0 is used only as the parent of the main goroutine. + // It must not be used for anything else. + g.m.g0.racectx = 0 + + // Max stack size is 1 GB on 64-bit, 250 MB on 32-bit. + // Using decimal instead of binary GB and MB because + // they look nicer in the stack overflow failure message. + if ptrSize == 8 { + maxstacksize = 1000000000 + } else { + maxstacksize = 250000000 + } + + onM(newsysmon) + + // Lock the main goroutine onto this, the main OS thread, + // during initialization. Most programs won't care, but a few + // do require certain calls to be made by the main thread. + // Those can arrange for main.main to run in the main thread + // by calling runtime.LockOSThread during initialization + // to preserve the lock. + lockOSThread() + + if g.m != &m0 { + gothrow("runtime.main not on m0") + } + + runtime_init() // must be before defer + + // Defer unlock so that runtime.Goexit during init does the unlock too. + needUnlock := true + defer func() { + if needUnlock { + unlockOSThread() + } + }() + + memstats.enablegc = true // now that runtime is initialized, GC is okay + + main_init() + + needUnlock = false + unlockOSThread() + + main_main() + if raceenabled { + racefini() + } + + // Make racy client program work: if panicking on + // another goroutine at the same time as main returns, + // let the other goroutine finish printing the panic trace. + // Once it does, it will exit. See issue 3934. + if panicking != 0 { + gopark(nil, nil, "panicwait") + } + + exit(0) + for { + var x *int32 + *x = 0 + } +} + +var parkunlock_c byte + +// start forcegc helper goroutine +func init() { + go forcegchelper() +} + +func forcegchelper() { + forcegc.g = getg() + forcegc.g.issystem = true + for { + lock(&forcegc.lock) + if forcegc.idle != 0 { + gothrow("forcegc: phase error") + } + atomicstore(&forcegc.idle, 1) + goparkunlock(&forcegc.lock, "force gc (idle)") + // this goroutine is explicitly resumed by sysmon + if debug.gctrace > 0 { + println("GC forced") + } + gogc(1) + } +} + +//go:nosplit + +// Gosched yields the processor, allowing other goroutines to run. It does not +// suspend the current goroutine, so execution resumes automatically. +func Gosched() { + mcall(gosched_m) +} + +// Puts the current goroutine into a waiting state and calls unlockf. +// If unlockf returns false, the goroutine is resumed. 
+func gopark(unlockf unsafe.Pointer, lock unsafe.Pointer, reason string) { + mp := acquirem() + gp := mp.curg + status := readgstatus(gp) + if status != _Grunning && status != _Gscanrunning { + gothrow("gopark: bad g status") + } + mp.waitlock = lock + mp.waitunlockf = unlockf + gp.waitreason = reason + releasem(mp) + // can't do anything that might move the G between Ms here. + mcall(park_m) +} + +// Puts the current goroutine into a waiting state and unlocks the lock. +// The goroutine can be made runnable again by calling goready(gp). +func goparkunlock(lock *mutex, reason string) { + gopark(unsafe.Pointer(&parkunlock_c), unsafe.Pointer(lock), reason) +} + +func goready(gp *g) { + mp := acquirem() + mp.ptrarg[0] = unsafe.Pointer(gp) + onM(ready_m) + releasem(mp) +} + +//go:nosplit +func acquireSudog() *sudog { + c := gomcache() + s := c.sudogcache + if s != nil { + if s.elem != nil { + gothrow("acquireSudog: found s.elem != nil in cache") + } + c.sudogcache = s.next + s.next = nil + return s + } + + // Delicate dance: the semaphore implementation calls + // acquireSudog, acquireSudog calls new(sudog), + // new calls malloc, malloc can call the garbage collector, + // and the garbage collector calls the semaphore implementation + // in stoptheworld. + // Break the cycle by doing acquirem/releasem around new(sudog). + // The acquirem/releasem increments m.locks during new(sudog), + // which keeps the garbage collector from being invoked. + mp := acquirem() + p := new(sudog) + releasem(mp) + return p +} + +//go:nosplit +func releaseSudog(s *sudog) { + if s.elem != nil { + gothrow("runtime: sudog with non-nil elem") + } + if s.selectdone != nil { + gothrow("runtime: sudog with non-nil selectdone") + } + if s.next != nil { + gothrow("runtime: sudog with non-nil next") + } + if s.prev != nil { + gothrow("runtime: sudog with non-nil prev") + } + if s.waitlink != nil { + gothrow("runtime: sudog with non-nil waitlink") + } + gp := getg() + if gp.param != nil { + gothrow("runtime: releaseSudog with non-nil gp.param") + } + c := gomcache() + s.next = c.sudogcache + c.sudogcache = s +} + +// funcPC returns the entry PC of the function f. +// It assumes that f is a func value. Otherwise the behavior is undefined. +//go:nosplit +func funcPC(f interface{}) uintptr { + return **(**uintptr)(add(unsafe.Pointer(&f), ptrSize)) +} + +// called from assembly +func badmcall(fn func(*g)) { + gothrow("runtime: mcall called on m->g0 stack") +} + +func badmcall2(fn func(*g)) { + gothrow("runtime: mcall function returned") +} + +func badreflectcall() { + panic("runtime: arg size to reflect.call more than 1GB") +} + +func lockedOSThread() bool { + gp := getg() + return gp.lockedm != nil && gp.m.lockedg != nil +} + +func newP() *p { + return new(p) +} + +func newM() *m { + return new(m) +} + +func newG() *g { + return new(g) +} + +func allgadd(gp *g) { + if readgstatus(gp) == _Gidle { + gothrow("allgadd: bad status Gidle") + } + + lock(&allglock) + allgs = append(allgs, gp) + allg = &allgs[0] + allglen = uintptr(len(allgs)) + unlock(&allglock) +} diff --git a/src/runtime/proc_test.go b/src/runtime/proc_test.go new file mode 100644 index 000000000..aa9bc81ac --- /dev/null +++ b/src/runtime/proc_test.go @@ -0,0 +1,480 @@ +// Copyright 2011 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. 
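acquireSudog and releaseSudog above keep a free list on the current M's cache (gomcache().sudogcache) so that channel and semaphore waits reuse sudog records instead of allocating, and possibly triggering the collector, on every operation; releaseSudog's checks enforce that records come back scrubbed. As a hedged, stand-alone illustration of that free-list pattern only — the node and nodeCache names below are invented for this note and are not runtime types, and the cache is deliberately single-threaded where the runtime instead keeps one cache per M:

package main

import "fmt"

// node stands in for a reusable wait record, loosely analogous to a sudog.
type node struct {
	val  int
	next *node
}

// nodeCache is a LIFO free list, loosely analogous to sudogcache.
// It is not safe for concurrent use; the runtime sidesteps that by
// giving each M its own cache.
type nodeCache struct {
	free *node
}

// get reuses a cached record if one is available, otherwise allocates.
func (c *nodeCache) get() *node {
	if n := c.free; n != nil {
		c.free = n.next
		n.next = nil // hand out a clean record
		return n
	}
	return new(node)
}

// put returns a record to the cache. Callers are expected to have cleared
// their own fields first, which is what releaseSudog's checks enforce.
func (c *nodeCache) put(n *node) {
	n.val = 0
	n.next = c.free
	c.free = n
}

func main() {
	var c nodeCache
	a := c.get()
	a.val = 42
	c.put(a)
	b := c.get()        // reuses the same allocation
	fmt.Println(a == b) // true
	fmt.Println(b.val)  // 0, scrubbed by put
}

The runtime's version additionally wraps the fallback allocation in acquirem/releasem, per the "delicate dance" comment above, which this sketch drops.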
+ +package runtime_test + +import ( + "math" + "runtime" + "sync/atomic" + "syscall" + "testing" + "time" +) + +var stop = make(chan bool, 1) + +func perpetuumMobile() { + select { + case <-stop: + default: + go perpetuumMobile() + } +} + +func TestStopTheWorldDeadlock(t *testing.T) { + if testing.Short() { + t.Skip("skipping during short test") + } + maxprocs := runtime.GOMAXPROCS(3) + compl := make(chan bool, 2) + go func() { + for i := 0; i != 1000; i += 1 { + runtime.GC() + } + compl <- true + }() + go func() { + for i := 0; i != 1000; i += 1 { + runtime.GOMAXPROCS(3) + } + compl <- true + }() + go perpetuumMobile() + <-compl + <-compl + stop <- true + runtime.GOMAXPROCS(maxprocs) +} + +func TestYieldProgress(t *testing.T) { + testYieldProgress(t, false) +} + +func TestYieldLockedProgress(t *testing.T) { + testYieldProgress(t, true) +} + +func testYieldProgress(t *testing.T, locked bool) { + c := make(chan bool) + cack := make(chan bool) + go func() { + if locked { + runtime.LockOSThread() + } + for { + select { + case <-c: + cack <- true + return + default: + runtime.Gosched() + } + } + }() + time.Sleep(10 * time.Millisecond) + c <- true + <-cack +} + +func TestYieldLocked(t *testing.T) { + const N = 10 + c := make(chan bool) + go func() { + runtime.LockOSThread() + for i := 0; i < N; i++ { + runtime.Gosched() + time.Sleep(time.Millisecond) + } + c <- true + // runtime.UnlockOSThread() is deliberately omitted + }() + <-c +} + +func TestGoroutineParallelism(t *testing.T) { + P := 4 + N := 10 + if testing.Short() { + P = 3 + N = 3 + } + defer runtime.GOMAXPROCS(runtime.GOMAXPROCS(P)) + // If runtime triggers a forced GC during this test then it will deadlock, + // since the goroutines can't be stopped/preempted. + // So give this test as much time as possible. + runtime.GC() + for try := 0; try < N; try++ { + done := make(chan bool) + x := uint32(0) + for p := 0; p < P; p++ { + // Test that all P goroutines are scheduled at the same time + go func(p int) { + for i := 0; i < 3; i++ { + expected := uint32(P*i + p) + for atomic.LoadUint32(&x) != expected { + } + atomic.StoreUint32(&x, expected+1) + } + done <- true + }(p) + } + for p := 0; p < P; p++ { + <-done + } + } +} + +func TestBlockLocked(t *testing.T) { + const N = 10 + c := make(chan bool) + go func() { + runtime.LockOSThread() + for i := 0; i < N; i++ { + c <- true + } + runtime.UnlockOSThread() + }() + for i := 0; i < N; i++ { + <-c + } +} + +func TestTimerFairness(t *testing.T) { + done := make(chan bool) + c := make(chan bool) + for i := 0; i < 2; i++ { + go func() { + for { + select { + case c <- true: + case <-done: + return + } + } + }() + } + + timer := time.After(20 * time.Millisecond) + for { + select { + case <-c: + case <-timer: + close(done) + return + } + } +} + +func TestTimerFairness2(t *testing.T) { + done := make(chan bool) + c := make(chan bool) + for i := 0; i < 2; i++ { + go func() { + timer := time.After(20 * time.Millisecond) + var buf [1]byte + for { + syscall.Read(0, buf[0:0]) + select { + case c <- true: + case <-c: + case <-timer: + done <- true + return + } + } + }() + } + <-done + <-done +} + +// The function is used to test preemption at split stack checks. +// Declaring a var avoids inlining at the call site. +var preempt = func() int { + var a [128]int + sum := 0 + for _, v := range a { + sum += v + } + return sum +} + +func TestPreemption(t *testing.T) { + // Test that goroutines are preempted at function calls. 
+ N := 5 + if testing.Short() { + N = 2 + } + c := make(chan bool) + var x uint32 + for g := 0; g < 2; g++ { + go func(g int) { + for i := 0; i < N; i++ { + for atomic.LoadUint32(&x) != uint32(g) { + preempt() + } + atomic.StoreUint32(&x, uint32(1-g)) + } + c <- true + }(g) + } + <-c + <-c +} + +func TestPreemptionGC(t *testing.T) { + // Test that pending GC preempts running goroutines. + P := 5 + N := 10 + if testing.Short() { + P = 3 + N = 2 + } + defer runtime.GOMAXPROCS(runtime.GOMAXPROCS(P + 1)) + var stop uint32 + for i := 0; i < P; i++ { + go func() { + for atomic.LoadUint32(&stop) == 0 { + preempt() + } + }() + } + for i := 0; i < N; i++ { + runtime.Gosched() + runtime.GC() + } + atomic.StoreUint32(&stop, 1) +} + +func TestGCFairness(t *testing.T) { + output := executeTest(t, testGCFairnessSource, nil) + want := "OK\n" + if output != want { + t.Fatalf("want %s, got %s\n", want, output) + } +} + +const testGCFairnessSource = ` +package main + +import ( + "fmt" + "os" + "runtime" + "time" +) + +func main() { + runtime.GOMAXPROCS(1) + f, err := os.Open("/dev/null") + if os.IsNotExist(err) { + // This test tests what it is intended to test only if writes are fast. + // If there is no /dev/null, we just don't execute the test. + fmt.Println("OK") + return + } + if err != nil { + fmt.Println(err) + os.Exit(1) + } + for i := 0; i < 2; i++ { + go func() { + for { + f.Write([]byte(".")) + } + }() + } + time.Sleep(10 * time.Millisecond) + fmt.Println("OK") +} +` + +func stackGrowthRecursive(i int) { + var pad [128]uint64 + if i != 0 && pad[0] == 0 { + stackGrowthRecursive(i - 1) + } +} + +func TestPreemptSplitBig(t *testing.T) { + if testing.Short() { + t.Skip("skipping in -short mode") + } + defer runtime.GOMAXPROCS(runtime.GOMAXPROCS(2)) + stop := make(chan int) + go big(stop) + for i := 0; i < 3; i++ { + time.Sleep(10 * time.Microsecond) // let big start running + runtime.GC() + } + close(stop) +} + +func big(stop chan int) int { + n := 0 + for { + // delay so that gc is sure to have asked for a preemption + for i := 0; i < 1e9; i++ { + n++ + } + + // call bigframe, which used to miss the preemption in its prologue. + bigframe(stop) + + // check if we've been asked to stop. + select { + case <-stop: + return n + } + } +} + +func bigframe(stop chan int) int { + // not splitting the stack will overflow. + // small will notice that it needs a stack split and will + // catch the overflow. + var x [8192]byte + return small(stop, &x) +} + +func small(stop chan int, x *[8192]byte) int { + for i := range x { + x[i] = byte(i) + } + sum := 0 + for i := range x { + sum += int(x[i]) + } + + // keep small from being a leaf function, which might + // make it not do any stack check at all. 
+ nonleaf(stop) + + return sum +} + +func nonleaf(stop chan int) bool { + // do something that won't be inlined: + select { + case <-stop: + return true + default: + return false + } +} + +func TestSchedLocalQueue(t *testing.T) { + runtime.RunSchedLocalQueueTest() +} + +func TestSchedLocalQueueSteal(t *testing.T) { + runtime.RunSchedLocalQueueStealTest() +} + +func benchmarkStackGrowth(b *testing.B, rec int) { + b.RunParallel(func(pb *testing.PB) { + for pb.Next() { + stackGrowthRecursive(rec) + } + }) +} + +func BenchmarkStackGrowth(b *testing.B) { + benchmarkStackGrowth(b, 10) +} + +func BenchmarkStackGrowthDeep(b *testing.B) { + benchmarkStackGrowth(b, 1024) +} + +func BenchmarkCreateGoroutines(b *testing.B) { + benchmarkCreateGoroutines(b, 1) +} + +func BenchmarkCreateGoroutinesParallel(b *testing.B) { + benchmarkCreateGoroutines(b, runtime.GOMAXPROCS(-1)) +} + +func benchmarkCreateGoroutines(b *testing.B, procs int) { + c := make(chan bool) + var f func(n int) + f = func(n int) { + if n == 0 { + c <- true + return + } + go f(n - 1) + } + for i := 0; i < procs; i++ { + go f(b.N / procs) + } + for i := 0; i < procs; i++ { + <-c + } +} + +type Matrix [][]float64 + +func BenchmarkMatmult(b *testing.B) { + b.StopTimer() + // matmult is O(N**3) but testing expects O(b.N), + // so we need to take cube root of b.N + n := int(math.Cbrt(float64(b.N))) + 1 + A := makeMatrix(n) + B := makeMatrix(n) + C := makeMatrix(n) + b.StartTimer() + matmult(nil, A, B, C, 0, n, 0, n, 0, n, 8) +} + +func makeMatrix(n int) Matrix { + m := make(Matrix, n) + for i := 0; i < n; i++ { + m[i] = make([]float64, n) + for j := 0; j < n; j++ { + m[i][j] = float64(i*n + j) + } + } + return m +} + +func matmult(done chan<- struct{}, A, B, C Matrix, i0, i1, j0, j1, k0, k1, threshold int) { + di := i1 - i0 + dj := j1 - j0 + dk := k1 - k0 + if di >= dj && di >= dk && di >= threshold { + // divide in two by y axis + mi := i0 + di/2 + done1 := make(chan struct{}, 1) + go matmult(done1, A, B, C, i0, mi, j0, j1, k0, k1, threshold) + matmult(nil, A, B, C, mi, i1, j0, j1, k0, k1, threshold) + <-done1 + } else if dj >= dk && dj >= threshold { + // divide in two by x axis + mj := j0 + dj/2 + done1 := make(chan struct{}, 1) + go matmult(done1, A, B, C, i0, i1, j0, mj, k0, k1, threshold) + matmult(nil, A, B, C, i0, i1, mj, j1, k0, k1, threshold) + <-done1 + } else if dk >= threshold { + // divide in two by "k" axis + // deliberately not parallel because of data races + mk := k0 + dk/2 + matmult(nil, A, B, C, i0, i1, j0, j1, k0, mk, threshold) + matmult(nil, A, B, C, i0, i1, j0, j1, mk, k1, threshold) + } else { + // the matrices are small enough, compute directly + for i := i0; i < i1; i++ { + for j := j0; j < j1; j++ { + for k := k0; k < k1; k++ { + C[i][j] += A[i][k] * B[k][j] + } + } + } + } + if done != nil { + done <- struct{}{} + } +} diff --git a/src/runtime/race.c b/src/runtime/race.c new file mode 100644 index 000000000..5b0d11664 --- /dev/null +++ b/src/runtime/race.c @@ -0,0 +1,347 @@ +// Copyright 2011 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// Implementation of the race detector API. +// +build race + +#include "runtime.h" +#include "arch_GOARCH.h" +#include "malloc.h" +#include "race.h" +#include "type.h" +#include "typekind.h" +#include "textflag.h" + +// Race runtime functions called via runtime·racecall. 
+void __tsan_init(void); +void __tsan_fini(void); +void __tsan_map_shadow(void); +void __tsan_finalizer_goroutine(void); +void __tsan_go_start(void); +void __tsan_go_end(void); +void __tsan_malloc(void); +void __tsan_acquire(void); +void __tsan_release(void); +void __tsan_release_merge(void); +void __tsan_go_ignore_sync_begin(void); +void __tsan_go_ignore_sync_end(void); + +// Mimic what cmd/cgo would do. +#pragma cgo_import_static __tsan_init +#pragma cgo_import_static __tsan_fini +#pragma cgo_import_static __tsan_map_shadow +#pragma cgo_import_static __tsan_finalizer_goroutine +#pragma cgo_import_static __tsan_go_start +#pragma cgo_import_static __tsan_go_end +#pragma cgo_import_static __tsan_malloc +#pragma cgo_import_static __tsan_acquire +#pragma cgo_import_static __tsan_release +#pragma cgo_import_static __tsan_release_merge +#pragma cgo_import_static __tsan_go_ignore_sync_begin +#pragma cgo_import_static __tsan_go_ignore_sync_end + +// These are called from race_amd64.s. +#pragma cgo_import_static __tsan_read +#pragma cgo_import_static __tsan_read_pc +#pragma cgo_import_static __tsan_read_range +#pragma cgo_import_static __tsan_write +#pragma cgo_import_static __tsan_write_pc +#pragma cgo_import_static __tsan_write_range +#pragma cgo_import_static __tsan_func_enter +#pragma cgo_import_static __tsan_func_exit + +#pragma cgo_import_static __tsan_go_atomic32_load +#pragma cgo_import_static __tsan_go_atomic64_load +#pragma cgo_import_static __tsan_go_atomic32_store +#pragma cgo_import_static __tsan_go_atomic64_store +#pragma cgo_import_static __tsan_go_atomic32_exchange +#pragma cgo_import_static __tsan_go_atomic64_exchange +#pragma cgo_import_static __tsan_go_atomic32_fetch_add +#pragma cgo_import_static __tsan_go_atomic64_fetch_add +#pragma cgo_import_static __tsan_go_atomic32_compare_exchange +#pragma cgo_import_static __tsan_go_atomic64_compare_exchange + +extern byte runtime·noptrdata[]; +extern byte runtime·enoptrdata[]; +extern byte runtime·data[]; +extern byte runtime·edata[]; +extern byte runtime·bss[]; +extern byte runtime·ebss[]; +extern byte runtime·noptrbss[]; +extern byte runtime·enoptrbss[]; + +// start/end of global data (data+bss). +uintptr runtime·racedatastart; +uintptr runtime·racedataend; +// start/end of heap for race_amd64.s +uintptr runtime·racearenastart; +uintptr runtime·racearenaend; + +void runtime·racefuncenter(void *callpc); +void runtime·racefuncexit(void); +void runtime·racereadrangepc1(void *addr, uintptr sz, void *pc); +void runtime·racewriterangepc1(void *addr, uintptr sz, void *pc); +void runtime·racesymbolizethunk(void*); + +// racecall allows calling an arbitrary function f from C race runtime +// with up to 4 uintptr arguments. +void runtime·racecall(void(*f)(void), ...); + +// checks if the address has shadow (i.e. heap or data/bss) +#pragma textflag NOSPLIT +static bool +isvalidaddr(uintptr addr) +{ + if(addr >= runtime·racearenastart && addr < runtime·racearenaend) + return true; + if(addr >= runtime·racedatastart && addr < runtime·racedataend) + return true; + return false; +} + +#pragma textflag NOSPLIT +uintptr +runtime·raceinit(void) +{ + uintptr racectx, start, end, size; + + // cgo is required to initialize libc, which is used by race runtime + if(!runtime·iscgo) + runtime·throw("raceinit: race build must use cgo"); + runtime·racecall(__tsan_init, &racectx, runtime·racesymbolizethunk); + // Round data segment to page boundaries, because it's used in mmap(). + // The relevant sections are noptrdata, data, bss, noptrbss. 
+ // In external linking mode, there may be other non-Go data mixed in, + // and the sections may even occur out of order. + // Work out a conservative range of addresses. + start = ~(uintptr)0; + end = 0; + if(start > (uintptr)runtime·noptrdata) + start = (uintptr)runtime·noptrdata; + if(start > (uintptr)runtime·data) + start = (uintptr)runtime·data; + if(start > (uintptr)runtime·noptrbss) + start = (uintptr)runtime·noptrbss; + if(start > (uintptr)runtime·bss) + start = (uintptr)runtime·bss; + if(end < (uintptr)runtime·enoptrdata) + end = (uintptr)runtime·enoptrdata; + if(end < (uintptr)runtime·edata) + end = (uintptr)runtime·edata; + if(end < (uintptr)runtime·enoptrbss) + end = (uintptr)runtime·enoptrbss; + if(end < (uintptr)runtime·ebss) + end = (uintptr)runtime·ebss; + start = start & ~(PageSize-1); + size = ROUND(end - start, PageSize); + runtime·racecall(__tsan_map_shadow, start, size); + runtime·racedatastart = start; + runtime·racedataend = start + size; + return racectx; +} + +#pragma textflag NOSPLIT +void +runtime·racefini(void) +{ + runtime·racecall(__tsan_fini); +} + +#pragma textflag NOSPLIT +void +runtime·racemapshadow(void *addr, uintptr size) +{ + if(runtime·racearenastart == 0) + runtime·racearenastart = (uintptr)addr; + if(runtime·racearenaend < (uintptr)addr+size) + runtime·racearenaend = (uintptr)addr+size; + runtime·racecall(__tsan_map_shadow, addr, size); +} + +#pragma textflag NOSPLIT +void +runtime·racemalloc(void *p, uintptr sz) +{ + runtime·racecall(__tsan_malloc, p, sz); +} + +#pragma textflag NOSPLIT +uintptr +runtime·racegostart(void *pc) +{ + uintptr racectx; + G *spawng; + + if(g->m->curg != nil) + spawng = g->m->curg; + else + spawng = g; + + runtime·racecall(__tsan_go_start, spawng->racectx, &racectx, pc); + return racectx; +} + +#pragma textflag NOSPLIT +void +runtime·racegoend(void) +{ + runtime·racecall(__tsan_go_end, g->racectx); +} + +#pragma textflag NOSPLIT +void +runtime·racewriterangepc(void *addr, uintptr sz, void *callpc, void *pc) +{ + if(g != g->m->curg) { + // The call is coming from manual instrumentation of Go code running on g0/gsignal. + // Not interesting. + return; + } + if(callpc != nil) + runtime·racefuncenter(callpc); + runtime·racewriterangepc1(addr, sz, pc); + if(callpc != nil) + runtime·racefuncexit(); +} + +#pragma textflag NOSPLIT +void +runtime·racereadrangepc(void *addr, uintptr sz, void *callpc, void *pc) +{ + if(g != g->m->curg) { + // The call is coming from manual instrumentation of Go code running on g0/gsignal. + // Not interesting. 
+ return; + } + if(callpc != nil) + runtime·racefuncenter(callpc); + runtime·racereadrangepc1(addr, sz, pc); + if(callpc != nil) + runtime·racefuncexit(); +} + +#pragma textflag NOSPLIT +void +runtime·racewriteobjectpc(void *addr, Type *t, void *callpc, void *pc) +{ + uint8 kind; + + kind = t->kind & KindMask; + if(kind == KindArray || kind == KindStruct) + runtime·racewriterangepc(addr, t->size, callpc, pc); + else + runtime·racewritepc(addr, callpc, pc); +} + +#pragma textflag NOSPLIT +void +runtime·racereadobjectpc(void *addr, Type *t, void *callpc, void *pc) +{ + uint8 kind; + + kind = t->kind & KindMask; + if(kind == KindArray || kind == KindStruct) + runtime·racereadrangepc(addr, t->size, callpc, pc); + else + runtime·racereadpc(addr, callpc, pc); +} + +#pragma textflag NOSPLIT +void +runtime·raceacquire(void *addr) +{ + runtime·raceacquireg(g, addr); +} + +#pragma textflag NOSPLIT +void +runtime·raceacquireg(G *gp, void *addr) +{ + if(g->raceignore || !isvalidaddr((uintptr)addr)) + return; + runtime·racecall(__tsan_acquire, gp->racectx, addr); +} + +#pragma textflag NOSPLIT +void +runtime·racerelease(void *addr) +{ + if(g->raceignore || !isvalidaddr((uintptr)addr)) + return; + runtime·racereleaseg(g, addr); +} + +#pragma textflag NOSPLIT +void +runtime·racereleaseg(G *gp, void *addr) +{ + if(g->raceignore || !isvalidaddr((uintptr)addr)) + return; + runtime·racecall(__tsan_release, gp->racectx, addr); +} + +#pragma textflag NOSPLIT +void +runtime·racereleasemerge(void *addr) +{ + runtime·racereleasemergeg(g, addr); +} + +#pragma textflag NOSPLIT +void +runtime·racereleasemergeg(G *gp, void *addr) +{ + if(g->raceignore || !isvalidaddr((uintptr)addr)) + return; + runtime·racecall(__tsan_release_merge, gp->racectx, addr); +} + +#pragma textflag NOSPLIT +void +runtime·racefingo(void) +{ + runtime·racecall(__tsan_finalizer_goroutine, g->racectx); +} + +// func RaceAcquire(addr unsafe.Pointer) +#pragma textflag NOSPLIT +void +runtime·RaceAcquire(void *addr) +{ + runtime·raceacquire(addr); +} + +// func RaceRelease(addr unsafe.Pointer) +#pragma textflag NOSPLIT +void +runtime·RaceRelease(void *addr) +{ + runtime·racerelease(addr); +} + +// func RaceReleaseMerge(addr unsafe.Pointer) +#pragma textflag NOSPLIT +void +runtime·RaceReleaseMerge(void *addr) +{ + runtime·racereleasemerge(addr); +} + +// func RaceDisable() +#pragma textflag NOSPLIT +void +runtime·RaceDisable(void) +{ + if(g->raceignore++ == 0) + runtime·racecall(__tsan_go_ignore_sync_begin, g->racectx); +} + +// func RaceEnable() +#pragma textflag NOSPLIT +void +runtime·RaceEnable(void) +{ + if(--g->raceignore == 0) + runtime·racecall(__tsan_go_ignore_sync_end, g->racectx); +} diff --git a/src/runtime/race.go b/src/runtime/race.go new file mode 100644 index 000000000..bb0ee6df6 --- /dev/null +++ b/src/runtime/race.go @@ -0,0 +1,127 @@ +// Copyright 2012 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// +build race + +// Public race detection API, present iff build with -race. + +package runtime + +import ( + "unsafe" +) + +func racefini() + +// RaceDisable disables handling of race events in the current goroutine. +func RaceDisable() + +// RaceEnable re-enables handling of race events in the current goroutine. 
+func RaceEnable() + +func RaceAcquire(addr unsafe.Pointer) +func RaceRelease(addr unsafe.Pointer) +func RaceReleaseMerge(addr unsafe.Pointer) + +func RaceRead(addr unsafe.Pointer) +func RaceWrite(addr unsafe.Pointer) +func RaceReadRange(addr unsafe.Pointer, len int) +func RaceWriteRange(addr unsafe.Pointer, len int) + +func RaceSemacquire(s *uint32) +func RaceSemrelease(s *uint32) + +// private interface for the runtime +const raceenabled = true + +func raceReadObjectPC(t *_type, addr unsafe.Pointer, callerpc, pc uintptr) { + kind := t.kind & kindMask + if kind == kindArray || kind == kindStruct { + // for composite objects we have to read every address + // because a write might happen to any subobject. + racereadrangepc(addr, t.size, callerpc, pc) + } else { + // for non-composite objects we can read just the start + // address, as any write must write the first byte. + racereadpc(addr, callerpc, pc) + } +} + +func raceWriteObjectPC(t *_type, addr unsafe.Pointer, callerpc, pc uintptr) { + kind := t.kind & kindMask + if kind == kindArray || kind == kindStruct { + // for composite objects we have to write every address + // because a write might happen to any subobject. + racewriterangepc(addr, t.size, callerpc, pc) + } else { + // for non-composite objects we can write just the start + // address, as any write must write the first byte. + racewritepc(addr, callerpc, pc) + } +} + +//go:noescape +func racereadpc(addr unsafe.Pointer, callpc, pc uintptr) + +//go:noescape +func racewritepc(addr unsafe.Pointer, callpc, pc uintptr) + +//go:noescape +func racereadrangepc(addr unsafe.Pointer, len uintptr, callpc, pc uintptr) + +//go:noescape +func racewriterangepc(addr unsafe.Pointer, len uintptr, callpc, pc uintptr) + +//go:noescape +func raceacquire(addr unsafe.Pointer) + +//go:noescape +func racerelease(addr unsafe.Pointer) + +//go:noescape +func raceacquireg(gp *g, addr unsafe.Pointer) + +//go:noescape +func racereleaseg(gp *g, addr unsafe.Pointer) + +func racefingo() + +//go:noescape +func racemalloc(p unsafe.Pointer, size uintptr) + +//go:noescape +func racereleasemerge(addr unsafe.Pointer) + +type symbolizeContext struct { + pc uintptr + fn *byte + file *byte + line uintptr + off uintptr + res uintptr +} + +var qq = [...]byte{'?', '?', 0} +var dash = [...]byte{'-', 0} + +// Callback from C into Go, runs on g0. +func racesymbolize(ctx *symbolizeContext) { + f := findfunc(ctx.pc) + if f == nil { + ctx.fn = &qq[0] + ctx.file = &dash[0] + ctx.line = 0 + ctx.off = ctx.pc + ctx.res = 1 + return + } + + ctx.fn = funcname(f) + var file string + ctx.line = uintptr(funcline(f, ctx.pc, &file)) + ctx.file = &bytes(file)[0] // assume NUL-terminated + ctx.off = ctx.pc - f.entry + ctx.res = 1 + return +} diff --git a/src/runtime/race.h b/src/runtime/race.h new file mode 100644 index 000000000..fee31e09f --- /dev/null +++ b/src/runtime/race.h @@ -0,0 +1,34 @@ +// Copyright 2012 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// Definitions related to data race detection. + +#ifdef RACE +enum { raceenabled = 1 }; +#else +enum { raceenabled = 0 }; +#endif + +// Initialize race detection subsystem. +uintptr runtime·raceinit(void); +// Finalize race detection subsystem, does not return. 
+void runtime·racefini(void); + +void runtime·racemapshadow(void *addr, uintptr size); +void runtime·racemalloc(void *p, uintptr sz); +uintptr runtime·racegostart(void *pc); +void runtime·racegoend(void); +void runtime·racewritepc(void *addr, void *callpc, void *pc); +void runtime·racereadpc(void *addr, void *callpc, void *pc); +void runtime·racewriterangepc(void *addr, uintptr sz, void *callpc, void *pc); +void runtime·racereadrangepc(void *addr, uintptr sz, void *callpc, void *pc); +void runtime·racereadobjectpc(void *addr, Type *t, void *callpc, void *pc); +void runtime·racewriteobjectpc(void *addr, Type *t, void *callpc, void *pc); +void runtime·racefingo(void); +void runtime·raceacquire(void *addr); +void runtime·raceacquireg(G *gp, void *addr); +void runtime·racerelease(void *addr); +void runtime·racereleaseg(G *gp, void *addr); +void runtime·racereleasemerge(void *addr); +void runtime·racereleasemergeg(G *gp, void *addr); diff --git a/src/runtime/race/README b/src/runtime/race/README new file mode 100644 index 000000000..7f185359f --- /dev/null +++ b/src/runtime/race/README @@ -0,0 +1,12 @@ +runtime/race package contains the data race detector runtime library. +It is based on ThreadSanitizer race detector, that is currently a part of +the LLVM project. + +To update the .syso files you need to: +$ svn co http://llvm.org/svn/llvm-project/compiler-rt/trunk +$ cd compiler-rt/lib/tsan/go +$ ./buildgo.sh + +Tested with gcc 4.6.1 and 4.7.0. On Windows it's built with 64-bit MinGW. + +Current runtime is built on rev 215000. diff --git a/src/runtime/race/doc.go b/src/runtime/race/doc.go new file mode 100644 index 000000000..aef805dad --- /dev/null +++ b/src/runtime/race/doc.go @@ -0,0 +1,9 @@ +// Copyright 2013 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// Package race implements data race detection logic. +// No public interface is provided. +// For details about the race detector see +// http://golang.org/doc/articles/race_detector.html +package race diff --git a/src/runtime/race/output_test.go b/src/runtime/race/output_test.go new file mode 100644 index 000000000..d2303f7af --- /dev/null +++ b/src/runtime/race/output_test.go @@ -0,0 +1,156 @@ +// Copyright 2013 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// +build race + +package race_test + +import ( + "io/ioutil" + "os" + "os/exec" + "path/filepath" + "regexp" + "strings" + "testing" +) + +func TestOutput(t *testing.T) { + for _, test := range tests { + dir, err := ioutil.TempDir("", "go-build") + if err != nil { + t.Fatalf("failed to create temp directory: %v", err) + } + defer os.RemoveAll(dir) + src := filepath.Join(dir, "main.go") + f, err := os.Create(src) + if err != nil { + t.Fatalf("failed to create file: %v", err) + } + _, err = f.WriteString(test.source) + if err != nil { + f.Close() + t.Fatalf("failed to write: %v", err) + } + if err := f.Close(); err != nil { + t.Fatalf("failed to close file: %v", err) + } + // Pass -l to the compiler to test stack traces. + cmd := exec.Command("go", "run", "-race", "-gcflags=-l", src) + // GODEBUG spoils program output, GOMAXPROCS makes it flaky. 
+ for _, env := range os.Environ() { + if strings.HasPrefix(env, "GODEBUG=") || + strings.HasPrefix(env, "GOMAXPROCS=") || + strings.HasPrefix(env, "GORACE=") { + continue + } + cmd.Env = append(cmd.Env, env) + } + cmd.Env = append(cmd.Env, "GORACE="+test.gorace) + got, _ := cmd.CombinedOutput() + if !regexp.MustCompile(test.re).MatchString(string(got)) { + t.Fatalf("failed test case %v, expect:\n%v\ngot:\n%s", + test.name, test.re, got) + } + } +} + +var tests = []struct { + name string + gorace string + source string + re string +}{ + {"simple", "atexit_sleep_ms=0", ` +package main +import "time" +func main() { + done := make(chan bool) + x := 0 + startRacer(&x, done) + store(&x, 43) + <-done +} +func store(x *int, v int) { + *x = v +} +func startRacer(x *int, done chan bool) { + go racer(x, done) +} +func racer(x *int, done chan bool) { + time.Sleep(10*time.Millisecond) + store(x, 42) + done <- true +} +`, `================== +WARNING: DATA RACE +Write by goroutine [0-9]: + main\.store\(\) + .+/main\.go:12 \+0x[0-9,a-f]+ + main\.racer\(\) + .+/main\.go:19 \+0x[0-9,a-f]+ + +Previous write by main goroutine: + main\.store\(\) + .+/main\.go:12 \+0x[0-9,a-f]+ + main\.main\(\) + .+/main\.go:8 \+0x[0-9,a-f]+ + +Goroutine [0-9] \(running\) created at: + main\.startRacer\(\) + .+/main\.go:15 \+0x[0-9,a-f]+ + main\.main\(\) + .+/main\.go:7 \+0x[0-9,a-f]+ +================== +Found 1 data race\(s\) +exit status 66 +`}, + + {"exitcode", "atexit_sleep_ms=0 exitcode=13", ` +package main +func main() { + done := make(chan bool) + x := 0 + go func() { + x = 42 + done <- true + }() + x = 43 + <-done +} +`, `exit status 13`}, + + {"strip_path_prefix", "atexit_sleep_ms=0 strip_path_prefix=/main.", ` +package main +func main() { + done := make(chan bool) + x := 0 + go func() { + x = 42 + done <- true + }() + x = 43 + <-done +} +`, ` + go:7 \+0x[0-9,a-f]+ +`}, + + {"halt_on_error", "atexit_sleep_ms=0 halt_on_error=1", ` +package main +func main() { + done := make(chan bool) + x := 0 + go func() { + x = 42 + done <- true + }() + x = 43 + <-done +} +`, ` +================== +exit status 66 +`}, +} diff --git a/src/runtime/race/race.go b/src/runtime/race/race.go new file mode 100644 index 000000000..31deedd73 --- /dev/null +++ b/src/runtime/race/race.go @@ -0,0 +1,15 @@ +// Copyright 2012 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// +build race,linux,amd64 race,freebsd,amd64 race,darwin,amd64 race,windows,amd64 + +package race + +// This file merely ensures that we link in runtime/cgo in race build, +// this is turn ensures that runtime uses pthread_create to create threads. +// The prebuilt race runtime lives in race_GOOS_GOARCH.syso. +// Calls to the runtime are done directly from src/runtime/race.c. 
+ +// void __race_unused_func(void); +import "C" diff --git a/src/runtime/race/race_darwin_amd64.syso b/src/runtime/race/race_darwin_amd64.syso Binary files differnew file mode 100644 index 000000000..81b48c6c9 --- /dev/null +++ b/src/runtime/race/race_darwin_amd64.syso diff --git a/src/runtime/race/race_freebsd_amd64.syso b/src/runtime/race/race_freebsd_amd64.syso Binary files differnew file mode 100644 index 000000000..5bbe32229 --- /dev/null +++ b/src/runtime/race/race_freebsd_amd64.syso diff --git a/src/runtime/race/race_linux_amd64.syso b/src/runtime/race/race_linux_amd64.syso Binary files differnew file mode 100644 index 000000000..49bf08ef3 --- /dev/null +++ b/src/runtime/race/race_linux_amd64.syso diff --git a/src/runtime/race/race_test.go b/src/runtime/race/race_test.go new file mode 100644 index 000000000..7e0ee866a --- /dev/null +++ b/src/runtime/race/race_test.go @@ -0,0 +1,172 @@ +// Copyright 2012 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// +build race + +// This program is used to verify the race detector +// by running the tests and parsing their output. +// It does not check stack correctness, completeness or anything else: +// it merely verifies that if a test is expected to be racy +// then the race is detected. +package race_test + +import ( + "bufio" + "bytes" + "fmt" + "io" + "log" + "os" + "os/exec" + "path/filepath" + "strings" + "testing" +) + +var ( + passedTests = 0 + totalTests = 0 + falsePos = 0 + falseNeg = 0 + failingPos = 0 + failingNeg = 0 + failed = false +) + +const ( + visibleLen = 40 + testPrefix = "=== RUN Test" +) + +func TestRace(t *testing.T) { + testOutput, err := runTests() + if err != nil { + t.Fatalf("Failed to run tests: %v\n%v", err, string(testOutput)) + } + reader := bufio.NewReader(bytes.NewReader(testOutput)) + + funcName := "" + var tsanLog []string + for { + s, err := nextLine(reader) + if err != nil { + fmt.Printf("%s\n", processLog(funcName, tsanLog)) + break + } + if strings.HasPrefix(s, testPrefix) { + fmt.Printf("%s\n", processLog(funcName, tsanLog)) + tsanLog = make([]string, 0, 100) + funcName = s[len(testPrefix):] + } else { + tsanLog = append(tsanLog, s) + } + } + + fmt.Printf("\nPassed %d of %d tests (%.02f%%, %d+, %d-)\n", + passedTests, totalTests, 100*float64(passedTests)/float64(totalTests), falsePos, falseNeg) + fmt.Printf("%d expected failures (%d has not fail)\n", failingPos+failingNeg, failingNeg) + if failed { + t.Fail() + } +} + +// nextLine is a wrapper around bufio.Reader.ReadString. +// It reads a line up to the next '\n' character. Error +// is non-nil if there are no lines left, and nil +// otherwise. +func nextLine(r *bufio.Reader) (string, error) { + s, err := r.ReadString('\n') + if err != nil { + if err != io.EOF { + log.Fatalf("nextLine: expected EOF, received %v", err) + } + return s, err + } + return s[:len(s)-1], nil +} + +// processLog verifies whether the given ThreadSanitizer's log +// contains a race report, checks this information against +// the name of the testcase and returns the result of this +// comparison. 
+func processLog(testName string, tsanLog []string) string { + if !strings.HasPrefix(testName, "Race") && !strings.HasPrefix(testName, "NoRace") { + return "" + } + gotRace := false + for _, s := range tsanLog { + if strings.Contains(s, "DATA RACE") { + gotRace = true + break + } + } + + failing := strings.Contains(testName, "Failing") + expRace := !strings.HasPrefix(testName, "No") + for len(testName) < visibleLen { + testName += " " + } + if expRace == gotRace { + passedTests++ + totalTests++ + if failing { + failed = true + failingNeg++ + } + return fmt.Sprintf("%s .", testName) + } + pos := "" + if expRace { + falseNeg++ + } else { + falsePos++ + pos = "+" + } + if failing { + failingPos++ + } else { + failed = true + } + totalTests++ + return fmt.Sprintf("%s %s%s", testName, "FAILED", pos) +} + +// runTests assures that the package and its dependencies is +// built with instrumentation enabled and returns the output of 'go test' +// which includes possible data race reports from ThreadSanitizer. +func runTests() ([]byte, error) { + tests, err := filepath.Glob("./testdata/*_test.go") + if err != nil { + return nil, err + } + args := []string{"test", "-race", "-v"} + args = append(args, tests...) + cmd := exec.Command("go", args...) + // The following flags turn off heuristics that suppress seemingly identical reports. + // It is required because the tests contain a lot of data races on the same addresses + // (the tests are simple and the memory is constantly reused). + for _, env := range os.Environ() { + if strings.HasPrefix(env, "GOMAXPROCS=") || strings.HasPrefix(env, "GODEBUG=") { + continue + } + cmd.Env = append(cmd.Env, env) + } + cmd.Env = append(cmd.Env, `GORACE="suppress_equal_stacks=0 suppress_equal_addresses=0 exitcode=0"`) + return cmd.CombinedOutput() +} + +func TestIssue8102(t *testing.T) { + // If this compiles with -race, the test passes. + type S struct { + x interface{} + i int + } + c := make(chan int) + a := [2]*int{} + for ; ; c <- *a[S{}.i] { + if t != nil { + break + } + } +} diff --git a/src/runtime/race/race_unix_test.go b/src/runtime/race/race_unix_test.go new file mode 100644 index 000000000..84f0acece --- /dev/null +++ b/src/runtime/race/race_unix_test.go @@ -0,0 +1,30 @@ +// Copyright 2014 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// +build race +// +build darwin freebsd linux + +package race_test + +import ( + "sync/atomic" + "syscall" + "testing" + "unsafe" +) + +// Test that race detector does not crash when accessing non-Go allocated memory (issue 9136). +func TestNonGoMemory(t *testing.T) { + data, err := syscall.Mmap(-1, 0, 4096, syscall.PROT_READ|syscall.PROT_WRITE, syscall.MAP_ANON|syscall.MAP_PRIVATE) + if err != nil { + t.Fatalf("failed to mmap memory: %v", err) + } + p := (*uint32)(unsafe.Pointer(&data[0])) + atomic.AddUint32(p, 1) + (*p)++ + if *p != 2 { + t.Fatalf("data[0] = %v, expect 2", *p) + } + syscall.Munmap(data) +} diff --git a/src/runtime/race/race_windows_amd64.syso b/src/runtime/race/race_windows_amd64.syso Binary files differnew file mode 100644 index 000000000..a4eae9bdd --- /dev/null +++ b/src/runtime/race/race_windows_amd64.syso diff --git a/src/runtime/race/testdata/atomic_test.go b/src/runtime/race/testdata/atomic_test.go new file mode 100644 index 000000000..232744b3d --- /dev/null +++ b/src/runtime/race/testdata/atomic_test.go @@ -0,0 +1,288 @@ +// Copyright 2011 The Go Authors. All rights reserved. 
+// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package race_test + +import ( + "runtime" + "sync" + "sync/atomic" + "testing" + "unsafe" +) + +func TestNoRaceAtomicAddInt64(t *testing.T) { + var x1, x2 int8 + var s int64 + ch := make(chan bool, 2) + go func() { + x1 = 1 + if atomic.AddInt64(&s, 1) == 2 { + x2 = 1 + } + ch <- true + }() + go func() { + x2 = 1 + if atomic.AddInt64(&s, 1) == 2 { + x1 = 1 + } + ch <- true + }() + <-ch + <-ch +} + +func TestRaceAtomicAddInt64(t *testing.T) { + var x1, x2 int8 + var s int64 + ch := make(chan bool, 2) + go func() { + x1 = 1 + if atomic.AddInt64(&s, 1) == 1 { + x2 = 1 + } + ch <- true + }() + go func() { + x2 = 1 + if atomic.AddInt64(&s, 1) == 1 { + x1 = 1 + } + ch <- true + }() + <-ch + <-ch +} + +func TestNoRaceAtomicAddInt32(t *testing.T) { + var x1, x2 int8 + var s int32 + ch := make(chan bool, 2) + go func() { + x1 = 1 + if atomic.AddInt32(&s, 1) == 2 { + x2 = 1 + } + ch <- true + }() + go func() { + x2 = 1 + if atomic.AddInt32(&s, 1) == 2 { + x1 = 1 + } + ch <- true + }() + <-ch + <-ch +} + +func TestNoRaceAtomicLoadAddInt32(t *testing.T) { + var x int64 + var s int32 + go func() { + x = 2 + atomic.AddInt32(&s, 1) + }() + for atomic.LoadInt32(&s) != 1 { + runtime.Gosched() + } + x = 1 +} + +func TestNoRaceAtomicLoadStoreInt32(t *testing.T) { + var x int64 + var s int32 + go func() { + x = 2 + atomic.StoreInt32(&s, 1) + }() + for atomic.LoadInt32(&s) != 1 { + runtime.Gosched() + } + x = 1 +} + +func TestNoRaceAtomicStoreCASInt32(t *testing.T) { + var x int64 + var s int32 + go func() { + x = 2 + atomic.StoreInt32(&s, 1) + }() + for !atomic.CompareAndSwapInt32(&s, 1, 0) { + runtime.Gosched() + } + x = 1 +} + +func TestNoRaceAtomicCASLoadInt32(t *testing.T) { + var x int64 + var s int32 + go func() { + x = 2 + if !atomic.CompareAndSwapInt32(&s, 0, 1) { + panic("") + } + }() + for atomic.LoadInt32(&s) != 1 { + runtime.Gosched() + } + x = 1 +} + +func TestNoRaceAtomicCASCASInt32(t *testing.T) { + var x int64 + var s int32 + go func() { + x = 2 + if !atomic.CompareAndSwapInt32(&s, 0, 1) { + panic("") + } + }() + for !atomic.CompareAndSwapInt32(&s, 1, 0) { + runtime.Gosched() + } + x = 1 +} + +func TestNoRaceAtomicCASCASInt32_2(t *testing.T) { + var x1, x2 int8 + var s int32 + ch := make(chan bool, 2) + go func() { + x1 = 1 + if !atomic.CompareAndSwapInt32(&s, 0, 1) { + x2 = 1 + } + ch <- true + }() + go func() { + x2 = 1 + if !atomic.CompareAndSwapInt32(&s, 0, 1) { + x1 = 1 + } + ch <- true + }() + <-ch + <-ch +} + +func TestNoRaceAtomicLoadInt64(t *testing.T) { + var x int32 + var s int64 + go func() { + x = 2 + atomic.AddInt64(&s, 1) + }() + for atomic.LoadInt64(&s) != 1 { + runtime.Gosched() + } + x = 1 +} + +func TestNoRaceAtomicCASCASUInt64(t *testing.T) { + var x int64 + var s uint64 + go func() { + x = 2 + if !atomic.CompareAndSwapUint64(&s, 0, 1) { + panic("") + } + }() + for !atomic.CompareAndSwapUint64(&s, 1, 0) { + runtime.Gosched() + } + x = 1 +} + +func TestNoRaceAtomicLoadStorePointer(t *testing.T) { + var x int64 + var s unsafe.Pointer + var y int = 2 + var p unsafe.Pointer = unsafe.Pointer(&y) + go func() { + x = 2 + atomic.StorePointer(&s, p) + }() + for atomic.LoadPointer(&s) != p { + runtime.Gosched() + } + x = 1 +} + +func TestNoRaceAtomicStoreCASUint64(t *testing.T) { + var x int64 + var s uint64 + go func() { + x = 2 + atomic.StoreUint64(&s, 1) + }() + for !atomic.CompareAndSwapUint64(&s, 1, 0) { + runtime.Gosched() + } + x = 1 +} + +func TestRaceAtomicStoreLoad(t 
*testing.T) { + c := make(chan bool) + var a uint64 + go func() { + atomic.StoreUint64(&a, 1) + c <- true + }() + _ = a + <-c +} + +func TestRaceAtomicLoadStore(t *testing.T) { + c := make(chan bool) + var a uint64 + go func() { + _ = atomic.LoadUint64(&a) + c <- true + }() + a = 1 + <-c +} + +func TestRaceAtomicAddLoad(t *testing.T) { + c := make(chan bool) + var a uint64 + go func() { + atomic.AddUint64(&a, 1) + c <- true + }() + _ = a + <-c +} + +func TestRaceAtomicAddStore(t *testing.T) { + c := make(chan bool) + var a uint64 + go func() { + atomic.AddUint64(&a, 1) + c <- true + }() + a = 42 + <-c +} + +// A nil pointer in an atomic operation should not deadlock +// the rest of the program. Used to hang indefinitely. +func TestNoRaceAtomicCrash(t *testing.T) { + var mutex sync.Mutex + var nilptr *int32 + panics := 0 + defer func() { + if x := recover(); x != nil { + mutex.Lock() + panics++ + mutex.Unlock() + } else { + panic("no panic") + } + }() + atomic.AddInt32(nilptr, 1) +} diff --git a/src/runtime/race/testdata/cgo_test.go b/src/runtime/race/testdata/cgo_test.go new file mode 100644 index 000000000..ba7e7b562 --- /dev/null +++ b/src/runtime/race/testdata/cgo_test.go @@ -0,0 +1,20 @@ +// Copyright 2012 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package race_test + +import ( + "os" + "os/exec" + "testing" +) + +func TestNoRaceCgoSync(t *testing.T) { + cmd := exec.Command("go", "run", "-race", "cgo_test_main.go") + cmd.Stdout = os.Stdout + cmd.Stderr = os.Stderr + if err := cmd.Run(); err != nil { + t.Fatalf("program exited with error: %v\n", err) + } +} diff --git a/src/runtime/race/testdata/cgo_test_main.go b/src/runtime/race/testdata/cgo_test_main.go new file mode 100644 index 000000000..620cea18b --- /dev/null +++ b/src/runtime/race/testdata/cgo_test_main.go @@ -0,0 +1,30 @@ +// Copyright 2012 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package main + +/* +int sync; + +void Notify(void) +{ + __sync_fetch_and_add(&sync, 1); +} + +void Wait(void) +{ + while(__sync_fetch_and_add(&sync, 0) == 0) {} +} +*/ +import "C" + +func main() { + data := 0 + go func() { + data = 1 + C.Notify() + }() + C.Wait() + _ = data +} diff --git a/src/runtime/race/testdata/chan_test.go b/src/runtime/race/testdata/chan_test.go new file mode 100644 index 000000000..eabd81f40 --- /dev/null +++ b/src/runtime/race/testdata/chan_test.go @@ -0,0 +1,659 @@ +// Copyright 2011 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. 
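cgo_test_main.go above orders a Go write and a Go read through C's __sync_fetch_and_add, synchronization the Go-side instrumentation cannot observe directly. The public annotations declared in race.go earlier in this import (RaceAcquire, RaceRelease and friends) exist for situations like that: they let code assert a happens-before edge on an address of its choosing. The fragment below is a hypothetical, editor-supplied sketch of the call shape only — the hb variable is invented as an annotation key, the file builds only under the race tag, and no claim is made that the unannotated program above actually needs it.

// +build race

package main

/*
int sync;
void Notify(void) { __sync_fetch_and_add(&sync, 1); }
void Wait(void)   { while(__sync_fetch_and_add(&sync, 0) == 0) {} }
*/
import "C"

import (
	"runtime"
	"unsafe"
)

var data int
var hb int // dummy global; only its address is used, as the annotation key

func main() {
	go func() {
		data = 1
		// Everything written so far happens-before a later RaceAcquire on &hb.
		runtime.RaceRelease(unsafe.Pointer(&hb))
		C.Notify()
	}()
	C.Wait()
	// Pairs with the RaceRelease above: in the detector's model the write to
	// data is now ordered before this read, just as it is in real time.
	runtime.RaceAcquire(unsafe.Pointer(&hb))
	_ = data
}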
+ +package race_test + +import ( + "runtime" + "testing" + "time" +) + +func TestNoRaceChanSync(t *testing.T) { + v := 0 + c := make(chan int) + go func() { + v = 1 + c <- 0 + }() + <-c + v = 2 +} + +func TestNoRaceChanSyncRev(t *testing.T) { + v := 0 + c := make(chan int) + go func() { + c <- 0 + v = 2 + }() + v = 1 + <-c +} + +func TestNoRaceChanAsync(t *testing.T) { + v := 0 + c := make(chan int, 10) + go func() { + v = 1 + c <- 0 + }() + <-c + v = 2 +} + +func TestRaceChanAsyncRev(t *testing.T) { + v := 0 + c := make(chan int, 10) + go func() { + c <- 0 + v = 1 + }() + v = 2 + <-c +} + +func TestNoRaceChanAsyncCloseRecv(t *testing.T) { + v := 0 + c := make(chan int, 10) + go func() { + v = 1 + close(c) + }() + func() { + defer func() { + recover() + v = 2 + }() + <-c + }() +} + +func TestNoRaceChanAsyncCloseRecv2(t *testing.T) { + v := 0 + c := make(chan int, 10) + go func() { + v = 1 + close(c) + }() + _, _ = <-c + v = 2 +} + +func TestNoRaceChanAsyncCloseRecv3(t *testing.T) { + v := 0 + c := make(chan int, 10) + go func() { + v = 1 + close(c) + }() + for range c { + } + v = 2 +} + +func TestNoRaceChanSyncCloseRecv(t *testing.T) { + v := 0 + c := make(chan int) + go func() { + v = 1 + close(c) + }() + func() { + defer func() { + recover() + v = 2 + }() + <-c + }() +} + +func TestNoRaceChanSyncCloseRecv2(t *testing.T) { + v := 0 + c := make(chan int) + go func() { + v = 1 + close(c) + }() + _, _ = <-c + v = 2 +} + +func TestNoRaceChanSyncCloseRecv3(t *testing.T) { + v := 0 + c := make(chan int) + go func() { + v = 1 + close(c) + }() + for range c { + } + v = 2 +} + +func TestRaceChanSyncCloseSend(t *testing.T) { + v := 0 + c := make(chan int) + go func() { + v = 1 + close(c) + }() + func() { + defer func() { + recover() + }() + c <- 0 + }() + v = 2 +} + +func TestRaceChanAsyncCloseSend(t *testing.T) { + v := 0 + c := make(chan int, 10) + go func() { + v = 1 + close(c) + }() + func() { + defer func() { + recover() + }() + for { + c <- 0 + } + }() + v = 2 +} + +func TestRaceChanCloseClose(t *testing.T) { + compl := make(chan bool, 2) + v1 := 0 + v2 := 0 + c := make(chan int) + go func() { + defer func() { + if recover() != nil { + v2 = 2 + } + compl <- true + }() + v1 = 1 + close(c) + }() + go func() { + defer func() { + if recover() != nil { + v1 = 2 + } + compl <- true + }() + v2 = 1 + close(c) + }() + <-compl + <-compl +} + +func TestRaceChanSendLen(t *testing.T) { + v := 0 + c := make(chan int, 10) + go func() { + v = 1 + c <- 1 + }() + for len(c) == 0 { + runtime.Gosched() + } + v = 2 +} + +func TestRaceChanRecvLen(t *testing.T) { + v := 0 + c := make(chan int, 10) + c <- 1 + go func() { + v = 1 + <-c + }() + for len(c) != 0 { + runtime.Gosched() + } + v = 2 +} + +func TestRaceChanSendSend(t *testing.T) { + compl := make(chan bool, 2) + v1 := 0 + v2 := 0 + c := make(chan int, 1) + go func() { + v1 = 1 + select { + case c <- 1: + default: + v2 = 2 + } + compl <- true + }() + go func() { + v2 = 1 + select { + case c <- 1: + default: + v1 = 2 + } + compl <- true + }() + <-compl + <-compl +} + +func TestNoRaceChanPtr(t *testing.T) { + type msg struct { + x int + } + c := make(chan *msg) + go func() { + c <- &msg{1} + }() + m := <-c + m.x = 2 +} + +func TestRaceChanWrongSend(t *testing.T) { + v1 := 0 + v2 := 0 + c := make(chan int, 2) + go func() { + v1 = 1 + c <- 1 + }() + go func() { + v2 = 2 + c <- 2 + }() + time.Sleep(1e7) + if <-c == 1 { + v2 = 3 + } else { + v1 = 3 + } +} + +func TestRaceChanWrongClose(t *testing.T) { + v1 := 0 + v2 := 0 + c := make(chan int, 1) + go func() { + 
defer func() { + recover() + }() + v1 = 1 + c <- 1 + }() + go func() { + time.Sleep(1e7) + v2 = 2 + close(c) + }() + time.Sleep(2e7) + if _, who := <-c; who { + v2 = 2 + } else { + v1 = 2 + } +} + +func TestRaceChanSendClose(t *testing.T) { + compl := make(chan bool, 2) + c := make(chan int, 1) + go func() { + defer func() { + recover() + compl <- true + }() + c <- 1 + }() + go func() { + time.Sleep(10 * time.Millisecond) + close(c) + compl <- true + }() + <-compl + <-compl +} + +func TestRaceChanSendSelectClose(t *testing.T) { + compl := make(chan bool, 2) + c := make(chan int, 1) + c1 := make(chan int) + go func() { + defer func() { + recover() + compl <- true + }() + time.Sleep(10 * time.Millisecond) + select { + case c <- 1: + case <-c1: + } + }() + go func() { + close(c) + compl <- true + }() + <-compl + <-compl +} + +func TestRaceSelectReadWriteAsync(t *testing.T) { + done := make(chan bool) + x := 0 + c1 := make(chan int, 10) + c2 := make(chan int, 10) + c3 := make(chan int) + c2 <- 1 + go func() { + select { + case c1 <- x: // read of x races with... + case c3 <- 1: + } + done <- true + }() + select { + case x = <-c2: // ... write to x here + case c3 <- 1: + } + <-done +} + +func TestRaceSelectReadWriteSync(t *testing.T) { + done := make(chan bool) + x := 0 + c1 := make(chan int) + c2 := make(chan int) + c3 := make(chan int) + // make c1 and c2 ready for communication + go func() { + <-c1 + }() + go func() { + c2 <- 1 + }() + go func() { + select { + case c1 <- x: // read of x races with... + case c3 <- 1: + } + done <- true + }() + select { + case x = <-c2: // ... write to x here + case c3 <- 1: + } + <-done +} + +func TestNoRaceSelectReadWriteAsync(t *testing.T) { + done := make(chan bool) + x := 0 + c1 := make(chan int) + c2 := make(chan int) + go func() { + select { + case c1 <- x: // read of x does not race with... + case c2 <- 1: + } + done <- true + }() + select { + case x = <-c1: // ... write to x here + case c2 <- 1: + } + <-done +} + +func TestRaceChanReadWriteAsync(t *testing.T) { + done := make(chan bool) + c1 := make(chan int, 10) + c2 := make(chan int, 10) + c2 <- 10 + x := 0 + go func() { + c1 <- x // read of x races with... + done <- true + }() + x = <-c2 // ... write to x here + <-done +} + +func TestRaceChanReadWriteSync(t *testing.T) { + done := make(chan bool) + c1 := make(chan int) + c2 := make(chan int) + // make c1 and c2 ready for communication + go func() { + <-c1 + }() + go func() { + c2 <- 10 + }() + x := 0 + go func() { + c1 <- x // read of x races with... + done <- true + }() + x = <-c2 // ... write to x here + <-done +} + +func TestNoRaceChanReadWriteAsync(t *testing.T) { + done := make(chan bool) + c1 := make(chan int, 10) + x := 0 + go func() { + c1 <- x // read of x does not race with... + done <- true + }() + x = <-c1 // ... 
write to x here + <-done +} + +func TestNoRaceProducerConsumerUnbuffered(t *testing.T) { + type Task struct { + f func() + done chan bool + } + + queue := make(chan Task) + + go func() { + t := <-queue + t.f() + t.done <- true + }() + + doit := func(f func()) { + done := make(chan bool, 1) + queue <- Task{f, done} + <-done + } + + x := 0 + doit(func() { + x = 1 + }) + _ = x +} + +func TestRaceChanItselfSend(t *testing.T) { + compl := make(chan bool, 1) + c := make(chan int, 10) + go func() { + c <- 0 + compl <- true + }() + c = make(chan int, 20) + <-compl +} + +func TestRaceChanItselfRecv(t *testing.T) { + compl := make(chan bool, 1) + c := make(chan int, 10) + c <- 1 + go func() { + <-c + compl <- true + }() + time.Sleep(1e7) + c = make(chan int, 20) + <-compl +} + +func TestRaceChanItselfNil(t *testing.T) { + c := make(chan int, 10) + go func() { + c <- 0 + }() + time.Sleep(1e7) + c = nil + _ = c +} + +func TestRaceChanItselfClose(t *testing.T) { + compl := make(chan bool, 1) + c := make(chan int) + go func() { + close(c) + compl <- true + }() + c = make(chan int) + <-compl +} + +func TestRaceChanItselfLen(t *testing.T) { + compl := make(chan bool, 1) + c := make(chan int) + go func() { + _ = len(c) + compl <- true + }() + c = make(chan int) + <-compl +} + +func TestRaceChanItselfCap(t *testing.T) { + compl := make(chan bool, 1) + c := make(chan int) + go func() { + _ = cap(c) + compl <- true + }() + c = make(chan int) + <-compl +} + +func TestRaceChanCloseLen(t *testing.T) { + v := 0 + c := make(chan int, 10) + c <- 0 + go func() { + v = 1 + close(c) + }() + time.Sleep(1e7) + _ = len(c) + v = 2 +} + +func TestRaceChanCloseSend(t *testing.T) { + compl := make(chan bool, 1) + c := make(chan int, 10) + go func() { + close(c) + compl <- true + }() + c <- 0 + <-compl +} + +func TestNoRaceChanMutex(t *testing.T) { + done := make(chan struct{}) + mtx := make(chan struct{}, 1) + data := 0 + go func() { + mtx <- struct{}{} + data = 42 + <-mtx + done <- struct{}{} + }() + mtx <- struct{}{} + data = 43 + <-mtx + <-done +} + +func TestNoRaceSelectMutex(t *testing.T) { + done := make(chan struct{}) + mtx := make(chan struct{}, 1) + aux := make(chan bool) + data := 0 + go func() { + select { + case mtx <- struct{}{}: + case <-aux: + } + data = 42 + select { + case <-mtx: + case <-aux: + } + done <- struct{}{} + }() + select { + case mtx <- struct{}{}: + case <-aux: + } + data = 43 + select { + case <-mtx: + case <-aux: + } + <-done +} + +func TestRaceChanSem(t *testing.T) { + done := make(chan struct{}) + mtx := make(chan bool, 2) + data := 0 + go func() { + mtx <- true + data = 42 + <-mtx + done <- struct{}{} + }() + mtx <- true + data = 43 + <-mtx + <-done +} + +func TestNoRaceChanWaitGroup(t *testing.T) { + const N = 10 + chanWg := make(chan bool, N/2) + data := make([]int, N) + for i := 0; i < N; i++ { + chanWg <- true + go func(i int) { + data[i] = 42 + <-chanWg + }(i) + } + for i := 0; i < cap(chanWg); i++ { + chanWg <- true + } + for i := 0; i < N; i++ { + _ = data[i] + } +} diff --git a/src/runtime/race/testdata/comp_test.go b/src/runtime/race/testdata/comp_test.go new file mode 100644 index 000000000..27b2d0081 --- /dev/null +++ b/src/runtime/race/testdata/comp_test.go @@ -0,0 +1,186 @@ +// Copyright 2012 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. 
+ +package race_test + +import ( + "testing" +) + +type P struct { + x, y int +} + +type S struct { + s1, s2 P +} + +func TestNoRaceComp(t *testing.T) { + c := make(chan bool, 1) + var s S + go func() { + s.s2.x = 1 + c <- true + }() + s.s2.y = 2 + <-c +} + +func TestNoRaceComp2(t *testing.T) { + c := make(chan bool, 1) + var s S + go func() { + s.s1.x = 1 + c <- true + }() + s.s1.y = 2 + <-c +} + +func TestRaceComp(t *testing.T) { + c := make(chan bool, 1) + var s S + go func() { + s.s2.y = 1 + c <- true + }() + s.s2.y = 2 + <-c +} + +func TestRaceComp2(t *testing.T) { + c := make(chan bool, 1) + var s S + go func() { + s.s1.x = 1 + c <- true + }() + s = S{} + <-c +} + +func TestRaceComp3(t *testing.T) { + c := make(chan bool, 1) + var s S + go func() { + s.s2.y = 1 + c <- true + }() + s = S{} + <-c +} + +func TestRaceCompArray(t *testing.T) { + c := make(chan bool, 1) + s := make([]S, 10) + x := 4 + go func() { + s[x].s2.y = 1 + c <- true + }() + x = 5 + <-c +} + +type P2 P +type S2 S + +func TestRaceConv1(t *testing.T) { + c := make(chan bool, 1) + var p P2 + go func() { + p.x = 1 + c <- true + }() + _ = P(p).x + <-c +} + +func TestRaceConv2(t *testing.T) { + c := make(chan bool, 1) + var p P2 + go func() { + p.x = 1 + c <- true + }() + ptr := &p + _ = P(*ptr).x + <-c +} + +func TestRaceConv3(t *testing.T) { + c := make(chan bool, 1) + var s S2 + go func() { + s.s1.x = 1 + c <- true + }() + _ = P2(S(s).s1).x + <-c +} + +type X struct { + V [4]P +} + +type X2 X + +func TestRaceConv4(t *testing.T) { + c := make(chan bool, 1) + var x X2 + go func() { + x.V[1].x = 1 + c <- true + }() + _ = P2(X(x).V[1]).x + <-c +} + +type Ptr struct { + s1, s2 *P +} + +func TestNoRaceCompPtr(t *testing.T) { + c := make(chan bool, 1) + p := Ptr{&P{}, &P{}} + go func() { + p.s1.x = 1 + c <- true + }() + p.s1.y = 2 + <-c +} + +func TestNoRaceCompPtr2(t *testing.T) { + c := make(chan bool, 1) + p := Ptr{&P{}, &P{}} + go func() { + p.s1.x = 1 + c <- true + }() + _ = p + <-c +} + +func TestRaceCompPtr(t *testing.T) { + c := make(chan bool, 1) + p := Ptr{&P{}, &P{}} + go func() { + p.s2.x = 1 + c <- true + }() + p.s2.x = 2 + <-c +} + +func TestRaceCompPtr2(t *testing.T) { + c := make(chan bool, 1) + p := Ptr{&P{}, &P{}} + go func() { + p.s2.x = 1 + c <- true + }() + p.s2 = &P{} + <-c +} diff --git a/src/runtime/race/testdata/finalizer_test.go b/src/runtime/race/testdata/finalizer_test.go new file mode 100644 index 000000000..222cbf67a --- /dev/null +++ b/src/runtime/race/testdata/finalizer_test.go @@ -0,0 +1,67 @@ +// Copyright 2012 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. 
+ +package race_test + +import ( + "runtime" + "sync" + "testing" + "time" +) + +func TestNoRaceFin(t *testing.T) { + c := make(chan bool) + go func() { + x := new(string) + runtime.SetFinalizer(x, func(x *string) { + *x = "foo" + }) + *x = "bar" + c <- true + }() + <-c + runtime.GC() + time.Sleep(100 * time.Millisecond) +} + +var finVar struct { + sync.Mutex + cnt int +} + +func TestNoRaceFinGlobal(t *testing.T) { + c := make(chan bool) + go func() { + x := new(string) + runtime.SetFinalizer(x, func(x *string) { + finVar.Lock() + finVar.cnt++ + finVar.Unlock() + }) + c <- true + }() + <-c + runtime.GC() + time.Sleep(100 * time.Millisecond) + finVar.Lock() + finVar.cnt++ + finVar.Unlock() +} + +func TestRaceFin(t *testing.T) { + c := make(chan bool) + y := 0 + go func() { + x := new(string) + runtime.SetFinalizer(x, func(x *string) { + y = 42 + }) + c <- true + }() + <-c + runtime.GC() + time.Sleep(100 * time.Millisecond) + y = 66 +} diff --git a/src/runtime/race/testdata/io_test.go b/src/runtime/race/testdata/io_test.go new file mode 100644 index 000000000..9eb3552dc --- /dev/null +++ b/src/runtime/race/testdata/io_test.go @@ -0,0 +1,69 @@ +// Copyright 2012 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package race_test + +import ( + "fmt" + "io/ioutil" + "net/http" + "os" + "path/filepath" + "testing" + "time" +) + +func TestNoRaceIOFile(t *testing.T) { + x := 0 + path, _ := ioutil.TempDir("", "race_test") + fname := filepath.Join(path, "data") + go func() { + x = 42 + f, _ := os.Create(fname) + f.Write([]byte("done")) + f.Close() + }() + for { + f, err := os.Open(fname) + if err != nil { + time.Sleep(1e6) + continue + } + buf := make([]byte, 100) + count, err := f.Read(buf) + if count == 0 { + time.Sleep(1e6) + continue + } + break + } + _ = x +} + +func TestNoRaceIOHttp(t *testing.T) { + x := 0 + go func() { + http.HandleFunc("/", func(w http.ResponseWriter, r *http.Request) { + x = 41 + fmt.Fprintf(w, "test") + x = 42 + }) + err := http.ListenAndServe(":23651", nil) + if err != nil { + t.Fatalf("http.ListenAndServe: %v", err) + } + }() + time.Sleep(1e7) + x = 1 + _, err := http.Get("http://127.0.0.1:23651") + if err != nil { + t.Fatalf("http.Get: %v", err) + } + x = 2 + _, err = http.Get("http://127.0.0.1:23651") + if err != nil { + t.Fatalf("http.Get: %v", err) + } + x = 3 +} diff --git a/src/runtime/race/testdata/map_test.go b/src/runtime/race/testdata/map_test.go new file mode 100644 index 000000000..a8d8148d0 --- /dev/null +++ b/src/runtime/race/testdata/map_test.go @@ -0,0 +1,333 @@ +// Copyright 2012 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package race_test + +import ( + "testing" +) + +func TestRaceMapRW(t *testing.T) { + m := make(map[int]int) + ch := make(chan bool, 1) + go func() { + _ = m[1] + ch <- true + }() + m[1] = 1 + <-ch +} + +func TestRaceMapRW2(t *testing.T) { + m := make(map[int]int) + ch := make(chan bool, 1) + go func() { + _, _ = m[1] + ch <- true + }() + m[1] = 1 + <-ch +} + +func TestRaceMapRWArray(t *testing.T) { + // Check instrumentation of unaddressable arrays (issue 4578). 
+ m := make(map[int][2]int) + ch := make(chan bool, 1) + go func() { + _ = m[1][1] + ch <- true + }() + m[2] = [2]int{1, 2} + <-ch +} + +func TestNoRaceMapRR(t *testing.T) { + m := make(map[int]int) + ch := make(chan bool, 1) + go func() { + _, _ = m[1] + ch <- true + }() + _ = m[1] + <-ch +} + +func TestRaceMapRange(t *testing.T) { + m := make(map[int]int) + ch := make(chan bool, 1) + go func() { + for range m { + } + ch <- true + }() + m[1] = 1 + <-ch +} + +func TestRaceMapRange2(t *testing.T) { + m := make(map[int]int) + ch := make(chan bool, 1) + go func() { + for range m { + } + ch <- true + }() + m[1] = 1 + <-ch +} + +func TestNoRaceMapRangeRange(t *testing.T) { + m := make(map[int]int) + // now the map is not empty and range triggers an event + // should work without this (as in other tests) + // so it is suspicious if this test passes and others don't + m[0] = 0 + ch := make(chan bool, 1) + go func() { + for range m { + } + ch <- true + }() + for range m { + } + <-ch +} + +func TestRaceMapLen(t *testing.T) { + m := make(map[string]bool) + ch := make(chan bool, 1) + go func() { + _ = len(m) + ch <- true + }() + m[""] = true + <-ch +} + +func TestRaceMapDelete(t *testing.T) { + m := make(map[string]bool) + ch := make(chan bool, 1) + go func() { + delete(m, "") + ch <- true + }() + m[""] = true + <-ch +} + +func TestRaceMapLenDelete(t *testing.T) { + m := make(map[string]bool) + ch := make(chan bool, 1) + go func() { + delete(m, "a") + ch <- true + }() + _ = len(m) + <-ch +} + +func TestRaceMapVariable(t *testing.T) { + ch := make(chan bool, 1) + m := make(map[int]int) + go func() { + m = make(map[int]int) + ch <- true + }() + m = make(map[int]int) + <-ch +} + +func TestRaceMapVariable2(t *testing.T) { + ch := make(chan bool, 1) + m := make(map[int]int) + go func() { + m[1] = 1 + ch <- true + }() + m = make(map[int]int) + <-ch +} + +func TestRaceMapVariable3(t *testing.T) { + ch := make(chan bool, 1) + m := make(map[int]int) + go func() { + _ = m[1] + ch <- true + }() + m = make(map[int]int) + <-ch +} + +type Big struct { + x [17]int32 +} + +func TestRaceMapLookupPartKey(t *testing.T) { + k := &Big{} + m := make(map[Big]bool) + ch := make(chan bool, 1) + go func() { + k.x[8] = 1 + ch <- true + }() + _ = m[*k] + <-ch +} + +func TestRaceMapLookupPartKey2(t *testing.T) { + k := &Big{} + m := make(map[Big]bool) + ch := make(chan bool, 1) + go func() { + k.x[8] = 1 + ch <- true + }() + _, _ = m[*k] + <-ch +} +func TestRaceMapDeletePartKey(t *testing.T) { + k := &Big{} + m := make(map[Big]bool) + ch := make(chan bool, 1) + go func() { + k.x[8] = 1 + ch <- true + }() + delete(m, *k) + <-ch +} + +func TestRaceMapInsertPartKey(t *testing.T) { + k := &Big{} + m := make(map[Big]bool) + ch := make(chan bool, 1) + go func() { + k.x[8] = 1 + ch <- true + }() + m[*k] = true + <-ch +} + +func TestRaceMapInsertPartVal(t *testing.T) { + v := &Big{} + m := make(map[int]Big) + ch := make(chan bool, 1) + go func() { + v.x[8] = 1 + ch <- true + }() + m[1] = *v + <-ch +} + +// Test for issue 7561. +func TestRaceMapAssignMultipleReturn(t *testing.T) { + connect := func() (int, error) { return 42, nil } + conns := make(map[int][]int) + conns[1] = []int{0} + ch := make(chan bool, 1) + var err error + go func() { + conns[1][0], err = connect() + ch <- true + }() + x := conns[1][0] + _ = x + <-ch +} + +// BigKey and BigVal must be larger than 256 bytes, +// so that compiler sets KindGCProg for them. 
+type BigKey [1000]*int + +type BigVal struct { + x int + y [1000]*int +} + +func TestRaceMapBigKeyAccess1(t *testing.T) { + m := make(map[BigKey]int) + var k BigKey + ch := make(chan bool, 1) + go func() { + _ = m[k] + ch <- true + }() + k[30] = new(int) + <-ch +} + +func TestRaceMapBigKeyAccess2(t *testing.T) { + m := make(map[BigKey]int) + var k BigKey + ch := make(chan bool, 1) + go func() { + _, _ = m[k] + ch <- true + }() + k[30] = new(int) + <-ch +} + +func TestRaceMapBigKeyInsert(t *testing.T) { + m := make(map[BigKey]int) + var k BigKey + ch := make(chan bool, 1) + go func() { + m[k] = 1 + ch <- true + }() + k[30] = new(int) + <-ch +} + +func TestRaceMapBigKeyDelete(t *testing.T) { + m := make(map[BigKey]int) + var k BigKey + ch := make(chan bool, 1) + go func() { + delete(m, k) + ch <- true + }() + k[30] = new(int) + <-ch +} + +func TestRaceMapBigValInsert(t *testing.T) { + m := make(map[int]BigVal) + var v BigVal + ch := make(chan bool, 1) + go func() { + m[1] = v + ch <- true + }() + v.y[30] = new(int) + <-ch +} + +func TestRaceMapBigValAccess1(t *testing.T) { + m := make(map[int]BigVal) + var v BigVal + ch := make(chan bool, 1) + go func() { + v = m[1] + ch <- true + }() + v.y[30] = new(int) + <-ch +} + +func TestRaceMapBigValAccess2(t *testing.T) { + m := make(map[int]BigVal) + var v BigVal + ch := make(chan bool, 1) + go func() { + v, _ = m[1] + ch <- true + }() + v.y[30] = new(int) + <-ch +} diff --git a/src/runtime/race/testdata/mop_test.go b/src/runtime/race/testdata/mop_test.go new file mode 100644 index 000000000..cb17a27d3 --- /dev/null +++ b/src/runtime/race/testdata/mop_test.go @@ -0,0 +1,1957 @@ +// Copyright 2011 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. 
+ +package race_test + +import ( + "bytes" + "crypto/sha1" + "errors" + "fmt" + "io" + "os" + "runtime" + "sync" + "testing" + "time" + "unsafe" +) + +type Point struct { + x, y int +} + +type NamedPoint struct { + name string + p Point +} + +type DummyWriter struct { + state int +} +type Writer interface { + Write(p []byte) (n int) +} + +func (d DummyWriter) Write(p []byte) (n int) { + return 0 +} + +var GlobalX, GlobalY int = 0, 0 +var GlobalCh chan int = make(chan int, 2) + +func GlobalFunc1() { + GlobalY = GlobalX + GlobalCh <- 1 +} + +func GlobalFunc2() { + GlobalX = 1 + GlobalCh <- 1 +} + +func TestRaceIntRWGlobalFuncs(t *testing.T) { + go GlobalFunc1() + go GlobalFunc2() + <-GlobalCh + <-GlobalCh +} + +func TestRaceIntRWClosures(t *testing.T) { + var x, y int + ch := make(chan int, 2) + + go func() { + y = x + ch <- 1 + }() + go func() { + x = 1 + ch <- 1 + }() + <-ch + <-ch +} + +func TestNoRaceIntRWClosures(t *testing.T) { + var x, y int + ch := make(chan int, 1) + + go func() { + y = x + ch <- 1 + }() + <-ch + go func() { + x = 1 + ch <- 1 + }() + <-ch + +} + +func TestRaceInt32RWClosures(t *testing.T) { + var x, y int32 + ch := make(chan bool, 2) + + go func() { + y = x + ch <- true + }() + go func() { + x = 1 + ch <- true + }() + <-ch + <-ch +} + +func TestNoRaceCase(t *testing.T) { + var y int + for x := -1; x <= 1; x++ { + switch { + case x < 0: + y = -1 + case x == 0: + y = 0 + case x > 0: + y = 1 + } + } + y++ +} + +func TestRaceCaseCondition(t *testing.T) { + var x int = 0 + ch := make(chan int, 2) + + go func() { + x = 2 + ch <- 1 + }() + go func() { + switch x < 2 { + case true: + x = 1 + //case false: + // x = 5 + } + ch <- 1 + }() + <-ch + <-ch +} + +func TestRaceCaseCondition2(t *testing.T) { + // switch body is rearranged by the compiler so the tests + // passes even if we don't instrument '<' + var x int = 0 + ch := make(chan int, 2) + + go func() { + x = 2 + ch <- 1 + }() + go func() { + switch x < 2 { + case true: + x = 1 + case false: + x = 5 + } + ch <- 1 + }() + <-ch + <-ch +} + +func TestRaceCaseBody(t *testing.T) { + var x, y int + ch := make(chan int, 2) + + go func() { + y = x + ch <- 1 + }() + go func() { + switch { + default: + x = 1 + case x == 100: + x = -x + } + ch <- 1 + }() + <-ch + <-ch +} + +func TestNoRaceCaseFallthrough(t *testing.T) { + var x, y, z int + ch := make(chan int, 2) + z = 1 + + go func() { + y = x + ch <- 1 + }() + go func() { + switch { + case z == 1: + case z == 2: + x = 2 + } + ch <- 1 + }() + <-ch + <-ch +} + +func TestRaceCaseFallthrough(t *testing.T) { + var x, y, z int + ch := make(chan int, 2) + z = 1 + + go func() { + y = x + ch <- 1 + }() + go func() { + switch { + case z == 1: + fallthrough + case z == 2: + x = 2 + } + ch <- 1 + }() + + <-ch + <-ch +} + +func TestRaceCaseIssue6418(t *testing.T) { + m := map[string]map[string]string{ + "a": { + "b": "c", + }, + } + ch := make(chan int) + go func() { + m["a"]["x"] = "y" + ch <- 1 + }() + switch m["a"]["b"] { + } + <-ch +} + +func TestRaceCaseType(t *testing.T) { + var x, y int + var i interface{} = x + c := make(chan int, 1) + go func() { + switch i.(type) { + case nil: + case int: + } + c <- 1 + }() + i = y + <-c +} + +func TestRaceCaseTypeBody(t *testing.T) { + var x, y int + var i interface{} = &x + c := make(chan int, 1) + go func() { + switch i := i.(type) { + case nil: + case *int: + *i = y + } + c <- 1 + }() + x = y + <-c +} + +func TestRaceCaseTypeIssue5890(t *testing.T) { + // spurious extra instrumentation of the initial interface + // value. 
+ var x, y int + m := make(map[int]map[int]interface{}) + m[0] = make(map[int]interface{}) + c := make(chan int, 1) + go func() { + switch i := m[0][1].(type) { + case nil: + case *int: + *i = x + } + c <- 1 + }() + m[0][1] = y + <-c +} + +func TestNoRaceRange(t *testing.T) { + ch := make(chan int, 3) + a := [...]int{1, 2, 3} + for _, v := range a { + ch <- v + } + close(ch) +} + +func TestNoRaceRangeIssue5446(t *testing.T) { + ch := make(chan int, 3) + a := []int{1, 2, 3} + b := []int{4} + // used to insert a spurious instrumentation of a[i] + // and crash. + i := 1 + for i, a[i] = range b { + ch <- i + } + close(ch) +} + +func TestRaceRange(t *testing.T) { + const N = 2 + var a [N]int + var x, y int + done := make(chan bool, N) + for i, v := range a { + go func(i int) { + // we don't want a write-vs-write race + // so there is no array b here + if i == 0 { + x = v + } else { + y = v + } + done <- true + }(i) + } + for i := 0; i < N; i++ { + <-done + } +} + +func TestRaceForInit(t *testing.T) { + c := make(chan int) + x := 0 + go func() { + c <- x + }() + for x = 42; false; { + } + <-c +} + +func TestNoRaceForInit(t *testing.T) { + done := make(chan bool) + c := make(chan bool) + x := 0 + go func() { + for { + _, ok := <-c + if !ok { + done <- true + return + } + x++ + } + }() + i := 0 + for x = 42; i < 10; i++ { + c <- true + } + close(c) + <-done +} + +func TestRaceForTest(t *testing.T) { + done := make(chan bool) + c := make(chan bool) + stop := false + go func() { + for { + _, ok := <-c + if !ok { + done <- true + return + } + stop = true + } + }() + for !stop { + c <- true + } + close(c) + <-done +} + +func TestRaceForIncr(t *testing.T) { + done := make(chan bool) + c := make(chan bool) + x := 0 + go func() { + for { + _, ok := <-c + if !ok { + done <- true + return + } + x++ + } + }() + for i := 0; i < 10; x++ { + i++ + c <- true + } + close(c) + <-done +} + +func TestNoRaceForIncr(t *testing.T) { + done := make(chan bool) + x := 0 + go func() { + x++ + done <- true + }() + for i := 0; i < 0; x++ { + } + <-done +} + +func TestRacePlus(t *testing.T) { + var x, y, z int + ch := make(chan int, 2) + + go func() { + y = x + z + ch <- 1 + }() + go func() { + y = x + z + z + ch <- 1 + }() + <-ch + <-ch +} + +func TestRacePlus2(t *testing.T) { + var x, y, z int + ch := make(chan int, 2) + + go func() { + x = 1 + ch <- 1 + }() + go func() { + y = +x + z + ch <- 1 + }() + <-ch + <-ch +} + +func TestNoRacePlus(t *testing.T) { + var x, y, z, f int + ch := make(chan int, 2) + + go func() { + y = x + z + ch <- 1 + }() + go func() { + f = z + x + ch <- 1 + }() + <-ch + <-ch +} + +func TestRaceComplement(t *testing.T) { + var x, y, z int + ch := make(chan int, 2) + + go func() { + x = ^y + ch <- 1 + }() + go func() { + y = ^z + ch <- 1 + }() + <-ch + <-ch +} + +func TestRaceDiv(t *testing.T) { + var x, y, z int + ch := make(chan int, 2) + + go func() { + x = y / (z + 1) + ch <- 1 + }() + go func() { + y = z + ch <- 1 + }() + <-ch + <-ch +} + +func TestRaceDivConst(t *testing.T) { + var x, y, z uint32 + ch := make(chan int, 2) + + go func() { + x = y / 3 // involves only a HMUL node + ch <- 1 + }() + go func() { + y = z + ch <- 1 + }() + <-ch + <-ch +} + +func TestRaceMod(t *testing.T) { + var x, y, z int + ch := make(chan int, 2) + + go func() { + x = y % (z + 1) + ch <- 1 + }() + go func() { + y = z + ch <- 1 + }() + <-ch + <-ch +} + +func TestRaceModConst(t *testing.T) { + var x, y, z int + ch := make(chan int, 2) + + go func() { + x = y % 3 + ch <- 1 + }() + go func() { + y = z + ch <- 1 + }() + 
<-ch + <-ch +} + +func TestRaceRotate(t *testing.T) { + var x, y, z uint32 + ch := make(chan int, 2) + + go func() { + x = y<<12 | y>>20 + ch <- 1 + }() + go func() { + y = z + ch <- 1 + }() + <-ch + <-ch +} + +// May crash if the instrumentation is reckless. +func TestNoRaceEnoughRegisters(t *testing.T) { + // from erf.go + const ( + sa1 = 1 + sa2 = 2 + sa3 = 3 + sa4 = 4 + sa5 = 5 + sa6 = 6 + sa7 = 7 + sa8 = 8 + ) + var s, S float64 + s = 3.1415 + S = 1 + s*(sa1+s*(sa2+s*(sa3+s*(sa4+s*(sa5+s*(sa6+s*(sa7+s*sa8))))))) + s = S +} + +// emptyFunc should not be inlined. +func emptyFunc(x int) { + if false { + fmt.Println(x) + } +} + +func TestRaceFuncArgument(t *testing.T) { + var x int + ch := make(chan bool, 1) + go func() { + emptyFunc(x) + ch <- true + }() + x = 1 + <-ch +} + +func TestRaceFuncArgument2(t *testing.T) { + var x int + ch := make(chan bool, 2) + go func() { + x = 42 + ch <- true + }() + go func(y int) { + ch <- true + }(x) + <-ch + <-ch +} + +func TestRaceSprint(t *testing.T) { + var x int + ch := make(chan bool, 1) + go func() { + fmt.Sprint(x) + ch <- true + }() + x = 1 + <-ch +} + +func TestRaceArrayCopy(t *testing.T) { + ch := make(chan bool, 1) + var a [5]int + go func() { + a[3] = 1 + ch <- true + }() + a = [5]int{1, 2, 3, 4, 5} + <-ch +} + +// Blows up a naive compiler. +func TestRaceNestedArrayCopy(t *testing.T) { + ch := make(chan bool, 1) + type ( + Point32 [2][2][2][2][2]Point + Point1024 [2][2][2][2][2]Point32 + Point32k [2][2][2][2][2]Point1024 + Point1M [2][2][2][2][2]Point32k + ) + var a, b Point1M + go func() { + a[0][1][0][1][0][1][0][1][0][1][0][1][0][1][0][1][0][1][0][1].y = 1 + ch <- true + }() + a = b + <-ch +} + +func TestRaceStructRW(t *testing.T) { + p := Point{0, 0} + ch := make(chan bool, 1) + go func() { + p = Point{1, 1} + ch <- true + }() + q := p + <-ch + p = q +} + +func TestRaceStructFieldRW1(t *testing.T) { + p := Point{0, 0} + ch := make(chan bool, 1) + go func() { + p.x = 1 + ch <- true + }() + _ = p.x + <-ch +} + +func TestNoRaceStructFieldRW1(t *testing.T) { + // Same struct, different variables, no + // pointers. The layout is known (at compile time?) 
-> + // no read on p + // writes on x and y + p := Point{0, 0} + ch := make(chan bool, 1) + go func() { + p.x = 1 + ch <- true + }() + p.y = 1 + <-ch + _ = p +} + +func TestNoRaceStructFieldRW2(t *testing.T) { + // Same as NoRaceStructFieldRW1 + // but p is a pointer, so there is a read on p + p := Point{0, 0} + ch := make(chan bool, 1) + go func() { + p.x = 1 + ch <- true + }() + p.y = 1 + <-ch + _ = p +} + +func TestRaceStructFieldRW2(t *testing.T) { + p := &Point{0, 0} + ch := make(chan bool, 1) + go func() { + p.x = 1 + ch <- true + }() + _ = p.x + <-ch +} + +func TestRaceStructFieldRW3(t *testing.T) { + p := NamedPoint{name: "a", p: Point{0, 0}} + ch := make(chan bool, 1) + go func() { + p.p.x = 1 + ch <- true + }() + _ = p.p.x + <-ch +} + +func TestRaceEfaceWW(t *testing.T) { + var a, b interface{} + ch := make(chan bool, 1) + go func() { + a = 1 + ch <- true + }() + a = 2 + <-ch + _, _ = a, b +} + +func TestRaceIfaceWW(t *testing.T) { + var a, b Writer + ch := make(chan bool, 1) + go func() { + a = DummyWriter{1} + ch <- true + }() + a = DummyWriter{2} + <-ch + b = a + a = b +} + +func TestRaceIfaceCmp(t *testing.T) { + var a, b Writer + a = DummyWriter{1} + ch := make(chan bool, 1) + go func() { + a = DummyWriter{1} + ch <- true + }() + _ = a == b + <-ch +} + +func TestRaceIfaceCmpNil(t *testing.T) { + var a Writer + a = DummyWriter{1} + ch := make(chan bool, 1) + go func() { + a = DummyWriter{1} + ch <- true + }() + _ = a == nil + <-ch +} + +func TestRaceEfaceConv(t *testing.T) { + c := make(chan bool) + v := 0 + go func() { + go func(x interface{}) { + }(v) + c <- true + }() + v = 42 + <-c +} + +type OsFile struct{} + +func (*OsFile) Read() { +} + +type IoReader interface { + Read() +} + +func TestRaceIfaceConv(t *testing.T) { + c := make(chan bool) + f := &OsFile{} + go func() { + go func(x IoReader) { + }(f) + c <- true + }() + f = &OsFile{} + <-c +} + +func TestRaceError(t *testing.T) { + ch := make(chan bool, 1) + var err error + go func() { + err = nil + ch <- true + }() + _ = err + <-ch +} + +func TestRaceIntptrRW(t *testing.T) { + var x, y int + var p *int = &x + ch := make(chan bool, 1) + go func() { + *p = 5 + ch <- true + }() + y = *p + x = y + <-ch +} + +func TestRaceStringRW(t *testing.T) { + ch := make(chan bool, 1) + s := "" + go func() { + s = "abacaba" + ch <- true + }() + _ = s + <-ch +} + +func TestRaceStringPtrRW(t *testing.T) { + ch := make(chan bool, 1) + var x string + p := &x + go func() { + *p = "a" + ch <- true + }() + _ = *p + <-ch +} + +func TestRaceFloat64WW(t *testing.T) { + var x, y float64 + ch := make(chan bool, 1) + go func() { + x = 1.0 + ch <- true + }() + x = 2.0 + <-ch + + y = x + x = y +} + +func TestRaceComplex128WW(t *testing.T) { + var x, y complex128 + ch := make(chan bool, 1) + go func() { + x = 2 + 2i + ch <- true + }() + x = 4 + 4i + <-ch + + y = x + x = y +} + +func TestRaceUnsafePtrRW(t *testing.T) { + var x, y, z int + x, y, z = 1, 2, 3 + var p unsafe.Pointer = unsafe.Pointer(&x) + ch := make(chan bool, 1) + go func() { + p = (unsafe.Pointer)(&z) + ch <- true + }() + y = *(*int)(p) + x = y + <-ch +} + +func TestRaceFuncVariableRW(t *testing.T) { + var f func(x int) int + f = func(x int) int { + return x * x + } + ch := make(chan bool, 1) + go func() { + f = func(x int) int { + return x + } + ch <- true + }() + y := f(1) + <-ch + x := y + y = x +} + +func TestRaceFuncVariableWW(t *testing.T) { + var f func(x int) int + ch := make(chan bool, 1) + go func() { + f = func(x int) int { + return x + } + ch <- true + }() + f = func(x int) 
int { + return x * x + } + <-ch +} + +// This one should not belong to mop_test +func TestRacePanic(t *testing.T) { + var x int + var zero int = 0 + ch := make(chan bool, 2) + go func() { + defer func() { + err := recover() + if err == nil { + panic("should be panicking") + } + x = 1 + ch <- true + }() + var y int = 1 / zero + zero = y + }() + go func() { + defer func() { + err := recover() + if err == nil { + panic("should be panicking") + } + x = 2 + ch <- true + }() + var y int = 1 / zero + zero = y + }() + + <-ch + <-ch + if zero != 0 { + panic("zero has changed") + } +} + +func TestNoRaceBlank(t *testing.T) { + var a [5]int + ch := make(chan bool, 1) + go func() { + _, _ = a[0], a[1] + ch <- true + }() + _, _ = a[2], a[3] + <-ch + a[1] = a[0] +} + +func TestRaceAppendRW(t *testing.T) { + a := make([]int, 10) + ch := make(chan bool) + go func() { + _ = append(a, 1) + ch <- true + }() + a[0] = 1 + <-ch +} + +func TestRaceAppendLenRW(t *testing.T) { + a := make([]int, 0) + ch := make(chan bool) + go func() { + a = append(a, 1) + ch <- true + }() + _ = len(a) + <-ch +} + +func TestRaceAppendCapRW(t *testing.T) { + a := make([]int, 0) + ch := make(chan string) + go func() { + a = append(a, 1) + ch <- "" + }() + _ = cap(a) + <-ch +} + +func TestNoRaceFuncArgsRW(t *testing.T) { + ch := make(chan byte, 1) + var x byte + go func(y byte) { + _ = y + ch <- 0 + }(x) + x = 1 + <-ch +} + +func TestRaceFuncArgsRW(t *testing.T) { + ch := make(chan byte, 1) + var x byte + go func(y *byte) { + _ = *y + ch <- 0 + }(&x) + x = 1 + <-ch +} + +// from the mailing list, slightly modified +// unprotected concurrent access to seen[] +func TestRaceCrawl(t *testing.T) { + url := "dummyurl" + depth := 3 + seen := make(map[string]bool) + ch := make(chan int, 100) + var wg sync.WaitGroup + var crawl func(string, int) + crawl = func(u string, d int) { + nurl := 0 + defer func() { + ch <- nurl + }() + seen[u] = true + if d <= 0 { + return + } + urls := [...]string{"a", "b", "c"} + for _, uu := range urls { + if _, ok := seen[uu]; !ok { + wg.Add(1) + go crawl(uu, d-1) + nurl++ + } + } + wg.Done() + } + wg.Add(1) + go crawl(url, depth) + wg.Wait() +} + +func TestRaceIndirection(t *testing.T) { + ch := make(chan struct{}, 1) + var y int + var x *int = &y + go func() { + *x = 1 + ch <- struct{}{} + }() + *x = 2 + <-ch + _ = *x +} + +func TestRaceRune(t *testing.T) { + c := make(chan bool) + var x rune + go func() { + x = 1 + c <- true + }() + _ = x + <-c +} + +func TestRaceEmptyInterface1(t *testing.T) { + c := make(chan bool) + var x interface{} + go func() { + x = nil + c <- true + }() + _ = x + <-c +} + +func TestRaceEmptyInterface2(t *testing.T) { + c := make(chan bool) + var x interface{} + go func() { + x = &Point{} + c <- true + }() + _ = x + <-c +} + +func TestRaceTLS(t *testing.T) { + comm := make(chan *int) + done := make(chan bool, 2) + go func() { + var x int + comm <- &x + x = 1 + x = *(<-comm) + done <- true + }() + go func() { + p := <-comm + *p = 2 + comm <- p + done <- true + }() + <-done + <-done +} + +func TestNoRaceHeapReallocation(t *testing.T) { + // It is possible that a future implementation + // of memory allocation will ruin this test. + // Increasing n might help in this case, so + // this test is a bit more generic than most of the + // others. 
+ const n = 2 + done := make(chan bool, n) + empty := func(p *int) {} + for i := 0; i < n; i++ { + ms := i + go func() { + <-time.After(time.Duration(ms) * time.Millisecond) + runtime.GC() + var x int + empty(&x) // x goes to the heap + done <- true + }() + } + for i := 0; i < n; i++ { + <-done + } +} + +func TestRaceAnd(t *testing.T) { + c := make(chan bool) + x, y := 0, 0 + go func() { + x = 1 + c <- true + }() + if x == 1 && y == 1 { + } + <-c +} + +func TestRaceAnd2(t *testing.T) { + c := make(chan bool) + x, y := 0, 0 + go func() { + x = 1 + c <- true + }() + if y == 0 && x == 1 { + } + <-c +} + +func TestNoRaceAnd(t *testing.T) { + c := make(chan bool) + x, y := 0, 0 + go func() { + x = 1 + c <- true + }() + if y == 1 && x == 1 { + } + <-c +} + +func TestRaceOr(t *testing.T) { + c := make(chan bool) + x, y := 0, 0 + go func() { + x = 1 + c <- true + }() + if x == 1 || y == 1 { + } + <-c +} + +func TestRaceOr2(t *testing.T) { + c := make(chan bool) + x, y := 0, 0 + go func() { + x = 1 + c <- true + }() + if y == 1 || x == 1 { + } + <-c +} + +func TestNoRaceOr(t *testing.T) { + c := make(chan bool) + x, y := 0, 0 + go func() { + x = 1 + c <- true + }() + if y == 0 || x == 1 { + } + <-c +} + +func TestNoRaceShortCalc(t *testing.T) { + c := make(chan bool) + x, y := 0, 0 + go func() { + y = 1 + c <- true + }() + if x == 0 || y == 0 { + } + <-c +} + +func TestNoRaceShortCalc2(t *testing.T) { + c := make(chan bool) + x, y := 0, 0 + go func() { + y = 1 + c <- true + }() + if x == 1 && y == 0 { + } + <-c +} + +func TestRaceFuncItself(t *testing.T) { + c := make(chan bool) + f := func() {} + go func() { + f() + c <- true + }() + f = func() {} + <-c +} + +func TestNoRaceFuncUnlock(t *testing.T) { + ch := make(chan bool, 1) + var mu sync.Mutex + x := 0 + go func() { + mu.Lock() + x = 42 + mu.Unlock() + ch <- true + }() + x = func(mu *sync.Mutex) int { + mu.Lock() + return 43 + }(&mu) + mu.Unlock() + <-ch +} + +func TestRaceStructInit(t *testing.T) { + type X struct { + x, y int + } + c := make(chan bool, 1) + y := 0 + go func() { + y = 42 + c <- true + }() + x := X{x: y} + _ = x + <-c +} + +func TestRaceArrayInit(t *testing.T) { + c := make(chan bool, 1) + y := 0 + go func() { + y = 42 + c <- true + }() + x := []int{0, y, 42} + _ = x + <-c +} + +func TestRaceMapInit(t *testing.T) { + c := make(chan bool, 1) + y := 0 + go func() { + y = 42 + c <- true + }() + x := map[int]int{0: 42, y: 42} + _ = x + <-c +} + +func TestRaceMapInit2(t *testing.T) { + c := make(chan bool, 1) + y := 0 + go func() { + y = 42 + c <- true + }() + x := map[int]int{0: 42, 42: y} + _ = x + <-c +} + +type Inter interface { + Foo(x int) +} +type InterImpl struct { + x, y int +} + +func (p InterImpl) Foo(x int) { + // prevent inlining + z := 42 + x = 85 + y := x / z + z = y * z + x = z * y + _, _, _ = x, y, z +} + +type InterImpl2 InterImpl + +func (p *InterImpl2) Foo(x int) { + if p == nil { + InterImpl{}.Foo(x) + } + InterImpl(*p).Foo(x) +} + +func TestRaceInterCall(t *testing.T) { + c := make(chan bool, 1) + p := InterImpl{} + var x Inter = p + go func() { + p2 := InterImpl{} + x = p2 + c <- true + }() + x.Foo(0) + <-c +} + +func TestRaceInterCall2(t *testing.T) { + c := make(chan bool, 1) + p := InterImpl{} + var x Inter = p + z := 0 + go func() { + z = 42 + c <- true + }() + x.Foo(z) + <-c +} + +func TestRaceFuncCall(t *testing.T) { + c := make(chan bool, 1) + f := func(x, y int) {} + x, y := 0, 0 + go func() { + y = 42 + c <- true + }() + f(x, y) + <-c +} + +func TestRaceMethodCall(t *testing.T) { + c := make(chan 
bool, 1) + i := InterImpl{} + x := 0 + go func() { + x = 42 + c <- true + }() + i.Foo(x) + <-c +} + +func TestRaceMethodCall2(t *testing.T) { + c := make(chan bool, 1) + i := &InterImpl{} + go func() { + i = &InterImpl{} + c <- true + }() + i.Foo(0) + <-c +} + +// Method value with concrete value receiver. +func TestRaceMethodValue(t *testing.T) { + c := make(chan bool, 1) + i := InterImpl{} + go func() { + i = InterImpl{} + c <- true + }() + _ = i.Foo + <-c +} + +// Method value with interface receiver. +func TestRaceMethodValue2(t *testing.T) { + c := make(chan bool, 1) + var i Inter = InterImpl{} + go func() { + i = InterImpl{} + c <- true + }() + _ = i.Foo + <-c +} + +// Method value with implicit dereference. +func TestRaceMethodValue3(t *testing.T) { + c := make(chan bool, 1) + i := &InterImpl{} + go func() { + *i = InterImpl{} + c <- true + }() + _ = i.Foo // dereferences i. + <-c +} + +// Method value implicitly taking receiver address. +func TestNoRaceMethodValue(t *testing.T) { + c := make(chan bool, 1) + i := InterImpl2{} + go func() { + i = InterImpl2{} + c <- true + }() + _ = i.Foo // takes the address of i only. + <-c +} + +func TestRacePanicArg(t *testing.T) { + c := make(chan bool, 1) + err := errors.New("err") + go func() { + err = errors.New("err2") + c <- true + }() + defer func() { + recover() + <-c + }() + panic(err) +} + +func TestRaceDeferArg(t *testing.T) { + c := make(chan bool, 1) + x := 0 + go func() { + x = 42 + c <- true + }() + func() { + defer func(x int) { + }(x) + }() + <-c +} + +type DeferT int + +func (d DeferT) Foo() { +} + +func TestRaceDeferArg2(t *testing.T) { + c := make(chan bool, 1) + var x DeferT + go func() { + var y DeferT + x = y + c <- true + }() + func() { + defer x.Foo() + }() + <-c +} + +func TestNoRaceAddrExpr(t *testing.T) { + c := make(chan bool, 1) + x := 0 + go func() { + x = 42 + c <- true + }() + _ = &x + <-c +} + +type AddrT struct { + _ [256]byte + x int +} + +type AddrT2 struct { + _ [512]byte + p *AddrT +} + +func TestRaceAddrExpr(t *testing.T) { + c := make(chan bool, 1) + a := AddrT2{p: &AddrT{x: 42}} + go func() { + a.p = &AddrT{x: 43} + c <- true + }() + _ = &a.p.x + <-c +} + +func TestRaceTypeAssert(t *testing.T) { + c := make(chan bool, 1) + x := 0 + var i interface{} = x + go func() { + y := 0 + i = y + c <- true + }() + _ = i.(int) + <-c +} + +func TestRaceBlockAs(t *testing.T) { + c := make(chan bool, 1) + var x, y int + go func() { + x = 42 + c <- true + }() + x, y = y, x + <-c +} + +func TestRaceSliceSlice(t *testing.T) { + c := make(chan bool, 1) + x := make([]int, 10) + go func() { + x = make([]int, 20) + c <- true + }() + _ = x[2:3] + <-c +} + +func TestRaceSliceSlice2(t *testing.T) { + c := make(chan bool, 1) + x := make([]int, 10) + i := 2 + go func() { + i = 3 + c <- true + }() + _ = x[i:4] + <-c +} + +func TestRaceSliceString(t *testing.T) { + c := make(chan bool, 1) + x := "hello" + go func() { + x = "world" + c <- true + }() + _ = x[2:3] + <-c +} + +func TestRaceSliceStruct(t *testing.T) { + type X struct { + x, y int + } + c := make(chan bool, 1) + x := make([]X, 10) + go func() { + y := make([]X, 10) + copy(y, x) + c <- true + }() + x[1].y = 42 + <-c +} + +func TestRaceAppendSliceStruct(t *testing.T) { + type X struct { + x, y int + } + c := make(chan bool, 1) + x := make([]X, 10) + go func() { + y := make([]X, 0, 10) + y = append(y, x...) 
+ c <- true + }() + x[1].y = 42 + <-c +} + +func TestRaceStructInd(t *testing.T) { + c := make(chan bool, 1) + type Item struct { + x, y int + } + i := Item{} + go func(p *Item) { + *p = Item{} + c <- true + }(&i) + i.y = 42 + <-c +} + +func TestRaceAsFunc1(t *testing.T) { + var s []byte + c := make(chan bool, 1) + go func() { + var err error + s, err = func() ([]byte, error) { + t := []byte("hello world") + return t, nil + }() + c <- true + _ = err + }() + _ = string(s) + <-c +} + +func TestRaceAsFunc2(t *testing.T) { + c := make(chan bool, 1) + x := 0 + go func() { + func(x int) { + }(x) + c <- true + }() + x = 42 + <-c +} + +func TestRaceAsFunc3(t *testing.T) { + c := make(chan bool, 1) + var mu sync.Mutex + x := 0 + go func() { + func(x int) { + mu.Lock() + }(x) // Read of x must be outside of the mutex. + mu.Unlock() + c <- true + }() + mu.Lock() + x = 42 + mu.Unlock() + <-c +} + +func TestNoRaceAsFunc4(t *testing.T) { + c := make(chan bool, 1) + var mu sync.Mutex + x := 0 + go func() { + x = func() int { // Write of x must be under the mutex. + mu.Lock() + return 42 + }() + mu.Unlock() + c <- true + }() + mu.Lock() + x = 42 + mu.Unlock() + <-c +} + +func TestRaceHeapParam(t *testing.T) { + x := func() (x int) { + go func() { + x = 42 + }() + return + }() + _ = x +} + +func TestNoRaceEmptyStruct(t *testing.T) { + type Empty struct{} + type X struct { + y int64 + Empty + } + type Y struct { + x X + y int64 + } + c := make(chan X) + var y Y + go func() { + x := y.x + c <- x + }() + y.y = 42 + <-c +} + +func TestRaceNestedStruct(t *testing.T) { + type X struct { + x, y int + } + type Y struct { + x X + } + c := make(chan Y) + var y Y + go func() { + c <- y + }() + y.x.y = 42 + <-c +} + +func TestRaceIssue5567(t *testing.T) { + defer runtime.GOMAXPROCS(runtime.GOMAXPROCS(4)) + in := make(chan []byte) + res := make(chan error) + go func() { + var err error + defer func() { + close(in) + res <- err + }() + path := "mop_test.go" + f, err := os.Open(path) + if err != nil { + return + } + defer f.Close() + var n, total int + b := make([]byte, 17) // the race is on b buffer + for err == nil { + n, err = f.Read(b) + total += n + if n > 0 { + in <- b[:n] + } + } + if err == io.EOF { + err = nil + } + }() + h := sha1.New() + for b := range in { + h.Write(b) + } + _ = h.Sum(nil) + err := <-res + if err != nil { + t.Fatal(err) + } +} + +func TestRaceIssue5654(t *testing.T) { + text := `Friends, Romans, countrymen, lend me your ears; +I come to bury Caesar, not to praise him. +The evil that men do lives after them; +The good is oft interred with their bones; +So let it be with Caesar. The noble Brutus +Hath told you Caesar was ambitious: +If it were so, it was a grievous fault, +And grievously hath Caesar answer'd it. +Here, under leave of Brutus and the rest - +For Brutus is an honourable man; +So are they all, all honourable men - +Come I to speak in Caesar's funeral. 
+He was my friend, faithful and just to me: +But Brutus says he was ambitious; +And Brutus is an honourable man.` + + data := bytes.NewBufferString(text) + in := make(chan []byte) + + go func() { + buf := make([]byte, 16) + var n int + var err error + for ; err == nil; n, err = data.Read(buf) { + in <- buf[:n] + } + close(in) + }() + res := "" + for s := range in { + res += string(s) + } + _ = res +} + +type Base int + +func (b *Base) Foo() int { + return 42 +} + +func (b Base) Bar() int { + return int(b) +} + +func TestNoRaceMethodThunk(t *testing.T) { + type Derived struct { + pad int + Base + } + var d Derived + done := make(chan bool) + go func() { + _ = d.Foo() + done <- true + }() + d = Derived{} + <-done +} + +func TestRaceMethodThunk(t *testing.T) { + type Derived struct { + pad int + *Base + } + var d Derived + done := make(chan bool) + go func() { + _ = d.Foo() + done <- true + }() + d = Derived{} + <-done +} + +func TestRaceMethodThunk2(t *testing.T) { + type Derived struct { + pad int + Base + } + var d Derived + done := make(chan bool) + go func() { + _ = d.Bar() + done <- true + }() + d = Derived{} + <-done +} + +func TestRaceMethodThunk3(t *testing.T) { + type Derived struct { + pad int + *Base + } + var d Derived + d.Base = new(Base) + done := make(chan bool) + go func() { + _ = d.Bar() + done <- true + }() + d.Base = new(Base) + <-done +} + +func TestRaceMethodThunk4(t *testing.T) { + type Derived struct { + pad int + *Base + } + var d Derived + d.Base = new(Base) + done := make(chan bool) + go func() { + _ = d.Bar() + done <- true + }() + *(*int)(d.Base) = 42 + <-done +} + +func TestNoRaceTinyAlloc(t *testing.T) { + const P = 4 + const N = 1e6 + var tinySink *byte + done := make(chan bool) + for p := 0; p < P; p++ { + go func() { + for i := 0; i < N; i++ { + var b byte + if b != 0 { + tinySink = &b // make it heap allocated + } + b = 42 + } + done <- true + }() + } + for p := 0; p < P; p++ { + <-done + } +} diff --git a/src/runtime/race/testdata/mutex_test.go b/src/runtime/race/testdata/mutex_test.go new file mode 100644 index 000000000..3cf03ae6b --- /dev/null +++ b/src/runtime/race/testdata/mutex_test.go @@ -0,0 +1,138 @@ +// Copyright 2012 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. 
+ +package race_test + +import ( + "sync" + "testing" + "time" +) + +func TestNoRaceMutex(t *testing.T) { + var mu sync.Mutex + var x int16 = 0 + ch := make(chan bool, 2) + go func() { + mu.Lock() + defer mu.Unlock() + x = 1 + ch <- true + }() + go func() { + mu.Lock() + x = 2 + mu.Unlock() + ch <- true + }() + <-ch + <-ch +} + +func TestRaceMutex(t *testing.T) { + var mu sync.Mutex + var x int16 = 0 + ch := make(chan bool, 2) + go func() { + x = 1 + mu.Lock() + defer mu.Unlock() + ch <- true + }() + go func() { + x = 2 + mu.Lock() + mu.Unlock() + ch <- true + }() + <-ch + <-ch +} + +func TestRaceMutex2(t *testing.T) { + var mu1 sync.Mutex + var mu2 sync.Mutex + var x int8 = 0 + ch := make(chan bool, 2) + go func() { + mu1.Lock() + defer mu1.Unlock() + x = 1 + ch <- true + }() + go func() { + mu2.Lock() + x = 2 + mu2.Unlock() + ch <- true + }() + <-ch + <-ch +} + +func TestNoRaceMutexPureHappensBefore(t *testing.T) { + var mu sync.Mutex + var x int16 = 0 + ch := make(chan bool, 2) + go func() { + x = 1 + mu.Lock() + mu.Unlock() + ch <- true + }() + go func() { + <-time.After(1e5) + mu.Lock() + mu.Unlock() + x = 1 + ch <- true + }() + <-ch + <-ch +} + +func TestNoRaceMutexSemaphore(t *testing.T) { + var mu sync.Mutex + ch := make(chan bool, 2) + x := 0 + mu.Lock() + go func() { + x = 1 + mu.Unlock() + ch <- true + }() + go func() { + mu.Lock() + x = 2 + mu.Unlock() + ch <- true + }() + <-ch + <-ch +} + +// from doc/go_mem.html +func TestNoRaceMutexExampleFromHtml(t *testing.T) { + var l sync.Mutex + a := "" + + l.Lock() + go func() { + a = "hello, world" + l.Unlock() + }() + l.Lock() + _ = a +} + +func TestRaceMutexOverwrite(t *testing.T) { + c := make(chan bool, 1) + var mu sync.Mutex + go func() { + mu = sync.Mutex{} + c <- true + }() + mu.Lock() + <-c +} diff --git a/src/runtime/race/testdata/regression_test.go b/src/runtime/race/testdata/regression_test.go new file mode 100644 index 000000000..d461269d9 --- /dev/null +++ b/src/runtime/race/testdata/regression_test.go @@ -0,0 +1,194 @@ +// Copyright 2012 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// Code patterns that caused problems in the past. 
+ +package race_test + +import ( + "testing" +) + +type LogImpl struct { + x int +} + +func NewLog() (l LogImpl) { + c := make(chan bool) + go func() { + _ = l + c <- true + }() + l = LogImpl{} + <-c + return +} + +var _ LogImpl = NewLog() + +func MakeMap() map[int]int { + return make(map[int]int) +} + +func InstrumentMapLen() { + _ = len(MakeMap()) +} + +func InstrumentMapLen2() { + m := make(map[int]map[int]int) + _ = len(m[0]) +} + +func InstrumentMapLen3() { + m := make(map[int]*map[int]int) + _ = len(*m[0]) +} + +func TestRaceUnaddressableMapLen(t *testing.T) { + m := make(map[int]map[int]int) + ch := make(chan int, 1) + m[0] = make(map[int]int) + go func() { + _ = len(m[0]) + ch <- 0 + }() + m[0][0] = 1 + <-ch +} + +type Rect struct { + x, y int +} + +type Image struct { + min, max Rect +} + +func NewImage() Image { + var pleaseDoNotInlineMe stack + pleaseDoNotInlineMe.push(1) + _ = pleaseDoNotInlineMe.pop() + return Image{} +} + +func AddrOfTemp() { + _ = NewImage().min +} + +type TypeID int + +func (t *TypeID) encodeType(x int) (tt TypeID, err error) { + switch x { + case 0: + return t.encodeType(x * x) + } + return 0, nil +} + +type stack []int + +func (s *stack) push(x int) { + *s = append(*s, x) +} + +func (s *stack) pop() int { + i := len(*s) + n := (*s)[i-1] + *s = (*s)[:i-1] + return n +} + +func TestNoRaceStackPushPop(t *testing.T) { + var s stack + go func(s *stack) {}(&s) + s.push(1) + x := s.pop() + _ = x +} + +type RpcChan struct { + c chan bool +} + +var makeChanCalls int + +func makeChan() *RpcChan { + var pleaseDoNotInlineMe stack + pleaseDoNotInlineMe.push(1) + _ = pleaseDoNotInlineMe.pop() + + makeChanCalls++ + c := &RpcChan{make(chan bool, 1)} + c.c <- true + return c +} + +func call() bool { + x := <-makeChan().c + return x +} + +func TestNoRaceRpcChan(t *testing.T) { + makeChanCalls = 0 + _ = call() + if makeChanCalls != 1 { + t.Fatalf("makeChanCalls %d, expected 1\n", makeChanCalls) + } +} + +func divInSlice() { + v := make([]int64, 10) + i := 1 + _ = v[(i*4)/3] +} + +func TestNoRaceReturn(t *testing.T) { + c := make(chan int) + noRaceReturn(c) + <-c +} + +// Return used to do an implicit a = a, causing a read/write race +// with the goroutine. Compiler has an optimization to avoid that now. +// See issue 4014. +func noRaceReturn(c chan int) (a, b int) { + a = 42 + go func() { + _ = a + c <- 1 + }() + return a, 10 +} + +func issue5431() { + var p **inltype + if inlinetest(p).x && inlinetest(p).y { + } else if inlinetest(p).x || inlinetest(p).y { + } +} + +type inltype struct { + x, y bool +} + +func inlinetest(p **inltype) *inltype { + return *p +} + +type iface interface { + Foo() *struct{ b bool } +} + +type Int int + +func (i Int) Foo() *struct{ b bool } { + return &struct{ b bool }{false} +} + +func TestNoRaceForInfiniteLoop(t *testing.T) { + var x Int + // interface conversion causes nodes to be put on init list + for iface(x).Foo().b { + } +} diff --git a/src/runtime/race/testdata/rwmutex_test.go b/src/runtime/race/testdata/rwmutex_test.go new file mode 100644 index 000000000..85cb5df3c --- /dev/null +++ b/src/runtime/race/testdata/rwmutex_test.go @@ -0,0 +1,134 @@ +// Copyright 2012 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. 
+ +package race_test + +import ( + "sync" + "testing" + "time" +) + +func TestRaceMutexRWMutex(t *testing.T) { + var mu1 sync.Mutex + var mu2 sync.RWMutex + var x int16 = 0 + ch := make(chan bool, 2) + go func() { + mu1.Lock() + defer mu1.Unlock() + x = 1 + ch <- true + }() + go func() { + mu2.Lock() + x = 2 + mu2.Unlock() + ch <- true + }() + <-ch + <-ch +} + +func TestNoRaceRWMutex(t *testing.T) { + var mu sync.RWMutex + var x, y int64 = 0, 1 + ch := make(chan bool, 2) + go func() { + mu.Lock() + defer mu.Unlock() + x = 2 + ch <- true + }() + go func() { + mu.RLock() + y = x + mu.RUnlock() + ch <- true + }() + <-ch + <-ch +} + +func TestRaceRWMutexMultipleReaders(t *testing.T) { + var mu sync.RWMutex + var x, y int64 = 0, 1 + ch := make(chan bool, 3) + go func() { + mu.Lock() + defer mu.Unlock() + x = 2 + ch <- true + }() + go func() { + mu.RLock() + y = x + 1 + mu.RUnlock() + ch <- true + }() + go func() { + mu.RLock() + y = x + 2 + mu.RUnlock() + ch <- true + }() + <-ch + <-ch + <-ch + _ = y +} + +func TestNoRaceRWMutexMultipleReaders(t *testing.T) { + var mu sync.RWMutex + x := int64(0) + ch := make(chan bool, 3) + go func() { + mu.Lock() + defer mu.Unlock() + x = 2 + ch <- true + }() + go func() { + mu.RLock() + y := x + 1 + _ = y + mu.RUnlock() + ch <- true + }() + go func() { + mu.RLock() + y := x + 2 + _ = y + mu.RUnlock() + ch <- true + }() + <-ch + <-ch + <-ch +} + +func TestNoRaceRWMutexTransitive(t *testing.T) { + var mu sync.RWMutex + x := int64(0) + ch := make(chan bool, 2) + go func() { + mu.RLock() + _ = x + mu.RUnlock() + ch <- true + }() + go func() { + time.Sleep(1e7) + mu.RLock() + _ = x + mu.RUnlock() + ch <- true + }() + time.Sleep(2e7) + mu.Lock() + x = 42 + mu.Unlock() + <-ch + <-ch +} diff --git a/src/runtime/race/testdata/select_test.go b/src/runtime/race/testdata/select_test.go new file mode 100644 index 000000000..4a3a23647 --- /dev/null +++ b/src/runtime/race/testdata/select_test.go @@ -0,0 +1,286 @@ +// Copyright 2012 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package race_test + +import ( + "runtime" + "testing" +) + +func TestNoRaceSelect1(t *testing.T) { + var x int + compl := make(chan bool) + c := make(chan bool) + c1 := make(chan bool) + + go func() { + x = 1 + // At least two channels are needed because + // otherwise the compiler optimizes select out. + // See comment in runtime/chan.c:^selectgo. 
+ select { + case c <- true: + case c1 <- true: + } + compl <- true + }() + select { + case <-c: + case c1 <- true: + } + x = 2 + <-compl +} + +func TestNoRaceSelect2(t *testing.T) { + var x int + compl := make(chan bool) + c := make(chan bool) + c1 := make(chan bool) + go func() { + select { + case <-c: + case <-c1: + } + x = 1 + compl <- true + }() + x = 2 + close(c) + runtime.Gosched() + <-compl +} + +func TestNoRaceSelect3(t *testing.T) { + var x int + compl := make(chan bool) + c := make(chan bool, 10) + c1 := make(chan bool) + go func() { + x = 1 + select { + case c <- true: + case <-c1: + } + compl <- true + }() + <-c + x = 2 + <-compl +} + +func TestNoRaceSelect4(t *testing.T) { + type Task struct { + f func() + done chan bool + } + + queue := make(chan Task) + dummy := make(chan bool) + + go func() { + for { + select { + case t := <-queue: + t.f() + t.done <- true + } + } + }() + + doit := func(f func()) { + done := make(chan bool, 1) + select { + case queue <- Task{f, done}: + case <-dummy: + } + select { + case <-done: + case <-dummy: + } + } + + var x int + doit(func() { + x = 1 + }) + _ = x +} + +func TestNoRaceSelect5(t *testing.T) { + test := func(sel, needSched bool) { + var x int + ch := make(chan bool) + c1 := make(chan bool) + + done := make(chan bool, 2) + go func() { + if needSched { + runtime.Gosched() + } + // println(1) + x = 1 + if sel { + select { + case ch <- true: + case <-c1: + } + } else { + ch <- true + } + done <- true + }() + + go func() { + // println(2) + if sel { + select { + case <-ch: + case <-c1: + } + } else { + <-ch + } + x = 1 + done <- true + }() + <-done + <-done + } + + test(true, true) + test(true, false) + test(false, true) + test(false, false) +} + +func TestRaceSelect1(t *testing.T) { + var x int + compl := make(chan bool, 2) + c := make(chan bool) + c1 := make(chan bool) + + go func() { + <-c + <-c + }() + f := func() { + select { + case c <- true: + case c1 <- true: + } + x = 1 + compl <- true + } + go f() + go f() + <-compl + <-compl +} + +func TestRaceSelect2(t *testing.T) { + var x int + compl := make(chan bool) + c := make(chan bool) + c1 := make(chan bool) + go func() { + x = 1 + select { + case <-c: + case <-c1: + } + compl <- true + }() + close(c) + x = 2 + <-compl +} + +func TestRaceSelect3(t *testing.T) { + var x int + compl := make(chan bool) + c := make(chan bool) + c1 := make(chan bool) + go func() { + x = 1 + select { + case c <- true: + case c1 <- true: + } + compl <- true + }() + x = 2 + select { + case <-c: + } + <-compl +} + +func TestRaceSelect4(t *testing.T) { + done := make(chan bool, 1) + var x int + go func() { + select { + default: + x = 2 + } + done <- true + }() + _ = x + <-done +} + +// The idea behind this test: +// there are two variables, access to one +// of them is synchronized, access to the other +// is not. +// Select must (unconditionaly) choose the non-synchronized variable +// thus causing exactly one race. +// Currently this test doesn't look like it accomplishes +// this goal. 
+func TestRaceSelect5(t *testing.T) { + done := make(chan bool, 1) + c1 := make(chan bool, 1) + c2 := make(chan bool) + var x, y int + go func() { + select { + case c1 <- true: + x = 1 + case c2 <- true: + y = 1 + } + done <- true + }() + _ = x + _ = y + <-done +} + +// select statements may introduce +// flakiness: whether this test contains +// a race depends on the scheduling +// (some may argue that the code contains +// this race by definition) +/* +func TestFlakyDefault(t *testing.T) { + var x int + c := make(chan bool, 1) + done := make(chan bool, 1) + go func() { + select { + case <-c: + x = 2 + default: + x = 3 + } + done <- true + }() + x = 1 + c <- true + _ = x + <-done +} +*/ diff --git a/src/runtime/race/testdata/slice_test.go b/src/runtime/race/testdata/slice_test.go new file mode 100644 index 000000000..5702d1ac8 --- /dev/null +++ b/src/runtime/race/testdata/slice_test.go @@ -0,0 +1,485 @@ +// Copyright 2012 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package race_test + +import ( + "testing" +) + +func TestRaceSliceRW(t *testing.T) { + ch := make(chan bool, 1) + a := make([]int, 2) + go func() { + a[1] = 1 + ch <- true + }() + _ = a[1] + <-ch +} + +func TestNoRaceSliceRW(t *testing.T) { + ch := make(chan bool, 1) + a := make([]int, 2) + go func() { + a[0] = 1 + ch <- true + }() + _ = a[1] + <-ch +} + +func TestRaceSliceWW(t *testing.T) { + a := make([]int, 10) + ch := make(chan bool, 1) + go func() { + a[1] = 1 + ch <- true + }() + a[1] = 2 + <-ch +} + +func TestNoRaceArrayWW(t *testing.T) { + var a [5]int + ch := make(chan bool, 1) + go func() { + a[0] = 1 + ch <- true + }() + a[1] = 2 + <-ch +} + +func TestRaceArrayWW(t *testing.T) { + var a [5]int + ch := make(chan bool, 1) + go func() { + a[1] = 1 + ch <- true + }() + a[1] = 2 + <-ch +} + +func TestNoRaceSliceWriteLen(t *testing.T) { + ch := make(chan bool, 1) + a := make([]bool, 1) + go func() { + a[0] = true + ch <- true + }() + _ = len(a) + <-ch +} + +func TestNoRaceSliceWriteCap(t *testing.T) { + ch := make(chan bool, 1) + a := make([]uint64, 100) + go func() { + a[50] = 123 + ch <- true + }() + _ = cap(a) + <-ch +} + +func TestRaceSliceCopyRead(t *testing.T) { + ch := make(chan bool, 1) + a := make([]int, 10) + b := make([]int, 10) + go func() { + _ = a[5] + ch <- true + }() + copy(a, b) + <-ch +} + +func TestNoRaceSliceWriteCopy(t *testing.T) { + ch := make(chan bool, 1) + a := make([]int, 10) + b := make([]int, 10) + go func() { + a[5] = 1 + ch <- true + }() + copy(a[:5], b[:5]) + <-ch +} + +func TestRaceSliceCopyWrite2(t *testing.T) { + ch := make(chan bool, 1) + a := make([]int, 10) + b := make([]int, 10) + go func() { + b[5] = 1 + ch <- true + }() + copy(a, b) + <-ch +} + +func TestRaceSliceCopyWrite3(t *testing.T) { + ch := make(chan bool, 1) + a := make([]byte, 10) + go func() { + a[7] = 1 + ch <- true + }() + copy(a, "qwertyqwerty") + <-ch +} + +func TestNoRaceSliceCopyRead(t *testing.T) { + ch := make(chan bool, 1) + a := make([]int, 10) + b := make([]int, 10) + go func() { + _ = b[5] + ch <- true + }() + copy(a, b) + <-ch +} + +func TestNoRaceSliceWriteSlice2(t *testing.T) { + ch := make(chan bool, 1) + a := make([]float64, 10) + go func() { + a[2] = 1.0 + ch <- true + }() + _ = a[0:5] + <-ch +} + +func TestRaceSliceWriteSlice(t *testing.T) { + ch := make(chan bool, 1) + a := make([]float64, 10) + go func() { + a[2] = 1.0 + ch <- true + }() + a = a[5:10] + <-ch +} + +func TestNoRaceSliceWriteSlice(t 
*testing.T) { + ch := make(chan bool, 1) + a := make([]float64, 10) + go func() { + a[2] = 1.0 + ch <- true + }() + _ = a[5:10] + <-ch +} + +func TestNoRaceSliceLenCap(t *testing.T) { + ch := make(chan bool, 1) + a := make([]struct{}, 10) + go func() { + _ = len(a) + ch <- true + }() + _ = cap(a) + <-ch +} + +func TestNoRaceStructSlicesRangeWrite(t *testing.T) { + type Str struct { + a []int + b []int + } + ch := make(chan bool, 1) + var s Str + s.a = make([]int, 10) + s.b = make([]int, 10) + go func() { + for range s.a { + } + ch <- true + }() + s.b[5] = 5 + <-ch +} + +func TestRaceSliceDifferent(t *testing.T) { + c := make(chan bool, 1) + s := make([]int, 10) + s2 := s + go func() { + s[3] = 3 + c <- true + }() + // false negative because s2 is PAUTO w/o PHEAP + // so we do not instrument it + s2[3] = 3 + <-c +} + +func TestRaceSliceRangeWrite(t *testing.T) { + c := make(chan bool, 1) + s := make([]int, 10) + go func() { + s[3] = 3 + c <- true + }() + for _, v := range s { + _ = v + } + <-c +} + +func TestNoRaceSliceRangeWrite(t *testing.T) { + c := make(chan bool, 1) + s := make([]int, 10) + go func() { + s[3] = 3 + c <- true + }() + for range s { + } + <-c +} + +func TestRaceSliceRangeAppend(t *testing.T) { + c := make(chan bool, 1) + s := make([]int, 10) + go func() { + s = append(s, 3) + c <- true + }() + for range s { + } + <-c +} + +func TestNoRaceSliceRangeAppend(t *testing.T) { + c := make(chan bool, 1) + s := make([]int, 10) + go func() { + _ = append(s, 3) + c <- true + }() + for range s { + } + <-c +} + +func TestRaceSliceVarWrite(t *testing.T) { + c := make(chan bool, 1) + s := make([]int, 10) + go func() { + s[3] = 3 + c <- true + }() + s = make([]int, 20) + <-c +} + +func TestRaceSliceVarRead(t *testing.T) { + c := make(chan bool, 1) + s := make([]int, 10) + go func() { + _ = s[3] + c <- true + }() + s = make([]int, 20) + <-c +} + +func TestRaceSliceVarRange(t *testing.T) { + c := make(chan bool, 1) + s := make([]int, 10) + go func() { + for range s { + } + c <- true + }() + s = make([]int, 20) + <-c +} + +func TestRaceSliceVarAppend(t *testing.T) { + c := make(chan bool, 1) + s := make([]int, 10) + go func() { + _ = append(s, 10) + c <- true + }() + s = make([]int, 20) + <-c +} + +func TestRaceSliceVarCopy(t *testing.T) { + c := make(chan bool, 1) + s := make([]int, 10) + go func() { + s2 := make([]int, 10) + copy(s, s2) + c <- true + }() + s = make([]int, 20) + <-c +} + +func TestRaceSliceVarCopy2(t *testing.T) { + c := make(chan bool, 1) + s := make([]int, 10) + go func() { + s2 := make([]int, 10) + copy(s2, s) + c <- true + }() + s = make([]int, 20) + <-c +} + +func TestRaceSliceAppend(t *testing.T) { + c := make(chan bool, 1) + s := make([]int, 10, 20) + go func() { + _ = append(s, 1) + c <- true + }() + _ = append(s, 2) + <-c +} + +func TestRaceSliceAppendWrite(t *testing.T) { + c := make(chan bool, 1) + s := make([]int, 10) + go func() { + _ = append(s, 1) + c <- true + }() + s[0] = 42 + <-c +} + +func TestRaceSliceAppendSlice(t *testing.T) { + c := make(chan bool, 1) + s := make([]int, 10) + go func() { + s2 := make([]int, 10) + _ = append(s, s2...) + c <- true + }() + s[0] = 42 + <-c +} + +func TestRaceSliceAppendSlice2(t *testing.T) { + c := make(chan bool, 1) + s := make([]int, 10) + s2foobar := make([]int, 10) + go func() { + _ = append(s, s2foobar...) + c <- true + }() + s2foobar[5] = 42 + <-c +} + +func TestRaceSliceAppendString(t *testing.T) { + c := make(chan bool, 1) + s := make([]byte, 10) + go func() { + _ = append(s, "qwerty"...) 
+ c <- true + }() + s[0] = 42 + <-c +} + +func TestNoRaceSliceIndexAccess(t *testing.T) { + c := make(chan bool, 1) + s := make([]int, 10) + v := 0 + go func() { + _ = v + c <- true + }() + s[v] = 1 + <-c +} + +func TestNoRaceSliceIndexAccess2(t *testing.T) { + c := make(chan bool, 1) + s := make([]int, 10) + v := 0 + go func() { + _ = v + c <- true + }() + _ = s[v] + <-c +} + +func TestRaceSliceIndexAccess(t *testing.T) { + c := make(chan bool, 1) + s := make([]int, 10) + v := 0 + go func() { + v = 1 + c <- true + }() + s[v] = 1 + <-c +} + +func TestRaceSliceIndexAccess2(t *testing.T) { + c := make(chan bool, 1) + s := make([]int, 10) + v := 0 + go func() { + v = 1 + c <- true + }() + _ = s[v] + <-c +} + +func TestRaceSliceByteToString(t *testing.T) { + c := make(chan string) + s := make([]byte, 10) + go func() { + c <- string(s) + }() + s[0] = 42 + <-c +} + +func TestRaceSliceRuneToString(t *testing.T) { + c := make(chan string) + s := make([]rune, 10) + go func() { + c <- string(s) + }() + s[9] = 42 + <-c +} + +func TestRaceConcatString(t *testing.T) { + s := "hello" + c := make(chan string, 1) + go func() { + c <- s + " world" + }() + s = "world" + <-c +} + +func TestRaceCompareString(t *testing.T) { + s1 := "hello" + s2 := "world" + c := make(chan bool, 1) + go func() { + c <- s1 == s2 + }() + s1 = s2 + <-c +} diff --git a/src/runtime/race/testdata/sync_test.go b/src/runtime/race/testdata/sync_test.go new file mode 100644 index 000000000..93af0b1e6 --- /dev/null +++ b/src/runtime/race/testdata/sync_test.go @@ -0,0 +1,216 @@ +// Copyright 2011 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package race_test + +import ( + "sync" + "testing" + "time" +) + +func TestNoRaceCond(t *testing.T) { // tsan's test02 + ch := make(chan bool, 1) + var x int = 0 + var mu sync.Mutex + var cond *sync.Cond = sync.NewCond(&mu) + var condition int = 0 + var waker func() + waker = func() { + x = 1 + mu.Lock() + condition = 1 + cond.Signal() + mu.Unlock() + } + + var waiter func() + waiter = func() { + go waker() + cond.L.Lock() + for condition != 1 { + cond.Wait() + } + cond.L.Unlock() + x = 2 + ch <- true + } + go waiter() + <-ch +} + +func TestRaceCond(t *testing.T) { // tsan's test50 + ch := make(chan bool, 2) + + var x int = 0 + var mu sync.Mutex + var condition int = 0 + var cond *sync.Cond = sync.NewCond(&mu) + + var waker func() = func() { + <-time.After(1e5) + x = 1 + mu.Lock() + condition = 1 + cond.Signal() + mu.Unlock() + <-time.After(1e5) + mu.Lock() + x = 3 + mu.Unlock() + ch <- true + } + + var waiter func() = func() { + mu.Lock() + for condition != 1 { + cond.Wait() + } + mu.Unlock() + x = 2 + ch <- true + } + x = 0 + go waker() + go waiter() + <-ch + <-ch +} + +// We do not currently automatically +// parse this test. 
It is intended that the creation +// stack is observed manually not to contain +// off-by-one errors +func TestRaceAnnounceThreads(t *testing.T) { + const N = 7 + allDone := make(chan bool, N) + + var x int + + var f, g, h func() + f = func() { + x = 1 + go g() + go func() { + x = 1 + allDone <- true + }() + x = 2 + allDone <- true + } + + g = func() { + for i := 0; i < 2; i++ { + go func() { + x = 1 + allDone <- true + }() + allDone <- true + } + } + + h = func() { + x = 1 + x = 2 + go f() + allDone <- true + } + + go h() + + for i := 0; i < N; i++ { + <-allDone + } +} + +func TestNoRaceAfterFunc1(t *testing.T) { + i := 2 + c := make(chan bool) + var f func() + f = func() { + i-- + if i >= 0 { + time.AfterFunc(0, f) + } else { + c <- true + } + } + + time.AfterFunc(0, f) + <-c +} + +func TestNoRaceAfterFunc2(t *testing.T) { + var x int + timer := time.AfterFunc(10, func() { + x = 1 + }) + defer timer.Stop() + _ = x +} + +func TestNoRaceAfterFunc3(t *testing.T) { + c := make(chan bool, 1) + x := 0 + time.AfterFunc(1e7, func() { + x = 1 + c <- true + }) + <-c +} + +func TestRaceAfterFunc3(t *testing.T) { + c := make(chan bool, 2) + x := 0 + time.AfterFunc(1e7, func() { + x = 1 + c <- true + }) + time.AfterFunc(2e7, func() { + x = 2 + c <- true + }) + <-c + <-c +} + +// This test's output is intended to be +// observed manually. One should check +// that goroutine creation stack is +// comprehensible. +func TestRaceGoroutineCreationStack(t *testing.T) { + var x int + var ch = make(chan bool, 1) + + f1 := func() { + x = 1 + ch <- true + } + f2 := func() { go f1() } + f3 := func() { go f2() } + f4 := func() { go f3() } + + go f4() + x = 2 + <-ch +} + +// A nil pointer in a mutex method call should not +// corrupt the race detector state. +// Used to hang indefinitely. +func TestNoRaceNilMutexCrash(t *testing.T) { + var mutex sync.Mutex + panics := 0 + defer func() { + if x := recover(); x != nil { + mutex.Lock() + panics++ + mutex.Unlock() + } else { + panic("no panic") + } + }() + var othermutex *sync.RWMutex + othermutex.RLock() +} diff --git a/src/runtime/race/testdata/waitgroup_test.go b/src/runtime/race/testdata/waitgroup_test.go new file mode 100644 index 000000000..ff152b0ab --- /dev/null +++ b/src/runtime/race/testdata/waitgroup_test.go @@ -0,0 +1,352 @@ +// Copyright 2012 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package race_test + +import ( + "runtime" + "sync" + "testing" + "time" +) + +func TestNoRaceWaitGroup(t *testing.T) { + var x int + var wg sync.WaitGroup + n := 1 + for i := 0; i < n; i++ { + wg.Add(1) + j := i + go func() { + x = j + wg.Done() + }() + } + wg.Wait() +} + +func TestRaceWaitGroup(t *testing.T) { + var x int + var wg sync.WaitGroup + n := 2 + for i := 0; i < n; i++ { + wg.Add(1) + j := i + go func() { + x = j + wg.Done() + }() + } + wg.Wait() +} + +func TestNoRaceWaitGroup2(t *testing.T) { + var x int + var wg sync.WaitGroup + wg.Add(1) + go func() { + x = 1 + wg.Done() + }() + wg.Wait() + x = 2 +} + +// incrementing counter in Add and locking wg's mutex +func TestRaceWaitGroupAsMutex(t *testing.T) { + var x int + var wg sync.WaitGroup + c := make(chan bool, 2) + go func() { + wg.Wait() + time.Sleep(100 * time.Millisecond) + wg.Add(+1) + x = 1 + wg.Add(-1) + c <- true + }() + go func() { + wg.Wait() + time.Sleep(100 * time.Millisecond) + wg.Add(+1) + x = 2 + wg.Add(-1) + c <- true + }() + <-c + <-c +} + +// Incorrect usage: Add is too late. 
+func TestRaceWaitGroupWrongWait(t *testing.T) { + c := make(chan bool, 2) + var x int + var wg sync.WaitGroup + go func() { + wg.Add(1) + runtime.Gosched() + x = 1 + wg.Done() + c <- true + }() + go func() { + wg.Add(1) + runtime.Gosched() + x = 2 + wg.Done() + c <- true + }() + wg.Wait() + <-c + <-c +} + +func TestRaceWaitGroupWrongAdd(t *testing.T) { + c := make(chan bool, 2) + var wg sync.WaitGroup + go func() { + wg.Add(1) + time.Sleep(100 * time.Millisecond) + wg.Done() + c <- true + }() + go func() { + wg.Add(1) + time.Sleep(100 * time.Millisecond) + wg.Done() + c <- true + }() + time.Sleep(50 * time.Millisecond) + wg.Wait() + <-c + <-c +} + +func TestNoRaceWaitGroupMultipleWait(t *testing.T) { + c := make(chan bool, 2) + var wg sync.WaitGroup + go func() { + wg.Wait() + c <- true + }() + go func() { + wg.Wait() + c <- true + }() + wg.Wait() + <-c + <-c +} + +func TestNoRaceWaitGroupMultipleWait2(t *testing.T) { + c := make(chan bool, 2) + var wg sync.WaitGroup + wg.Add(2) + go func() { + wg.Done() + wg.Wait() + c <- true + }() + go func() { + wg.Done() + wg.Wait() + c <- true + }() + wg.Wait() + <-c + <-c +} + +func TestNoRaceWaitGroupMultipleWait3(t *testing.T) { + const P = 3 + var data [P]int + done := make(chan bool, P) + var wg sync.WaitGroup + wg.Add(P) + for p := 0; p < P; p++ { + go func(p int) { + data[p] = 42 + wg.Done() + }(p) + } + for p := 0; p < P; p++ { + go func() { + wg.Wait() + for p1 := 0; p1 < P; p1++ { + _ = data[p1] + } + done <- true + }() + } + for p := 0; p < P; p++ { + <-done + } +} + +// Correct usage but still a race +func TestRaceWaitGroup2(t *testing.T) { + var x int + var wg sync.WaitGroup + wg.Add(2) + go func() { + x = 1 + wg.Done() + }() + go func() { + x = 2 + wg.Done() + }() + wg.Wait() +} + +func TestNoRaceWaitGroupPanicRecover(t *testing.T) { + var x int + var wg sync.WaitGroup + defer func() { + err := recover() + if err != "sync: negative WaitGroup counter" { + t.Fatalf("Unexpected panic: %#v", err) + } + x = 2 + }() + x = 1 + wg.Add(-1) +} + +// TODO: this is actually a panic-synchronization test, not a +// WaitGroup test. Move it to another *_test file +// Is it possible to get a race by synchronization via panic? 
+func TestNoRaceWaitGroupPanicRecover2(t *testing.T) { + var x int + var wg sync.WaitGroup + ch := make(chan bool, 1) + var f func() = func() { + x = 2 + ch <- true + } + go func() { + defer func() { + err := recover() + if err != "sync: negative WaitGroup counter" { + } + go f() + }() + x = 1 + wg.Add(-1) + }() + + <-ch +} + +func TestNoRaceWaitGroupTransitive(t *testing.T) { + x, y := 0, 0 + var wg sync.WaitGroup + wg.Add(2) + go func() { + x = 42 + wg.Done() + }() + go func() { + time.Sleep(1e7) + y = 42 + wg.Done() + }() + wg.Wait() + _ = x + _ = y +} + +func TestNoRaceWaitGroupReuse(t *testing.T) { + const P = 3 + var data [P]int + var wg sync.WaitGroup + for try := 0; try < 3; try++ { + wg.Add(P) + for p := 0; p < P; p++ { + go func(p int) { + data[p]++ + wg.Done() + }(p) + } + wg.Wait() + for p := 0; p < P; p++ { + data[p]++ + } + } +} + +func TestNoRaceWaitGroupReuse2(t *testing.T) { + const P = 3 + var data [P]int + var wg sync.WaitGroup + for try := 0; try < 3; try++ { + wg.Add(P) + for p := 0; p < P; p++ { + go func(p int) { + data[p]++ + wg.Done() + }(p) + } + done := make(chan bool) + go func() { + wg.Wait() + for p := 0; p < P; p++ { + data[p]++ + } + done <- true + }() + wg.Wait() + <-done + for p := 0; p < P; p++ { + data[p]++ + } + } +} + +func TestRaceWaitGroupReuse(t *testing.T) { + const P = 3 + const T = 3 + done := make(chan bool, T) + var wg sync.WaitGroup + for try := 0; try < T; try++ { + var data [P]int + wg.Add(P) + for p := 0; p < P; p++ { + go func(p int) { + time.Sleep(50 * time.Millisecond) + data[p]++ + wg.Done() + }(p) + } + go func() { + wg.Wait() + for p := 0; p < P; p++ { + data[p]++ + } + done <- true + }() + time.Sleep(100 * time.Millisecond) + wg.Wait() + } + for try := 0; try < T; try++ { + <-done + } +} + +func TestNoRaceWaitGroupConcurrentAdd(t *testing.T) { + const P = 4 + waiting := make(chan bool, P) + var wg sync.WaitGroup + for p := 0; p < P; p++ { + go func() { + wg.Add(1) + waiting <- true + wg.Done() + }() + } + for p := 0; p < P; p++ { + <-waiting + } + wg.Wait() +} diff --git a/src/runtime/race0.go b/src/runtime/race0.go new file mode 100644 index 000000000..5d90cc859 --- /dev/null +++ b/src/runtime/race0.go @@ -0,0 +1,37 @@ +// Copyright 2014 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// +build !race + +// Dummy race detection API, used when not built with -race. + +package runtime + +import ( + "unsafe" +) + +const raceenabled = false + +// Because raceenabled is false, none of these functions should be called. 
+ +func raceReadObjectPC(t *_type, addr unsafe.Pointer, callerpc, pc uintptr) { gothrow("race") } +func raceWriteObjectPC(t *_type, addr unsafe.Pointer, callerpc, pc uintptr) { gothrow("race") } +func raceinit() { gothrow("race") } +func racefini() { gothrow("race") } +func racemapshadow(addr unsafe.Pointer, size uintptr) { gothrow("race") } +func racewritepc(addr unsafe.Pointer, callerpc, pc uintptr) { gothrow("race") } +func racereadpc(addr unsafe.Pointer, callerpc, pc uintptr) { gothrow("race") } +func racereadrangepc(addr unsafe.Pointer, sz, callerpc, pc uintptr) { gothrow("race") } +func racewriterangepc(addr unsafe.Pointer, sz, callerpc, pc uintptr) { gothrow("race") } +func raceacquire(addr unsafe.Pointer) { gothrow("race") } +func raceacquireg(gp *g, addr unsafe.Pointer) { gothrow("race") } +func racerelease(addr unsafe.Pointer) { gothrow("race") } +func racereleaseg(gp *g, addr unsafe.Pointer) { gothrow("race") } +func racereleasemerge(addr unsafe.Pointer) { gothrow("race") } +func racereleasemergeg(gp *g, addr unsafe.Pointer) { gothrow("race") } +func racefingo() { gothrow("race") } +func racemalloc(p unsafe.Pointer, sz uintptr) { gothrow("race") } +func racegostart(pc uintptr) uintptr { gothrow("race"); return 0 } +func racegoend() { gothrow("race") } diff --git a/src/runtime/race_amd64.s b/src/runtime/race_amd64.s new file mode 100644 index 000000000..a96d9de12 --- /dev/null +++ b/src/runtime/race_amd64.s @@ -0,0 +1,414 @@ +// Copyright 2013 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// +build race + +#include "zasm_GOOS_GOARCH.h" +#include "funcdata.h" +#include "textflag.h" + +// The following thunks allow calling the gcc-compiled race runtime directly +// from Go code without going all the way through cgo. +// First, it's much faster (up to 50% speedup for real Go programs). +// Second, it eliminates race-related special cases from cgocall and scheduler. +// Third, in long-term it will allow to remove cyclic runtime/race dependency on cmd/go. + +// A brief recap of the amd64 calling convention. +// Arguments are passed in DI, SI, DX, CX, R8, R9, the rest is on stack. +// Callee-saved registers are: BX, BP, R12-R15. +// SP must be 16-byte aligned. +// On Windows: +// Arguments are passed in CX, DX, R8, R9, the rest is on stack. +// Callee-saved registers are: BX, BP, DI, SI, R12-R15. +// SP must be 16-byte aligned. Windows also requires "stack-backing" for the 4 register arguments: +// http://msdn.microsoft.com/en-us/library/ms235286.aspx +// We do not do this, because it seems to be intended for vararg/unprototyped functions. +// Gcc-compiled race runtime does not try to use that space. + +#ifdef GOOS_windows +#define RARG0 CX +#define RARG1 DX +#define RARG2 R8 +#define RARG3 R9 +#else +#define RARG0 DI +#define RARG1 SI +#define RARG2 DX +#define RARG3 CX +#endif + +// func runtime·raceread(addr uintptr) +// Called from instrumented code. +TEXT runtime·raceread(SB), NOSPLIT, $0-8 + MOVQ addr+0(FP), RARG1 + MOVQ (SP), RARG2 + // void __tsan_read(ThreadState *thr, void *addr, void *pc); + MOVQ $__tsan_read(SB), AX + JMP racecalladdr<>(SB) + +// func runtime·RaceRead(addr uintptr) +TEXT runtime·RaceRead(SB), NOSPLIT, $0-8 + // This needs to be a tail call, because raceread reads caller pc. 
+ JMP runtime·raceread(SB) + +// void runtime·racereadpc(void *addr, void *callpc, void *pc) +TEXT runtime·racereadpc(SB), NOSPLIT, $0-24 + MOVQ addr+0(FP), RARG1 + MOVQ callpc+8(FP), RARG2 + MOVQ pc+16(FP), RARG3 + // void __tsan_read_pc(ThreadState *thr, void *addr, void *callpc, void *pc); + MOVQ $__tsan_read_pc(SB), AX + JMP racecalladdr<>(SB) + +// func runtime·racewrite(addr uintptr) +// Called from instrumented code. +TEXT runtime·racewrite(SB), NOSPLIT, $0-8 + MOVQ addr+0(FP), RARG1 + MOVQ (SP), RARG2 + // void __tsan_write(ThreadState *thr, void *addr, void *pc); + MOVQ $__tsan_write(SB), AX + JMP racecalladdr<>(SB) + +// func runtime·RaceWrite(addr uintptr) +TEXT runtime·RaceWrite(SB), NOSPLIT, $0-8 + // This needs to be a tail call, because racewrite reads caller pc. + JMP runtime·racewrite(SB) + +// void runtime·racewritepc(void *addr, void *callpc, void *pc) +TEXT runtime·racewritepc(SB), NOSPLIT, $0-24 + MOVQ addr+0(FP), RARG1 + MOVQ callpc+8(FP), RARG2 + MOVQ pc+16(FP), RARG3 + // void __tsan_write_pc(ThreadState *thr, void *addr, void *callpc, void *pc); + MOVQ $__tsan_write_pc(SB), AX + JMP racecalladdr<>(SB) + +// func runtime·racereadrange(addr, size uintptr) +// Called from instrumented code. +TEXT runtime·racereadrange(SB), NOSPLIT, $0-16 + MOVQ addr+0(FP), RARG1 + MOVQ size+8(FP), RARG2 + MOVQ (SP), RARG3 + // void __tsan_read_range(ThreadState *thr, void *addr, uintptr size, void *pc); + MOVQ $__tsan_read_range(SB), AX + JMP racecalladdr<>(SB) + +// func runtime·RaceReadRange(addr, size uintptr) +TEXT runtime·RaceReadRange(SB), NOSPLIT, $0-16 + // This needs to be a tail call, because racereadrange reads caller pc. + JMP runtime·racereadrange(SB) + +// void runtime·racereadrangepc1(void *addr, uintptr sz, void *pc) +TEXT runtime·racereadrangepc1(SB), NOSPLIT, $0-24 + MOVQ addr+0(FP), RARG1 + MOVQ size+8(FP), RARG2 + MOVQ pc+16(FP), RARG3 + // void __tsan_read_range(ThreadState *thr, void *addr, uintptr size, void *pc); + MOVQ $__tsan_read_range(SB), AX + JMP racecalladdr<>(SB) + +// func runtime·racewriterange(addr, size uintptr) +// Called from instrumented code. +TEXT runtime·racewriterange(SB), NOSPLIT, $0-16 + MOVQ addr+0(FP), RARG1 + MOVQ size+8(FP), RARG2 + MOVQ (SP), RARG3 + // void __tsan_write_range(ThreadState *thr, void *addr, uintptr size, void *pc); + MOVQ $__tsan_write_range(SB), AX + JMP racecalladdr<>(SB) + +// func runtime·RaceWriteRange(addr, size uintptr) +TEXT runtime·RaceWriteRange(SB), NOSPLIT, $0-16 + // This needs to be a tail call, because racewriterange reads caller pc. + JMP runtime·racewriterange(SB) + +// void runtime·racewriterangepc1(void *addr, uintptr sz, void *pc) +TEXT runtime·racewriterangepc1(SB), NOSPLIT, $0-24 + MOVQ addr+0(FP), RARG1 + MOVQ size+8(FP), RARG2 + MOVQ pc+16(FP), RARG3 + // void __tsan_write_range(ThreadState *thr, void *addr, uintptr size, void *pc); + MOVQ $__tsan_write_range(SB), AX + JMP racecalladdr<>(SB) + +// If addr (RARG1) is out of range, do nothing. +// Otherwise, setup goroutine context and invoke racecall. Other arguments already set. +TEXT racecalladdr<>(SB), NOSPLIT, $0-0 + get_tls(R12) + MOVQ g(R12), R14 + MOVQ g_racectx(R14), RARG0 // goroutine context + // Check that addr is within [arenastart, arenaend) or within [racedatastart, racedataend). 
+ CMPQ RARG1, runtime·racearenastart(SB) + JB racecalladdr_data + CMPQ RARG1, runtime·racearenaend(SB) + JB racecalladdr_call +racecalladdr_data: + CMPQ RARG1, runtime·racedatastart(SB) + JB racecalladdr_ret + CMPQ RARG1, runtime·racedataend(SB) + JAE racecalladdr_ret +racecalladdr_call: + MOVQ AX, AX // w/o this 6a miscompiles this function + JMP racecall<>(SB) +racecalladdr_ret: + RET + +// func runtime·racefuncenter(pc uintptr) +// Called from instrumented code. +TEXT runtime·racefuncenter(SB), NOSPLIT, $0-8 + MOVQ DX, R15 // save function entry context (for closures) + get_tls(R12) + MOVQ g(R12), R14 + MOVQ g_racectx(R14), RARG0 // goroutine context + MOVQ callpc+0(FP), RARG1 + // void __tsan_func_enter(ThreadState *thr, void *pc); + MOVQ $__tsan_func_enter(SB), AX + // racecall<> preserves R15 + CALL racecall<>(SB) + MOVQ R15, DX // restore function entry context + RET + +// func runtime·racefuncexit() +// Called from instrumented code. +TEXT runtime·racefuncexit(SB), NOSPLIT, $0-0 + get_tls(R12) + MOVQ g(R12), R14 + MOVQ g_racectx(R14), RARG0 // goroutine context + // void __tsan_func_exit(ThreadState *thr); + MOVQ $__tsan_func_exit(SB), AX + JMP racecall<>(SB) + +// Atomic operations for sync/atomic package. + +// Load +TEXT sync∕atomic·LoadInt32(SB), NOSPLIT, $0-0 + MOVQ $__tsan_go_atomic32_load(SB), AX + CALL racecallatomic<>(SB) + RET + +TEXT sync∕atomic·LoadInt64(SB), NOSPLIT, $0-0 + MOVQ $__tsan_go_atomic64_load(SB), AX + CALL racecallatomic<>(SB) + RET + +TEXT sync∕atomic·LoadUint32(SB), NOSPLIT, $0-0 + JMP sync∕atomic·LoadInt32(SB) + +TEXT sync∕atomic·LoadUint64(SB), NOSPLIT, $0-0 + JMP sync∕atomic·LoadInt64(SB) + +TEXT sync∕atomic·LoadUintptr(SB), NOSPLIT, $0-0 + JMP sync∕atomic·LoadInt64(SB) + +TEXT sync∕atomic·LoadPointer(SB), NOSPLIT, $0-0 + JMP sync∕atomic·LoadInt64(SB) + +// Store +TEXT sync∕atomic·StoreInt32(SB), NOSPLIT, $0-0 + MOVQ $__tsan_go_atomic32_store(SB), AX + CALL racecallatomic<>(SB) + RET + +TEXT sync∕atomic·StoreInt64(SB), NOSPLIT, $0-0 + MOVQ $__tsan_go_atomic64_store(SB), AX + CALL racecallatomic<>(SB) + RET + +TEXT sync∕atomic·StoreUint32(SB), NOSPLIT, $0-0 + JMP sync∕atomic·StoreInt32(SB) + +TEXT sync∕atomic·StoreUint64(SB), NOSPLIT, $0-0 + JMP sync∕atomic·StoreInt64(SB) + +TEXT sync∕atomic·StoreUintptr(SB), NOSPLIT, $0-0 + JMP sync∕atomic·StoreInt64(SB) + +TEXT sync∕atomic·StorePointer(SB), NOSPLIT, $0-0 + JMP sync∕atomic·StoreInt64(SB) + +// Swap +TEXT sync∕atomic·SwapInt32(SB), NOSPLIT, $0-0 + MOVQ $__tsan_go_atomic32_exchange(SB), AX + CALL racecallatomic<>(SB) + RET + +TEXT sync∕atomic·SwapInt64(SB), NOSPLIT, $0-0 + MOVQ $__tsan_go_atomic64_exchange(SB), AX + CALL racecallatomic<>(SB) + RET + +TEXT sync∕atomic·SwapUint32(SB), NOSPLIT, $0-0 + JMP sync∕atomic·SwapInt32(SB) + +TEXT sync∕atomic·SwapUint64(SB), NOSPLIT, $0-0 + JMP sync∕atomic·SwapInt64(SB) + +TEXT sync∕atomic·SwapUintptr(SB), NOSPLIT, $0-0 + JMP sync∕atomic·SwapInt64(SB) + +TEXT sync∕atomic·SwapPointer(SB), NOSPLIT, $0-0 + JMP sync∕atomic·SwapInt64(SB) + +// Add +TEXT sync∕atomic·AddInt32(SB), NOSPLIT, $0-0 + MOVQ $__tsan_go_atomic32_fetch_add(SB), AX + CALL racecallatomic<>(SB) + MOVL add+8(FP), AX // convert fetch_add to add_fetch + ADDL AX, ret+16(FP) + RET + +TEXT sync∕atomic·AddInt64(SB), NOSPLIT, $0-0 + MOVQ $__tsan_go_atomic64_fetch_add(SB), AX + CALL racecallatomic<>(SB) + MOVQ add+8(FP), AX // convert fetch_add to add_fetch + ADDQ AX, ret+16(FP) + RET + +TEXT sync∕atomic·AddUint32(SB), NOSPLIT, $0-0 + JMP sync∕atomic·AddInt32(SB) + +TEXT sync∕atomic·AddUint64(SB), NOSPLIT, 
$0-0 + JMP sync∕atomic·AddInt64(SB) + +TEXT sync∕atomic·AddUintptr(SB), NOSPLIT, $0-0 + JMP sync∕atomic·AddInt64(SB) + +TEXT sync∕atomic·AddPointer(SB), NOSPLIT, $0-0 + JMP sync∕atomic·AddInt64(SB) + +// CompareAndSwap +TEXT sync∕atomic·CompareAndSwapInt32(SB), NOSPLIT, $0-0 + MOVQ $__tsan_go_atomic32_compare_exchange(SB), AX + CALL racecallatomic<>(SB) + RET + +TEXT sync∕atomic·CompareAndSwapInt64(SB), NOSPLIT, $0-0 + MOVQ $__tsan_go_atomic64_compare_exchange(SB), AX + CALL racecallatomic<>(SB) + RET + +TEXT sync∕atomic·CompareAndSwapUint32(SB), NOSPLIT, $0-0 + JMP sync∕atomic·CompareAndSwapInt32(SB) + +TEXT sync∕atomic·CompareAndSwapUint64(SB), NOSPLIT, $0-0 + JMP sync∕atomic·CompareAndSwapInt64(SB) + +TEXT sync∕atomic·CompareAndSwapUintptr(SB), NOSPLIT, $0-0 + JMP sync∕atomic·CompareAndSwapInt64(SB) + +TEXT sync∕atomic·CompareAndSwapPointer(SB), NOSPLIT, $0-0 + JMP sync∕atomic·CompareAndSwapInt64(SB) + +// Generic atomic operation implementation. +// AX already contains target function. +TEXT racecallatomic<>(SB), NOSPLIT, $0-0 + // Trigger SIGSEGV early. + MOVQ 16(SP), R12 + MOVL (R12), R13 + // Check that addr is within [arenastart, arenaend) or within [racedatastart, racedataend). + CMPQ R12, runtime·racearenastart(SB) + JB racecallatomic_data + CMPQ R12, runtime·racearenaend(SB) + JB racecallatomic_ok +racecallatomic_data: + CMPQ R12, runtime·racedatastart(SB) + JB racecallatomic_ignore + CMPQ R12, runtime·racedataend(SB) + JAE racecallatomic_ignore +racecallatomic_ok: + // Addr is within the good range, call the atomic function. + get_tls(R12) + MOVQ g(R12), R14 + MOVQ g_racectx(R14), RARG0 // goroutine context + MOVQ 8(SP), RARG1 // caller pc + MOVQ (SP), RARG2 // pc + LEAQ 16(SP), RARG3 // arguments + JMP racecall<>(SB) // does not return +racecallatomic_ignore: + // Addr is outside the good range. + // Call __tsan_go_ignore_sync_begin to ignore synchronization during the atomic op. + // An attempt to synchronize on the address would cause crash. + MOVQ AX, R15 // remember the original function + MOVQ $__tsan_go_ignore_sync_begin(SB), AX + MOVQ g(R12), R14 + MOVQ g_racectx(R14), RARG0 // goroutine context + CALL racecall<>(SB) + MOVQ R15, AX // restore the original function + // Call the atomic function. + MOVQ g_racectx(R14), RARG0 // goroutine context + MOVQ 8(SP), RARG1 // caller pc + MOVQ (SP), RARG2 // pc + LEAQ 16(SP), RARG3 // arguments + CALL racecall<>(SB) + // Call __tsan_go_ignore_sync_end. + MOVQ $__tsan_go_ignore_sync_end(SB), AX + MOVQ g_racectx(R14), RARG0 // goroutine context + JMP racecall<>(SB) + +// void runtime·racecall(void(*f)(...), ...) +// Calls C function f from race runtime and passes up to 4 arguments to it. +// The arguments are never heap-object-preserving pointers, so we pretend there are no arguments. +TEXT runtime·racecall(SB), NOSPLIT, $0-0 + MOVQ fn+0(FP), AX + MOVQ arg0+8(FP), RARG0 + MOVQ arg1+16(FP), RARG1 + MOVQ arg2+24(FP), RARG2 + MOVQ arg3+32(FP), RARG3 + JMP racecall<>(SB) + +// Switches SP to g0 stack and calls (AX). Arguments already set. +TEXT racecall<>(SB), NOSPLIT, $0-0 + get_tls(R12) + MOVQ g(R12), R14 + MOVQ g_m(R14), R13 + // Switch to g0 stack. + MOVQ SP, R12 // callee-saved, preserved across the CALL + MOVQ m_g0(R13), R10 + CMPQ R10, R14 + JE racecall_cont // already on g0 + MOVQ (g_sched+gobuf_sp)(R10), SP +racecall_cont: + ANDQ $~15, SP // alignment for gcc ABI + CALL AX + MOVQ R12, SP + RET + +// C->Go callback thunk that allows to call runtime·racesymbolize from C code. 
+// Direct Go->C race call has only switched SP, finish g->g0 switch by setting correct g. +// The overall effect of Go->C->Go call chain is similar to that of mcall. +TEXT runtime·racesymbolizethunk(SB), NOSPLIT, $56-8 + // Save callee-saved registers (Go code won't respect that). + // This is superset of darwin/linux/windows registers. + PUSHQ BX + PUSHQ BP + PUSHQ DI + PUSHQ SI + PUSHQ R12 + PUSHQ R13 + PUSHQ R14 + PUSHQ R15 + // Set g = g0. + get_tls(R12) + MOVQ g(R12), R13 + MOVQ g_m(R13), R13 + MOVQ m_g0(R13), R14 + MOVQ R14, g(R12) // g = m->g0 + MOVQ RARG0, 0(SP) // func arg + CALL runtime·racesymbolize(SB) + // All registers are smashed after Go code, reload. + get_tls(R12) + MOVQ g(R12), R13 + MOVQ g_m(R13), R13 + MOVQ m_curg(R13), R14 + MOVQ R14, g(R12) // g = m->curg + // Restore callee-saved registers. + POPQ R15 + POPQ R14 + POPQ R13 + POPQ R12 + POPQ SI + POPQ DI + POPQ BP + POPQ BX + RET diff --git a/src/runtime/rdebug.go b/src/runtime/rdebug.go new file mode 100644 index 000000000..e5e691122 --- /dev/null +++ b/src/runtime/rdebug.go @@ -0,0 +1,37 @@ +// Copyright 2014 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package runtime + +func setMaxStack(in int) (out int) { + out = int(maxstacksize) + maxstacksize = uintptr(in) + return out +} + +func setGCPercent(in int32) (out int32) { + mp := acquirem() + mp.scalararg[0] = uintptr(int(in)) + onM(setgcpercent_m) + out = int32(int(mp.scalararg[0])) + releasem(mp) + return out +} + +func setPanicOnFault(new bool) (old bool) { + mp := acquirem() + old = mp.curg.paniconfault + mp.curg.paniconfault = new + releasem(mp) + return old +} + +func setMaxThreads(in int) (out int) { + mp := acquirem() + mp.scalararg[0] = uintptr(in) + onM(setmaxthreads_m) + out = int(mp.scalararg[0]) + releasem(mp) + return out +} diff --git a/src/runtime/rt0_android_arm.s b/src/runtime/rt0_android_arm.s new file mode 100644 index 000000000..6b65fb47b --- /dev/null +++ b/src/runtime/rt0_android_arm.s @@ -0,0 +1,11 @@ +// Copyright 2014 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#include "textflag.h" + +TEXT _rt0_arm_android(SB),NOSPLIT,$-4 + MOVW (R13), R0 // argc + MOVW $4(R13), R1 // argv + MOVW $_rt0_arm_linux1(SB), R4 + B (R4) diff --git a/src/runtime/rt0_darwin_386.s b/src/runtime/rt0_darwin_386.s new file mode 100644 index 000000000..4c8c92dda --- /dev/null +++ b/src/runtime/rt0_darwin_386.s @@ -0,0 +1,16 @@ +// Copyright 2009 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#include "textflag.h" + +TEXT _rt0_386_darwin(SB),NOSPLIT,$8 + MOVL 8(SP), AX + LEAL 12(SP), BX + MOVL AX, 0(SP) + MOVL BX, 4(SP) + CALL main(SB) + INT $3 + +TEXT main(SB),NOSPLIT,$0 + JMP runtime·rt0_go(SB) diff --git a/src/runtime/rt0_darwin_amd64.s b/src/runtime/rt0_darwin_amd64.s new file mode 100644 index 000000000..452d85455 --- /dev/null +++ b/src/runtime/rt0_darwin_amd64.s @@ -0,0 +1,15 @@ +// Copyright 2009 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. 
+ +#include "textflag.h" + +TEXT _rt0_amd64_darwin(SB),NOSPLIT,$-8 + LEAQ 8(SP), SI // argv + MOVQ 0(SP), DI // argc + MOVQ $main(SB), AX + JMP AX + +TEXT main(SB),NOSPLIT,$-8 + MOVQ $runtime·rt0_go(SB), AX + JMP AX diff --git a/src/runtime/rt0_dragonfly_386.s b/src/runtime/rt0_dragonfly_386.s new file mode 100644 index 000000000..548ba796a --- /dev/null +++ b/src/runtime/rt0_dragonfly_386.s @@ -0,0 +1,16 @@ +// Copyright 2009 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#include "textflag.h" + +TEXT _rt0_386_dragonfly(SB),NOSPLIT,$8 + MOVL 8(SP), AX + LEAL 12(SP), BX + MOVL AX, 0(SP) + MOVL BX, 4(SP) + CALL main(SB) + INT $3 + +TEXT main(SB),NOSPLIT,$0 + JMP runtime·rt0_go(SB) diff --git a/src/runtime/rt0_dragonfly_amd64.s b/src/runtime/rt0_dragonfly_amd64.s new file mode 100644 index 000000000..fb56618d8 --- /dev/null +++ b/src/runtime/rt0_dragonfly_amd64.s @@ -0,0 +1,15 @@ +// Copyright 2009 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#include "textflag.h" + +TEXT _rt0_amd64_dragonfly(SB),NOSPLIT,$-8 + LEAQ 8(DI), SI // argv + MOVQ 0(DI), DI // argc + MOVQ $main(SB), AX + JMP AX + +TEXT main(SB),NOSPLIT,$-8 + MOVQ $runtime·rt0_go(SB), AX + JMP AX diff --git a/src/runtime/rt0_freebsd_386.s b/src/runtime/rt0_freebsd_386.s new file mode 100644 index 000000000..cd7a915f8 --- /dev/null +++ b/src/runtime/rt0_freebsd_386.s @@ -0,0 +1,16 @@ +// Copyright 2009 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#include "textflag.h" + +TEXT _rt0_386_freebsd(SB),NOSPLIT,$8 + MOVL 8(SP), AX + LEAL 12(SP), BX + MOVL AX, 0(SP) + MOVL BX, 4(SP) + CALL main(SB) + INT $3 + +TEXT main(SB),NOSPLIT,$0 + JMP runtime·rt0_go(SB) diff --git a/src/runtime/rt0_freebsd_amd64.s b/src/runtime/rt0_freebsd_amd64.s new file mode 100644 index 000000000..7989f7c3e --- /dev/null +++ b/src/runtime/rt0_freebsd_amd64.s @@ -0,0 +1,15 @@ +// Copyright 2009 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#include "textflag.h" + +TEXT _rt0_amd64_freebsd(SB),NOSPLIT,$-8 + LEAQ 8(DI), SI // argv + MOVQ 0(DI), DI // argc + MOVQ $main(SB), AX + JMP AX + +TEXT main(SB),NOSPLIT,$-8 + MOVQ $runtime·rt0_go(SB), AX + JMP AX diff --git a/src/runtime/rt0_freebsd_arm.s b/src/runtime/rt0_freebsd_arm.s new file mode 100644 index 000000000..f31252698 --- /dev/null +++ b/src/runtime/rt0_freebsd_arm.s @@ -0,0 +1,18 @@ +// Copyright 2012 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#include "textflag.h" + +// FreeBSD and Linux use the same linkage to main + +TEXT _rt0_arm_freebsd(SB),NOSPLIT,$-4 + MOVW (R13), R0 // argc + MOVW $4(R13), R1 // argv + MOVM.DB.W [R0-R1], (R13) + B runtime·rt0_go(SB) + +TEXT main(SB),NOSPLIT,$-4 + MOVM.DB.W [R0-R1], (R13) + MOVW $runtime·rt0_go(SB), R4 + B (R4) diff --git a/src/runtime/rt0_linux_386.s b/src/runtime/rt0_linux_386.s new file mode 100644 index 000000000..352e594d5 --- /dev/null +++ b/src/runtime/rt0_linux_386.s @@ -0,0 +1,25 @@ +// Copyright 2009 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. 
+ +#include "textflag.h" + +TEXT _rt0_386_linux(SB),NOSPLIT,$8 + MOVL 8(SP), AX + LEAL 12(SP), BX + MOVL AX, 0(SP) + MOVL BX, 4(SP) + CALL runtime·linux_setup_vdso(SB) + CALL main(SB) + INT $3 + +TEXT main(SB),NOSPLIT,$0 + JMP runtime·rt0_go(SB) + +TEXT _fallback_vdso(SB),NOSPLIT,$0 + INT $0x80 + RET + +DATA runtime·_vdso(SB)/4, $_fallback_vdso(SB) +GLOBL runtime·_vdso(SB), NOPTR, $4 + diff --git a/src/runtime/rt0_linux_amd64.s b/src/runtime/rt0_linux_amd64.s new file mode 100644 index 000000000..985426acc --- /dev/null +++ b/src/runtime/rt0_linux_amd64.s @@ -0,0 +1,15 @@ +// Copyright 2009 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#include "textflag.h" + +TEXT _rt0_amd64_linux(SB),NOSPLIT,$-8 + LEAQ 8(SP), SI // argv + MOVQ 0(SP), DI // argc + MOVQ $main(SB), AX + JMP AX + +TEXT main(SB),NOSPLIT,$-8 + MOVQ $runtime·rt0_go(SB), AX + JMP AX diff --git a/src/runtime/rt0_linux_arm.s b/src/runtime/rt0_linux_arm.s new file mode 100644 index 000000000..5f521d24b --- /dev/null +++ b/src/runtime/rt0_linux_arm.s @@ -0,0 +1,91 @@ +// Copyright 2009 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#include "textflag.h" + +TEXT _rt0_arm_linux(SB),NOSPLIT,$-4 + MOVW (R13), R0 // argc + MOVW $4(R13), R1 // argv + MOVW $_rt0_arm_linux1(SB), R4 + B (R4) + +TEXT _rt0_arm_linux1(SB),NOSPLIT,$-4 + // We first need to detect the kernel ABI, and warn the user + // if the system only supports OABI + // The strategy here is to call some EABI syscall to see if + // SIGILL is received. + // To catch SIGILL, we have to first setup sigaction, this is + // a chicken-and-egg problem, because we can't do syscall if + // we don't know the kernel ABI... Oh, not really, we can do + // syscall in Thumb mode. + + // Save argc and argv + MOVM.DB.W [R0-R1], (R13) + + // Thumb mode OABI check disabled because there are some + // EABI systems that do not support Thumb execution. + // We can run on them except for this check! 
+ + // // set up sa_handler + // MOVW $bad_abi<>(SB), R0 // sa_handler + // MOVW $0, R1 // sa_flags + // MOVW $0, R2 // sa_restorer + // MOVW $0, R3 // sa_mask + // MOVM.DB.W [R0-R3], (R13) + // MOVW $4, R0 // SIGILL + // MOVW R13, R1 // sa + // SUB $16, R13 + // MOVW R13, R2 // old_sa + // MOVW $8, R3 // c + // MOVW $174, R7 // sys_sigaction + // BL oabi_syscall<>(SB) + + // do an EABI syscall + MOVW $20, R7 // sys_getpid + SWI $0 // this will trigger SIGILL on OABI systems + + // MOVW $4, R0 // SIGILL + // MOVW R13, R1 // sa + // MOVW $0, R2 // old_sa + // MOVW $8, R3 // c + // MOVW $174, R7 // sys_sigaction + // SWI $0 // restore signal handler + // ADD $32, R13 + + SUB $4, R13 // fake a stack frame for runtime·setup_auxv + BL runtime·setup_auxv(SB) + ADD $4, R13 + B runtime·rt0_go(SB) + +TEXT bad_abi<>(SB),NOSPLIT,$-4 + // give diagnosis and exit + MOVW $2, R0 // stderr + MOVW $bad_abi_msg(SB), R1 // data + MOVW $45, R2 // len + MOVW $4, R7 // sys_write + BL oabi_syscall<>(SB) + MOVW $1, R0 + MOVW $1, R7 // sys_exit + BL oabi_syscall<>(SB) + B 0(PC) + +DATA bad_abi_msg+0x00(SB)/8, $"This pro" +DATA bad_abi_msg+0x08(SB)/8, $"gram can" +DATA bad_abi_msg+0x10(SB)/8, $" only be" +DATA bad_abi_msg+0x18(SB)/8, $" run on " +DATA bad_abi_msg+0x20(SB)/8, $"EABI ker" +DATA bad_abi_msg+0x28(SB)/4, $"nels" +DATA bad_abi_msg+0x2c(SB)/1, $0xa +GLOBL bad_abi_msg(SB), RODATA, $45 + +TEXT oabi_syscall<>(SB),NOSPLIT,$-4 + ADD $1, PC, R4 + WORD $0xe12fff14 //BX (R4) // enter thumb mode + // TODO(minux): only supports little-endian CPUs + WORD $0x4770df01 // swi $1; bx lr + +TEXT main(SB),NOSPLIT,$-4 + MOVW $_rt0_arm_linux1(SB), R4 + B (R4) + diff --git a/src/runtime/rt0_nacl_386.s b/src/runtime/rt0_nacl_386.s new file mode 100644 index 000000000..d4ba06306 --- /dev/null +++ b/src/runtime/rt0_nacl_386.s @@ -0,0 +1,22 @@ +// Copyright 2013 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#include "textflag.h" + +// NaCl entry has: +// 0(FP) - arg block == SP+8 +// 4(FP) - cleanup function pointer, always 0 +// 8(FP) - envc +// 12(FP) - argc +// 16(FP) - argv, then 0, then envv, then 0, then auxv +TEXT _rt0_386_nacl(SB),NOSPLIT,$8 + MOVL argc+12(FP), AX + LEAL argv+16(FP), BX + MOVL AX, 0(SP) + MOVL BX, 4(SP) + CALL main(SB) + INT $3 + +TEXT main(SB),NOSPLIT,$0 + JMP runtime·rt0_go(SB) diff --git a/src/runtime/rt0_nacl_amd64p32.s b/src/runtime/rt0_nacl_amd64p32.s new file mode 100644 index 000000000..54e4b1de8 --- /dev/null +++ b/src/runtime/rt0_nacl_amd64p32.s @@ -0,0 +1,30 @@ +// Copyright 2013 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#include "textflag.h" + +// NaCl entry on 32-bit x86 has DI pointing at the arg block, which contains: +// +// 0(DI) - cleanup function pointer, always 0 +// 4(DI) - envc +// 8(DI) - argc +// 12(DI) - argv, then 0, then envv, then 0, then auxv +// NaCl entry here is almost the same, except that there +// is no saved caller PC, so 0(FP) is -8(FP) and so on. +TEXT _rt0_amd64p32_nacl(SB),NOSPLIT,$16 + MOVL DI, 0(SP) + CALL runtime·nacl_sysinfo(SB) + MOVL 0(SP), DI + MOVL 8(DI), AX + LEAL 12(DI), BX + MOVL AX, 0(SP) + MOVL BX, 4(SP) + CALL main(SB) + INT $3 + +TEXT main(SB),NOSPLIT,$0 + // Uncomment for fake time like on Go Playground. 
+ //MOVQ $1257894000000000000, AX + //MOVQ AX, runtime·faketime(SB) + JMP runtime·rt0_go(SB) diff --git a/src/runtime/rt0_nacl_arm.s b/src/runtime/rt0_nacl_arm.s new file mode 100644 index 000000000..eadb4782d --- /dev/null +++ b/src/runtime/rt0_nacl_arm.s @@ -0,0 +1,20 @@ +// Copyright 2014 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#include "textflag.h" + +// NaCl entry has: +// 0(FP) - 0 +// 4(FP) - cleanup function pointer, always 0 +// 8(FP) - envc +// 12(FP) - argc +// 16(FP) - argv, then 0, then envv, then 0, then auxv +TEXT _rt0_arm_nacl(SB),NOSPLIT,$-4 + MOVW 8(R13), R0 + MOVW $12(R13), R1 + MOVM.DB.W [R0-R1], (R13) + B main(SB) + +TEXT main(SB),NOSPLIT,$0 + B runtime·rt0_go(SB) diff --git a/src/runtime/rt0_netbsd_386.s b/src/runtime/rt0_netbsd_386.s new file mode 100644 index 000000000..70b853253 --- /dev/null +++ b/src/runtime/rt0_netbsd_386.s @@ -0,0 +1,16 @@ +// Copyright 2009 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#include "textflag.h" + +TEXT _rt0_386_netbsd(SB),NOSPLIT,$8 + MOVL 8(SP), AX + LEAL 12(SP), BX + MOVL AX, 0(SP) + MOVL BX, 4(SP) + CALL main(SB) + INT $3 + +TEXT main(SB),NOSPLIT,$0 + JMP runtime·rt0_go(SB) diff --git a/src/runtime/rt0_netbsd_amd64.s b/src/runtime/rt0_netbsd_amd64.s new file mode 100644 index 000000000..fad56614e --- /dev/null +++ b/src/runtime/rt0_netbsd_amd64.s @@ -0,0 +1,15 @@ +// Copyright 2009 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#include "textflag.h" + +TEXT _rt0_amd64_netbsd(SB),NOSPLIT,$-8 + LEAQ 8(SP), SI // argv + MOVQ 0(SP), DI // argc + MOVQ $main(SB), AX + JMP AX + +TEXT main(SB),NOSPLIT,$-8 + MOVQ $runtime·rt0_go(SB), AX + JMP AX diff --git a/src/runtime/rt0_netbsd_arm.s b/src/runtime/rt0_netbsd_arm.s new file mode 100644 index 000000000..bad66e06c --- /dev/null +++ b/src/runtime/rt0_netbsd_arm.s @@ -0,0 +1,13 @@ +// Copyright 2013 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#include "textflag.h" + +// FreeBSD/NetBSD and Linux use the same linkage to main + +TEXT _rt0_arm_netbsd(SB),NOSPLIT,$-4 + MOVW (R13), R0 // argc + MOVW $4(R13), R1 // argv + MOVM.DB.W [R0-R1], (R13) + B runtime·rt0_go(SB) diff --git a/src/runtime/rt0_openbsd_386.s b/src/runtime/rt0_openbsd_386.s new file mode 100644 index 000000000..f25d2e1cf --- /dev/null +++ b/src/runtime/rt0_openbsd_386.s @@ -0,0 +1,16 @@ +// Copyright 2009 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#include "textflag.h" + +TEXT _rt0_386_openbsd(SB),NOSPLIT,$8 + MOVL 8(SP), AX + LEAL 12(SP), BX + MOVL AX, 0(SP) + MOVL BX, 4(SP) + CALL main(SB) + INT $3 + +TEXT main(SB),NOSPLIT,$0 + JMP runtime·rt0_go(SB) diff --git a/src/runtime/rt0_openbsd_amd64.s b/src/runtime/rt0_openbsd_amd64.s new file mode 100644 index 000000000..58fe66639 --- /dev/null +++ b/src/runtime/rt0_openbsd_amd64.s @@ -0,0 +1,15 @@ +// Copyright 2009 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. 
+ +#include "textflag.h" + +TEXT _rt0_amd64_openbsd(SB),NOSPLIT,$-8 + LEAQ 8(SP), SI // argv + MOVQ 0(SP), DI // argc + MOVQ $main(SB), AX + JMP AX + +TEXT main(SB),NOSPLIT,$-8 + MOVQ $runtime·rt0_go(SB), AX + JMP AX diff --git a/src/runtime/rt0_plan9_386.s b/src/runtime/rt0_plan9_386.s new file mode 100644 index 000000000..c451299ee --- /dev/null +++ b/src/runtime/rt0_plan9_386.s @@ -0,0 +1,23 @@ +// Copyright 2010 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#include "textflag.h" + +TEXT _rt0_386_plan9(SB),NOSPLIT,$12 + MOVL AX, _tos(SB) + LEAL 8(SP), AX + MOVL AX, _privates(SB) + MOVL $1, _nprivates(SB) + CALL runtime·asminit(SB) + MOVL inargc-4(FP), AX + MOVL AX, 0(SP) + LEAL inargv+0(FP), AX + MOVL AX, 4(SP) + CALL runtime·rt0_go(SB) + +DATA runtime·isplan9(SB)/4, $1 +GLOBL runtime·isplan9(SB), NOPTR, $4 +GLOBL _tos(SB), NOPTR, $4 +GLOBL _privates(SB), NOPTR, $4 +GLOBL _nprivates(SB), NOPTR, $4 diff --git a/src/runtime/rt0_plan9_amd64.s b/src/runtime/rt0_plan9_amd64.s new file mode 100644 index 000000000..ec2d9ec82 --- /dev/null +++ b/src/runtime/rt0_plan9_amd64.s @@ -0,0 +1,21 @@ +// Copyright 2010 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#include "textflag.h" + +TEXT _rt0_amd64_plan9(SB),NOSPLIT,$24 + MOVQ AX, _tos(SB) + LEAQ 16(SP), AX + MOVQ AX, _privates(SB) + MOVL $1, _nprivates(SB) + MOVL inargc-8(FP), DI + LEAQ inargv+0(FP), SI + MOVQ $runtime·rt0_go(SB), AX + JMP AX + +DATA runtime·isplan9(SB)/4, $1 +GLOBL runtime·isplan9(SB), NOPTR, $4 +GLOBL _tos(SB), NOPTR, $8 +GLOBL _privates(SB), NOPTR, $8 +GLOBL _nprivates(SB), NOPTR, $4 diff --git a/src/runtime/rt0_solaris_amd64.s b/src/runtime/rt0_solaris_amd64.s new file mode 100644 index 000000000..5997cbf8e --- /dev/null +++ b/src/runtime/rt0_solaris_amd64.s @@ -0,0 +1,18 @@ +// Copyright 2014 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#include "textflag.h" + +TEXT _rt0_amd64_solaris(SB),NOSPLIT,$-8 + LEAQ 8(SP), SI // argv + MOVQ 0(SP), DI // argc + MOVQ $main(SB), AX + JMP AX + +TEXT main(SB),NOSPLIT,$-8 + MOVQ $runtime·rt0_go(SB), AX + JMP AX + +DATA runtime·issolaris(SB)/4, $1 +GLOBL runtime·issolaris(SB), NOPTR, $4 diff --git a/src/runtime/rt0_windows_386.s b/src/runtime/rt0_windows_386.s new file mode 100644 index 000000000..3c2deda90 --- /dev/null +++ b/src/runtime/rt0_windows_386.s @@ -0,0 +1,20 @@ +// Copyright 2009 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#include "textflag.h" + +TEXT _rt0_386_windows(SB),NOSPLIT,$12 + MOVL 12(SP), AX + LEAL 16(SP), BX + MOVL AX, 4(SP) + MOVL BX, 8(SP) + MOVL $-1, 0(SP) // return PC for main + JMP main(SB) + +TEXT main(SB),NOSPLIT,$0 + JMP runtime·rt0_go(SB) + + +DATA runtime·iswindows(SB)/4, $1 +GLOBL runtime·iswindows(SB), NOPTR, $4 diff --git a/src/runtime/rt0_windows_amd64.s b/src/runtime/rt0_windows_amd64.s new file mode 100644 index 000000000..197f52e11 --- /dev/null +++ b/src/runtime/rt0_windows_amd64.s @@ -0,0 +1,19 @@ +// Copyright 2011 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. 
+ +#include "zasm_GOOS_GOARCH.h" +#include "textflag.h" + +TEXT _rt0_amd64_windows(SB),NOSPLIT,$-8 + LEAQ 8(SP), SI // argv + MOVQ 0(SP), DI // argc + MOVQ $main(SB), AX + JMP AX + +TEXT main(SB),NOSPLIT,$-8 + MOVQ $runtime·rt0_go(SB), AX + JMP AX + +DATA runtime·iswindows(SB)/4, $1 +GLOBL runtime·iswindows(SB), NOPTR, $4 diff --git a/src/runtime/rune.go b/src/runtime/rune.go new file mode 100644 index 000000000..a9f683581 --- /dev/null +++ b/src/runtime/rune.go @@ -0,0 +1,219 @@ +/* + * The authors of this software are Rob Pike and Ken Thompson. + * Copyright (c) 2002 by Lucent Technologies. + * Portions Copyright 2009 The Go Authors. All rights reserved. + * Permission to use, copy, modify, and distribute this software for any + * purpose without fee is hereby granted, provided that this entire notice + * is included in all copies of any software which is or includes a copy + * or modification of this software and in all copies of the supporting + * documentation for such software. + * THIS SOFTWARE IS BEING PROVIDED "AS IS", WITHOUT ANY EXPRESS OR IMPLIED + * WARRANTY. IN PARTICULAR, NEITHER THE AUTHORS NOR LUCENT TECHNOLOGIES MAKE ANY + * REPRESENTATION OR WARRANTY OF ANY KIND CONCERNING THE MERCHANTABILITY + * OF THIS SOFTWARE OR ITS FITNESS FOR ANY PARTICULAR PURPOSE. + */ + +/* + * This code is copied, with slight editing due to type differences, + * from a subset of ../lib9/utf/rune.c + */ + +package runtime + +const ( + bit1 = 7 + bitx = 6 + bit2 = 5 + bit3 = 4 + bit4 = 3 + bit5 = 2 + + t1 = ((1 << (bit1 + 1)) - 1) ^ 0xFF /* 0000 0000 */ + tx = ((1 << (bitx + 1)) - 1) ^ 0xFF /* 1000 0000 */ + t2 = ((1 << (bit2 + 1)) - 1) ^ 0xFF /* 1100 0000 */ + t3 = ((1 << (bit3 + 1)) - 1) ^ 0xFF /* 1110 0000 */ + t4 = ((1 << (bit4 + 1)) - 1) ^ 0xFF /* 1111 0000 */ + t5 = ((1 << (bit5 + 1)) - 1) ^ 0xFF /* 1111 1000 */ + + rune1 = (1 << (bit1 + 0*bitx)) - 1 /* 0000 0000 0111 1111 */ + rune2 = (1 << (bit2 + 1*bitx)) - 1 /* 0000 0111 1111 1111 */ + rune3 = (1 << (bit3 + 2*bitx)) - 1 /* 1111 1111 1111 1111 */ + rune4 = (1 << (bit4 + 3*bitx)) - 1 /* 0001 1111 1111 1111 1111 1111 */ + + maskx = (1 << bitx) - 1 /* 0011 1111 */ + testx = maskx ^ 0xFF /* 1100 0000 */ + + runeerror = 0xFFFD + runeself = 0x80 + + surrogateMin = 0xD800 + surrogateMax = 0xDFFF + + bad = runeerror + + runemax = 0x10FFFF /* maximum rune value */ +) + +/* + * Modified by Wei-Hwa Huang, Google Inc., on 2004-09-24 + * This is a slower but "safe" version of the old chartorune + * that works on strings that are not necessarily null-terminated. + * + * If you know for sure that your string is null-terminated, + * chartorune will be a bit faster. + * + * It is guaranteed not to attempt to access "length" + * past the incoming pointer. This is to avoid + * possible access violations. If the string appears to be + * well-formed but incomplete (i.e., to get the whole Rune + * we'd need to read past str+length) then we'll set the Rune + * to Bad and return 0. + * + * Note that if we have decoding problems for other + * reasons, we return 1 instead of 0. 
+ */ +func charntorune(s string) (rune, int) { + /* When we're not allowed to read anything */ + if len(s) <= 0 { + return bad, 1 + } + + /* + * one character sequence (7-bit value) + * 00000-0007F => T1 + */ + c := s[0] + if c < tx { + return rune(c), 1 + } + + // If we can't read more than one character we must stop + if len(s) <= 1 { + return bad, 1 + } + + /* + * two character sequence (11-bit value) + * 0080-07FF => t2 tx + */ + c1 := s[1] ^ tx + if (c1 & testx) != 0 { + return bad, 1 + } + if c < t3 { + if c < t2 { + return bad, 1 + } + l := ((rune(c) << bitx) | rune(c1)) & rune2 + if l <= rune1 { + return bad, 1 + } + return l, 2 + } + + // If we can't read more than two characters we must stop + if len(s) <= 2 { + return bad, 1 + } + + /* + * three character sequence (16-bit value) + * 0800-FFFF => t3 tx tx + */ + c2 := s[2] ^ tx + if (c2 & testx) != 0 { + return bad, 1 + } + if c < t4 { + l := ((((rune(c) << bitx) | rune(c1)) << bitx) | rune(c2)) & rune3 + if l <= rune2 { + return bad, 1 + } + if surrogateMin <= l && l <= surrogateMax { + return bad, 1 + } + return l, 3 + } + + if len(s) <= 3 { + return bad, 1 + } + + /* + * four character sequence (21-bit value) + * 10000-1FFFFF => t4 tx tx tx + */ + c3 := s[3] ^ tx + if (c3 & testx) != 0 { + return bad, 1 + } + if c < t5 { + l := ((((((rune(c) << bitx) | rune(c1)) << bitx) | rune(c2)) << bitx) | rune(c3)) & rune4 + if l <= rune3 || l > runemax { + return bad, 1 + } + return l, 4 + } + + // Support for 5-byte or longer UTF-8 would go here, but + // since we don't have that, we'll just return bad. + return bad, 1 +} + +// runetochar converts r to bytes and writes the result to str. +// returns the number of bytes generated. +func runetochar(str []byte, r rune) int { + /* runes are signed, so convert to unsigned for range check. */ + c := uint32(r) + /* + * one character sequence + * 00000-0007F => 00-7F + */ + if c <= rune1 { + str[0] = byte(c) + return 1 + } + /* + * two character sequence + * 0080-07FF => t2 tx + */ + if c <= rune2 { + str[0] = byte(t2 | (c >> (1 * bitx))) + str[1] = byte(tx | (c & maskx)) + return 2 + } + + /* + * If the rune is out of range or a surrogate half, convert it to the error rune. + * Do this test here because the error rune encodes to three bytes. + * Doing it earlier would duplicate work, since an out of range + * rune wouldn't have fit in one or two bytes. + */ + if c > runemax { + c = runeerror + } + if surrogateMin <= c && c <= surrogateMax { + c = runeerror + } + + /* + * three character sequence + * 0800-FFFF => t3 tx tx + */ + if c <= rune3 { + str[0] = byte(t3 | (c >> (2 * bitx))) + str[1] = byte(tx | ((c >> (1 * bitx)) & maskx)) + str[2] = byte(tx | (c & maskx)) + return 3 + } + + /* + * four character sequence (21-bit value) + * 10000-1FFFFF => t4 tx tx tx + */ + str[0] = byte(t4 | (c >> (3 * bitx))) + str[1] = byte(tx | ((c >> (2 * bitx)) & maskx)) + str[2] = byte(tx | ((c >> (1 * bitx)) & maskx)) + str[3] = byte(tx | (c & maskx)) + return 4 +} diff --git a/src/runtime/runtime-gdb.py b/src/runtime/runtime-gdb.py new file mode 100644 index 000000000..eedac7cf4 --- /dev/null +++ b/src/runtime/runtime-gdb.py @@ -0,0 +1,478 @@ +# Copyright 2010 The Go Authors. All rights reserved. +# Use of this source code is governed by a BSD-style +# license that can be found in the LICENSE file. + +"""GDB Pretty printers and convenience functions for Go's runtime structures. + +This script is loaded by GDB when it finds a .debug_gdb_scripts +section in the compiled binary. 
The [68]l linkers emit this with a +path to this file based on the path to the runtime package. +""" + +# Known issues: +# - pretty printing only works for the 'native' strings. E.g. 'type +# foo string' will make foo a plain struct in the eyes of gdb, +# circumventing the pretty print triggering. + + +from __future__ import print_function +import re +import sys + +print("Loading Go Runtime support.", file=sys.stderr) +#http://python3porting.com/differences.html +if sys.version > '3': + xrange = range +# allow to manually reload while developing +goobjfile = gdb.current_objfile() or gdb.objfiles()[0] +goobjfile.pretty_printers = [] + +# +# Pretty Printers +# + + +class StringTypePrinter: + "Pretty print Go strings." + + pattern = re.compile(r'^struct string$') + + def __init__(self, val): + self.val = val + + def display_hint(self): + return 'string' + + def to_string(self): + l = int(self.val['len']) + return self.val['str'].string("utf-8", "ignore", l) + + +class SliceTypePrinter: + "Pretty print slices." + + pattern = re.compile(r'^struct \[\]') + + def __init__(self, val): + self.val = val + + def display_hint(self): + return 'array' + + def to_string(self): + return str(self.val.type)[6:] # skip 'struct ' + + def children(self): + if self.val["len"] > self.val["cap"]: + return + ptr = self.val["array"] + for idx in range(int(self.val["len"])): + yield ('[{0}]'.format(idx), (ptr + idx).dereference()) + + +class MapTypePrinter: + """Pretty print map[K]V types. + + Map-typed go variables are really pointers. dereference them in gdb + to inspect their contents with this pretty printer. + """ + + pattern = re.compile(r'^map\[.*\].*$') + + def __init__(self, val): + self.val = val + + def display_hint(self): + return 'map' + + def to_string(self): + return str(self.val.type) + + def children(self): + B = self.val['b'] + buckets = self.val['buckets'] + oldbuckets = self.val['oldbuckets'] + flags = self.val['flags'] + inttype = self.val['hash0'].type + cnt = 0 + for bucket in xrange(2 ** int(B)): + bp = buckets + bucket + if oldbuckets: + oldbucket = bucket & (2 ** (B - 1) - 1) + oldbp = oldbuckets + oldbucket + oldb = oldbp.dereference() + if (oldb['overflow'].cast(inttype) & 1) == 0: # old bucket not evacuated yet + if bucket >= 2 ** (B - 1): + continue # already did old bucket + bp = oldbp + while bp: + b = bp.dereference() + for i in xrange(8): + if b['tophash'][i] != 0: + k = b['keys'][i] + v = b['values'][i] + if flags & 1: + k = k.dereference() + if flags & 2: + v = v.dereference() + yield str(cnt), k + yield str(cnt + 1), v + cnt += 2 + bp = b['overflow'] + + +class ChanTypePrinter: + """Pretty print chan[T] types. + + Chan-typed go variables are really pointers. dereference them in gdb + to inspect their contents with this pretty printer. + """ + + pattern = re.compile(r'^struct hchan<.*>$') + + def __init__(self, val): + self.val = val + + def display_hint(self): + return 'array' + + def to_string(self): + return str(self.val.type) + + def children(self): + # see chan.c chanbuf(). et is the type stolen from hchan<T>::recvq->first->elem + et = [x.type for x in self.val['recvq']['first'].type.target().fields() if x.name == 'elem'][0] + ptr = (self.val.address + 1).cast(et.pointer()) + for i in range(self.val["qcount"]): + j = (self.val["recvx"] + i) % self.val["dataqsiz"] + yield ('[{0}]'.format(i), (ptr + j).dereference()) + + +# +# Register all the *Printer classes above. 
+# + +def makematcher(klass): + def matcher(val): + try: + if klass.pattern.match(str(val.type)): + return klass(val) + except Exception: + pass + return matcher + +goobjfile.pretty_printers.extend([makematcher(var) for var in vars().values() if hasattr(var, 'pattern')]) + +# +# For reference, this is what we're trying to do: +# eface: p *(*(struct 'runtime.rtype'*)'main.e'->type_->data)->string +# iface: p *(*(struct 'runtime.rtype'*)'main.s'->tab->Type->data)->string +# +# interface types can't be recognized by their name, instead we check +# if they have the expected fields. Unfortunately the mapping of +# fields to python attributes in gdb.py isn't complete: you can't test +# for presence other than by trapping. + + +def is_iface(val): + try: + return str(val['tab'].type) == "struct runtime.itab *" and str(val['data'].type) == "void *" + except gdb.error: + pass + + +def is_eface(val): + try: + return str(val['_type'].type) == "struct runtime._type *" and str(val['data'].type) == "void *" + except gdb.error: + pass + + +def lookup_type(name): + try: + return gdb.lookup_type(name) + except gdb.error: + pass + try: + return gdb.lookup_type('struct ' + name) + except gdb.error: + pass + try: + return gdb.lookup_type('struct ' + name[1:]).pointer() + except gdb.error: + pass + +_rctp_type = gdb.lookup_type("struct runtime.rtype").pointer() + + +def iface_commontype(obj): + if is_iface(obj): + go_type_ptr = obj['tab']['_type'] + elif is_eface(obj): + go_type_ptr = obj['_type'] + else: + return + + return go_type_ptr.cast(_rctp_type).dereference() + + +def iface_dtype(obj): + "Decode type of the data field of an eface or iface struct." + # known issue: dtype_name decoded from runtime.rtype is "nested.Foo" + # but the dwarf table lists it as "full/path/to/nested.Foo" + + dynamic_go_type = iface_commontype(obj) + if dynamic_go_type is None: + return + dtype_name = dynamic_go_type['string'].dereference()['str'].string() + + dynamic_gdb_type = lookup_type(dtype_name) + if dynamic_gdb_type is None: + return + + type_size = int(dynamic_go_type['size']) + uintptr_size = int(dynamic_go_type['size'].type.sizeof) # size is itself an uintptr + if type_size > uintptr_size: + dynamic_gdb_type = dynamic_gdb_type.pointer() + + return dynamic_gdb_type + + +def iface_dtype_name(obj): + "Decode type name of the data field of an eface or iface struct." 
+ + dynamic_go_type = iface_commontype(obj) + if dynamic_go_type is None: + return + return dynamic_go_type['string'].dereference()['str'].string() + + +class IfacePrinter: + """Pretty print interface values + + Casts the data field to the appropriate dynamic type.""" + + def __init__(self, val): + self.val = val + + def display_hint(self): + return 'string' + + def to_string(self): + if self.val['data'] == 0: + return 0x0 + try: + dtype = iface_dtype(self.val) + except Exception: + return "<bad dynamic type>" + + if dtype is None: # trouble looking up, print something reasonable + return "({0}){0}".format(iface_dtype_name(self.val), self.val['data']) + + try: + return self.val['data'].cast(dtype).dereference() + except Exception: + pass + return self.val['data'].cast(dtype) + + +def ifacematcher(val): + if is_iface(val) or is_eface(val): + return IfacePrinter(val) + +goobjfile.pretty_printers.append(ifacematcher) + +# +# Convenience Functions +# + + +class GoLenFunc(gdb.Function): + "Length of strings, slices, maps or channels" + + how = ((StringTypePrinter, 'len'), (SliceTypePrinter, 'len'), (MapTypePrinter, 'count'), (ChanTypePrinter, 'qcount')) + + def __init__(self): + gdb.Function.__init__(self, "len") + + def invoke(self, obj): + typename = str(obj.type) + for klass, fld in self.how: + if klass.pattern.match(typename): + return obj[fld] + + +class GoCapFunc(gdb.Function): + "Capacity of slices or channels" + + how = ((SliceTypePrinter, 'cap'), (ChanTypePrinter, 'dataqsiz')) + + def __init__(self): + gdb.Function.__init__(self, "cap") + + def invoke(self, obj): + typename = str(obj.type) + for klass, fld in self.how: + if klass.pattern.match(typename): + return obj[fld] + + +class DTypeFunc(gdb.Function): + """Cast Interface values to their dynamic type. + + For non-interface types this behaves as the identity operation. + """ + + def __init__(self): + gdb.Function.__init__(self, "dtype") + + def invoke(self, obj): + try: + return obj['data'].cast(iface_dtype(obj)) + except gdb.error: + pass + return obj + +# +# Commands +# + +sts = ('idle', 'runnable', 'running', 'syscall', 'waiting', 'moribund', 'dead', 'recovery') + + +def linked_list(ptr, linkfield): + while ptr: + yield ptr + ptr = ptr[linkfield] + + +class GoroutinesCmd(gdb.Command): + "List all goroutines." + + def __init__(self): + gdb.Command.__init__(self, "info goroutines", gdb.COMMAND_STACK, gdb.COMPLETE_NONE) + + def invoke(self, _arg, _from_tty): + # args = gdb.string_to_argv(arg) + vp = gdb.lookup_type('void').pointer() + for ptr in linked_list(gdb.parse_and_eval("'runtime.allg'"), 'alllink'): + if ptr['status'] == 6: # 'gdead' + continue + s = ' ' + if ptr['m']: + s = '*' + pc = ptr['sched']['pc'].cast(vp) + # python2 will not cast pc (type void*) to an int cleanly + # instead python2 and python3 work with the hex string representation + # of the void pointer which we can parse back into an int. + # int(pc) will not work. + try: + #python3 / newer versions of gdb + pc = int(pc) + except gdb.error: + pc = int(str(pc), 16) + blk = gdb.block_for_pc(pc) + print(s, ptr['goid'], "{0:8s}".format(sts[int(ptr['status'])]), blk.function) + + +def find_goroutine(goid): + """ + find_goroutine attempts to find the goroutine identified by goid. + It returns a touple of gdv.Value's representing the stack pointer + and program counter pointer for the goroutine. 
+ + @param int goid + + @return tuple (gdb.Value, gdb.Value) + """ + vp = gdb.lookup_type('void').pointer() + for ptr in linked_list(gdb.parse_and_eval("'runtime.allg'"), 'alllink'): + if ptr['status'] == 6: # 'gdead' + continue + if ptr['goid'] == goid: + return (ptr['sched'][x].cast(vp) for x in ('pc', 'sp')) + return None, None + + +class GoroutineCmd(gdb.Command): + """Execute gdb command in the context of goroutine <goid>. + + Switch PC and SP to the ones in the goroutine's G structure, + execute an arbitrary gdb command, and restore PC and SP. + + Usage: (gdb) goroutine <goid> <gdbcmd> + + Note that it is ill-defined to modify state in the context of a goroutine. + Restrict yourself to inspecting values. + """ + + def __init__(self): + gdb.Command.__init__(self, "goroutine", gdb.COMMAND_STACK, gdb.COMPLETE_NONE) + + def invoke(self, arg, _from_tty): + goid, cmd = arg.split(None, 1) + goid = gdb.parse_and_eval(goid) + pc, sp = find_goroutine(int(goid)) + if not pc: + print("No such goroutine: ", goid) + return + try: + #python3 / newer versions of gdb + pc = int(pc) + except gdb.error: + pc = int(str(pc), 16) + save_frame = gdb.selected_frame() + gdb.parse_and_eval('$save_pc = $pc') + gdb.parse_and_eval('$save_sp = $sp') + gdb.parse_and_eval('$pc = {0}'.format(str(pc))) + gdb.parse_and_eval('$sp = {0}'.format(str(sp))) + try: + gdb.execute(cmd) + finally: + gdb.parse_and_eval('$pc = $save_pc') + gdb.parse_and_eval('$sp = $save_sp') + save_frame.select() + + +class GoIfaceCmd(gdb.Command): + "Print Static and dynamic interface types" + + def __init__(self): + gdb.Command.__init__(self, "iface", gdb.COMMAND_DATA, gdb.COMPLETE_SYMBOL) + + def invoke(self, arg, _from_tty): + for obj in gdb.string_to_argv(arg): + try: + #TODO fix quoting for qualified variable names + obj = gdb.parse_and_eval(str(obj)) + except Exception as e: + print("Can't parse ", obj, ": ", e) + continue + + if obj['data'] == 0: + dtype = "nil" + else: + dtype = iface_dtype(obj) + + if dtype is None: + print("Not an interface: ", obj.type) + continue + + print("{0}: {1}".format(obj.type, dtype)) + +# TODO: print interface's methods and dynamic type's func pointers thereof. +#rsc: "to find the number of entries in the itab's Fn field look at +# itab.inter->numMethods +# i am sure i have the names wrong but look at the interface type +# and its method count" +# so Itype will start with a commontype which has kind = interface + +# +# Register all convenience functions and CLI commands +# +GoLenFunc() +GoCapFunc() +DTypeFunc() +GoroutinesCmd() +GoroutineCmd() +GoIfaceCmd() diff --git a/src/runtime/runtime.c b/src/runtime/runtime.c new file mode 100644 index 000000000..c823691ec --- /dev/null +++ b/src/runtime/runtime.c @@ -0,0 +1,399 @@ +// Copyright 2009 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#include "runtime.h" +#include "stack.h" +#include "arch_GOARCH.h" +#include "textflag.h" +#include "malloc.h" + +// Keep a cached value to make gotraceback fast, +// since we call it on every call to gentraceback. +// The cached value is a uint32 in which the low bit +// is the "crash" setting and the top 31 bits are the +// gotraceback value. +static uint32 traceback_cache = 2<<1; + +// The GOTRACEBACK environment variable controls the +// behavior of a Go program that is crashing and exiting. 
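The settings enumerated in the comment that continues below are easiest to compare with a deliberately crashing program. A minimal sketch (any program that panics or faults will do); run it with different GOTRACEBACK values in the environment and compare the output:

package main

// Running this under different GOTRACEBACK settings (see the values listed
// in the comment just below) changes how much the runtime prints when the
// program dies: 0 suppresses the tracebacks, 1 (the default) shows them but
// excludes runtime frames, 2 includes runtime frames, and "crash" also
// aborts so the OS can produce a core dump where supported.
func main() {
	var p *int
	_ = *p // nil pointer dereference: the runtime prints a traceback and exits
}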
+// GOTRACEBACK=0 suppress all tracebacks +// GOTRACEBACK=1 default behavior - show tracebacks but exclude runtime frames +// GOTRACEBACK=2 show tracebacks including runtime frames +// GOTRACEBACK=crash show tracebacks including runtime frames, then crash (core dump etc) +#pragma textflag NOSPLIT +int32 +runtime·gotraceback(bool *crash) +{ + if(crash != nil) + *crash = false; + if(g->m->traceback != 0) + return g->m->traceback; + if(crash != nil) + *crash = traceback_cache&1; + return traceback_cache>>1; +} + +int32 +runtime·mcmp(byte *s1, byte *s2, uintptr n) +{ + uintptr i; + byte c1, c2; + + for(i=0; i<n; i++) { + c1 = s1[i]; + c2 = s2[i]; + if(c1 < c2) + return -1; + if(c1 > c2) + return +1; + } + return 0; +} + + +byte* +runtime·mchr(byte *p, byte c, byte *ep) +{ + for(; p < ep; p++) + if(*p == c) + return p; + return nil; +} + +static int32 argc; + +#pragma dataflag NOPTR /* argv not a heap pointer */ +static uint8** argv; + +extern Slice runtime·argslice; +extern Slice runtime·envs; + +void (*runtime·sysargs)(int32, uint8**); + +void +runtime·args(int32 c, uint8 **v) +{ + argc = c; + argv = v; + if(runtime·sysargs != nil) + runtime·sysargs(c, v); +} + +int32 runtime·isplan9; +int32 runtime·issolaris; +int32 runtime·iswindows; + +// Information about what cpu features are available. +// Set on startup in asm_{x86/amd64}.s. +uint32 runtime·cpuid_ecx; +uint32 runtime·cpuid_edx; + +void +runtime·goargs(void) +{ + String *s; + int32 i; + + // for windows implementation see "os" package + if(Windows) + return; + + runtime·argslice = runtime·makeStringSlice(argc); + s = (String*)runtime·argslice.array; + for(i=0; i<argc; i++) + s[i] = runtime·gostringnocopy(argv[i]); +} + +void +runtime·goenvs_unix(void) +{ + String *s; + int32 i, n; + + for(n=0; argv[argc+1+n] != 0; n++) + ; + + runtime·envs = runtime·makeStringSlice(n); + s = (String*)runtime·envs.array; + for(i=0; i<n; i++) + s[i] = runtime·gostringnocopy(argv[argc+1+i]); +} + +#pragma textflag NOSPLIT +Slice +runtime·environ() +{ + return runtime·envs; +} + +int32 +runtime·atoi(byte *p) +{ + int32 n; + + n = 0; + while('0' <= *p && *p <= '9') + n = n*10 + *p++ - '0'; + return n; +} + +static void +TestAtomic64(void) +{ + uint64 z64, x64; + + z64 = 42; + x64 = 0; + PREFETCH(&z64); + if(runtime·cas64(&z64, x64, 1)) + runtime·throw("cas64 failed"); + if(x64 != 0) + runtime·throw("cas64 failed"); + x64 = 42; + if(!runtime·cas64(&z64, x64, 1)) + runtime·throw("cas64 failed"); + if(x64 != 42 || z64 != 1) + runtime·throw("cas64 failed"); + if(runtime·atomicload64(&z64) != 1) + runtime·throw("load64 failed"); + runtime·atomicstore64(&z64, (1ull<<40)+1); + if(runtime·atomicload64(&z64) != (1ull<<40)+1) + runtime·throw("store64 failed"); + if(runtime·xadd64(&z64, (1ull<<40)+1) != (2ull<<40)+2) + runtime·throw("xadd64 failed"); + if(runtime·atomicload64(&z64) != (2ull<<40)+2) + runtime·throw("xadd64 failed"); + if(runtime·xchg64(&z64, (3ull<<40)+3) != (2ull<<40)+2) + runtime·throw("xchg64 failed"); + if(runtime·atomicload64(&z64) != (3ull<<40)+3) + runtime·throw("xchg64 failed"); +} + +void +runtime·check(void) +{ + int8 a; + uint8 b; + int16 c; + uint16 d; + int32 e; + uint32 f; + int64 g; + uint64 h; + float32 i, i1; + float64 j, j1; + byte *k, *k1; + uint16* l; + struct x1 { + byte x; + }; + struct y1 { + struct x1 x1; + byte y; + }; + + if(sizeof(a) != 1) runtime·throw("bad a"); + if(sizeof(b) != 1) runtime·throw("bad b"); + if(sizeof(c) != 2) runtime·throw("bad c"); + if(sizeof(d) != 2) runtime·throw("bad d"); + if(sizeof(e) != 4) 
runtime·throw("bad e"); + if(sizeof(f) != 4) runtime·throw("bad f"); + if(sizeof(g) != 8) runtime·throw("bad g"); + if(sizeof(h) != 8) runtime·throw("bad h"); + if(sizeof(i) != 4) runtime·throw("bad i"); + if(sizeof(j) != 8) runtime·throw("bad j"); + if(sizeof(k) != sizeof(uintptr)) runtime·throw("bad k"); + if(sizeof(l) != sizeof(uintptr)) runtime·throw("bad l"); + if(sizeof(struct x1) != 1) runtime·throw("bad sizeof x1"); + if(offsetof(struct y1, y) != 1) runtime·throw("bad offsetof y1.y"); + if(sizeof(struct y1) != 2) runtime·throw("bad sizeof y1"); + + if(runtime·timediv(12345LL*1000000000+54321, 1000000000, &e) != 12345 || e != 54321) + runtime·throw("bad timediv"); + + uint32 z; + z = 1; + if(!runtime·cas(&z, 1, 2)) + runtime·throw("cas1"); + if(z != 2) + runtime·throw("cas2"); + + z = 4; + if(runtime·cas(&z, 5, 6)) + runtime·throw("cas3"); + if(z != 4) + runtime·throw("cas4"); + + k = (byte*)0xfedcb123; + if(sizeof(void*) == 8) + k = (byte*)((uintptr)k<<10); + if(runtime·casp((void**)&k, nil, nil)) + runtime·throw("casp1"); + k1 = k+1; + if(!runtime·casp((void**)&k, k, k1)) + runtime·throw("casp2"); + if(k != k1) + runtime·throw("casp3"); + + *(uint64*)&j = ~0ULL; + if(j == j) + runtime·throw("float64nan"); + if(!(j != j)) + runtime·throw("float64nan1"); + + *(uint64*)&j1 = ~1ULL; + if(j == j1) + runtime·throw("float64nan2"); + if(!(j != j1)) + runtime·throw("float64nan3"); + + *(uint32*)&i = ~0UL; + if(i == i) + runtime·throw("float32nan"); + if(!(i != i)) + runtime·throw("float32nan1"); + + *(uint32*)&i1 = ~1UL; + if(i == i1) + runtime·throw("float32nan2"); + if(!(i != i1)) + runtime·throw("float32nan3"); + + TestAtomic64(); + + if(FixedStack != runtime·round2(FixedStack)) + runtime·throw("FixedStack is not power-of-2"); +} + +#pragma dataflag NOPTR +DebugVars runtime·debug; + +typedef struct DbgVar DbgVar; +struct DbgVar +{ + int8* name; + int32* value; +}; + +// Do we report invalid pointers found during stack or heap scans? +int32 runtime·invalidptr = 1; + +#pragma dataflag NOPTR /* dbgvar has no heap pointers */ +static DbgVar dbgvar[] = { + {"allocfreetrace", &runtime·debug.allocfreetrace}, + {"invalidptr", &runtime·invalidptr}, + {"efence", &runtime·debug.efence}, + {"gctrace", &runtime·debug.gctrace}, + {"gcdead", &runtime·debug.gcdead}, + {"scheddetail", &runtime·debug.scheddetail}, + {"schedtrace", &runtime·debug.schedtrace}, + {"scavenge", &runtime·debug.scavenge}, +}; + +void +runtime·parsedebugvars(void) +{ + byte *p; + intgo i, n; + + p = runtime·getenv("GODEBUG"); + if(p != nil){ + for(;;) { + for(i=0; i<nelem(dbgvar); i++) { + n = runtime·findnull((byte*)dbgvar[i].name); + if(runtime·mcmp(p, (byte*)dbgvar[i].name, n) == 0 && p[n] == '=') + *dbgvar[i].value = runtime·atoi(p+n+1); + } + p = runtime·strstr(p, (byte*)","); + if(p == nil) + break; + p++; + } + } + + p = runtime·getenv("GOTRACEBACK"); + if(p == nil) + p = (byte*)""; + if(p[0] == '\0') + traceback_cache = 1<<1; + else if(runtime·strcmp(p, (byte*)"crash") == 0) + traceback_cache = (2<<1) | 1; + else + traceback_cache = runtime·atoi(p)<<1; +} + +// Poor mans 64-bit division. +// This is a very special function, do not use it if you are not sure what you are doing. +// int64 division is lowered into _divv() call on 386, which does not fit into nosplit functions. +// Handles overflow in a time-specific manner. 
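To make the shift-and-subtract scheme concrete before the C routine that follows, here is a rough Go rendering of the same idea. It is illustrative only, not the runtime's implementation; the function and variable names are made up for this sketch, and the test values are the ones runtime·check uses above.

package main

import "fmt"

// timediv64 divides v by div the way runtime·timediv below does: it peels
// off div<<bit for bit = 30..0, accumulating the quotient bit by bit, and
// returns a saturated 0x7fffffff if the quotient does not fit in 31 bits.
func timediv64(v int64, div int32, rem *int32) int32 {
	res := int32(0)
	for bit := 30; bit >= 0; bit-- {
		if v >= int64(div)<<uint(bit) {
			v -= int64(div) << uint(bit)
			res += 1 << uint(bit)
		}
	}
	if v >= int64(div) { // quotient needed more than 31 bits: overflow
		if rem != nil {
			*rem = 0
		}
		return 0x7fffffff
	}
	if rem != nil {
		*rem = int32(v)
	}
	return res
}

func main() {
	var rem int32
	q := timediv64(12345*1e9+54321, 1e9, &rem)
	fmt.Println(q, rem) // 12345 54321, matching the check in runtime·check
}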
+#pragma textflag NOSPLIT +int32 +runtime·timediv(int64 v, int32 div, int32 *rem) +{ + int32 res, bit; + + res = 0; + for(bit = 30; bit >= 0; bit--) { + if(v >= ((int64)div<<bit)) { + v = v - ((int64)div<<bit); + res += 1<<bit; + } + } + if(v >= (int64)div) { + if(rem != nil) + *rem = 0; + return 0x7fffffff; + } + if(rem != nil) + *rem = v; + return res; +} + +// Helpers for Go. Must be NOSPLIT, must only call NOSPLIT functions, and must not block. + +#pragma textflag NOSPLIT +G* +runtime·getg(void) +{ + return g; +} + +#pragma textflag NOSPLIT +M* +runtime·acquirem(void) +{ + g->m->locks++; + return g->m; +} + +#pragma textflag NOSPLIT +void +runtime·releasem(M *mp) +{ + mp->locks--; + if(mp->locks == 0 && g->preempt) { + // restore the preemption request in case we've cleared it in newstack + g->stackguard0 = StackPreempt; + } +} + +#pragma textflag NOSPLIT +MCache* +runtime·gomcache(void) +{ + return g->m->mcache; +} + +#pragma textflag NOSPLIT +Slice +reflect·typelinks(void) +{ + extern Type *runtime·typelink[], *runtime·etypelink[]; + Slice ret; + + ret.array = (byte*)runtime·typelink; + ret.len = runtime·etypelink - runtime·typelink; + ret.cap = ret.len; + return ret; +} diff --git a/src/runtime/runtime.go b/src/runtime/runtime.go new file mode 100644 index 000000000..4e4e1d17a --- /dev/null +++ b/src/runtime/runtime.go @@ -0,0 +1,60 @@ +// Copyright 2009 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package runtime + +var ticks struct { + lock mutex + val uint64 +} + +var tls0 [8]uintptr // available storage for m0's TLS; not necessarily used; opaque to GC + +// Note: Called by runtime/pprof in addition to runtime code. +func tickspersecond() int64 { + r := int64(atomicload64(&ticks.val)) + if r != 0 { + return r + } + lock(&ticks.lock) + r = int64(ticks.val) + if r == 0 { + t0 := nanotime() + c0 := cputicks() + usleep(100 * 1000) + t1 := nanotime() + c1 := cputicks() + if t1 == t0 { + t1++ + } + r = (c1 - c0) * 1000 * 1000 * 1000 / (t1 - t0) + if r == 0 { + r++ + } + atomicstore64(&ticks.val, uint64(r)) + } + unlock(&ticks.lock) + return r +} + +func makeStringSlice(n int) []string { + return make([]string, n) +} + +// TODO: Move to parfor.go when parfor.c becomes parfor.go. +func parforalloc(nthrmax uint32) *parfor { + return &parfor{ + thr: &make([]parforthread, nthrmax)[0], + nthrmax: nthrmax, + } +} + +var envs []string +var argslice []string + +// called from syscall +func runtime_envs() []string { return envs } + +// called from os +func runtime_args() []string { return argslice } diff --git a/src/runtime/runtime.h b/src/runtime/runtime.h new file mode 100644 index 000000000..177a1287e --- /dev/null +++ b/src/runtime/runtime.h @@ -0,0 +1,1132 @@ +// Copyright 2009 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. 
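The tickspersecond function in runtime.go above calibrates the CPU tick counter against the wall clock: it samples both clocks around a roughly 100ms sleep and caches ticks-per-second as Δticks * 1e9 / Δnanoseconds. A user-level Go sketch of that calibration idea follows; readTicks is a placeholder invented for this example (real code would read a hardware cycle counter such as the one cputicks uses), so the printed number is only meaningful with a real counter behind it.

package main

import (
	"fmt"
	"time"
)

// readTicks stands in for a raw cycle counter; here it just reads the clock
// so the example runs anywhere.
func readTicks() int64 { return time.Now().UnixNano() }

// ticksPerSecond mirrors the calibration in tickspersecond above: sample
// both clocks around a short sleep and scale the tick delta to one second.
func ticksPerSecond() int64 {
	t0 := time.Now()
	c0 := readTicks()
	time.Sleep(100 * time.Millisecond)
	t1 := time.Now()
	c1 := readTicks()
	dt := t1.Sub(t0).Nanoseconds()
	if dt == 0 {
		dt = 1
	}
	r := (c1 - c0) * 1e9 / dt
	if r == 0 {
		r = 1
	}
	return r
}

func main() { fmt.Println("ticks/sec ~", ticksPerSecond()) }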
+ +/* + * basic types + */ +typedef signed char int8; +typedef unsigned char uint8; +typedef signed short int16; +typedef unsigned short uint16; +typedef signed int int32; +typedef unsigned int uint32; +typedef signed long long int int64; +typedef unsigned long long int uint64; +typedef float float32; +typedef double float64; + +#ifdef _64BIT +typedef uint64 uintptr; +typedef int64 intptr; +typedef int64 intgo; // Go's int +typedef uint64 uintgo; // Go's uint +#else +typedef uint32 uintptr; +typedef int32 intptr; +typedef int32 intgo; // Go's int +typedef uint32 uintgo; // Go's uint +#endif + +#ifdef _64BITREG +typedef uint64 uintreg; +#else +typedef uint32 uintreg; +#endif + +/* + * get rid of C types + * the / / / forces a syntax error immediately, + * which will show "last name: XXunsigned". + */ +#define unsigned XXunsigned / / / +#define signed XXsigned / / / +#define char XXchar / / / +#define short XXshort / / / +#define int XXint / / / +#define long XXlong / / / +#define float XXfloat / / / +#define double XXdouble / / / + +/* + * defined types + */ +typedef uint8 bool; +typedef uint8 byte; +typedef struct Func Func; +typedef struct G G; +typedef struct Gobuf Gobuf; +typedef struct SudoG SudoG; +typedef struct Mutex Mutex; +typedef struct M M; +typedef struct P P; +typedef struct SchedT SchedT; +typedef struct Note Note; +typedef struct Slice Slice; +typedef struct String String; +typedef struct FuncVal FuncVal; +typedef struct SigTab SigTab; +typedef struct MCache MCache; +typedef struct FixAlloc FixAlloc; +typedef struct Iface Iface; +typedef struct Itab Itab; +typedef struct InterfaceType InterfaceType; +typedef struct Eface Eface; +typedef struct Type Type; +typedef struct PtrType PtrType; +typedef struct ChanType ChanType; +typedef struct MapType MapType; +typedef struct Defer Defer; +typedef struct Panic Panic; +typedef struct Hmap Hmap; +typedef struct Hiter Hiter; +typedef struct Hchan Hchan; +typedef struct Complex64 Complex64; +typedef struct Complex128 Complex128; +typedef struct LibCall LibCall; +typedef struct WinCallbackContext WinCallbackContext; +typedef struct GCStats GCStats; +typedef struct LFNode LFNode; +typedef struct ParFor ParFor; +typedef struct ParForThread ParForThread; +typedef struct CgoMal CgoMal; +typedef struct PollDesc PollDesc; +typedef struct DebugVars DebugVars; +typedef struct ForceGCState ForceGCState; +typedef struct Stack Stack; + +/* + * Per-CPU declaration. + * + * "extern register" is a special storage class implemented by 6c, 8c, etc. + * On the ARM, it is an actual register; elsewhere it is a slot in thread- + * local storage indexed by a pseudo-register TLS. See zasmhdr in + * src/cmd/dist/buildruntime.c for details, and be aware that the linker may + * make further OS-specific changes to the compiler's output. For example, + * 6l/linux rewrites 0(TLS) as -8(FS). + * + * Every C file linked into a Go program must include runtime.h so that the + * C compiler (6c, 8c, etc.) knows to avoid other uses of these dedicated + * registers. The Go compiler (6g, 8g, etc.) knows to avoid them. + */ +extern register G* g; + +/* + * defined constants + */ +enum +{ + // G status + // + // If you add to this list, add to the list + // of "okay during garbage collection" status + // in mgc0.c too. + Gidle, // 0 + Grunnable, // 1 runnable and on a run queue + Grunning, // 2 + Gsyscall, // 3 + Gwaiting, // 4 + Gmoribund_unused, // 5 currently unused, but hardcoded in gdb scripts + Gdead, // 6 + Genqueue, // 7 Only the Gscanenqueue is used. 
+ Gcopystack, // 8 in this state when newstack is moving the stack + // the following encode that the GC is scanning the stack and what to do when it is done + Gscan = 0x1000, // atomicstatus&~Gscan = the non-scan state, + // Gscanidle = Gscan + Gidle, // Not used. Gidle only used with newly malloced gs + Gscanrunnable = Gscan + Grunnable, // 0x1001 When scanning complets make Grunnable (it is already on run queue) + Gscanrunning = Gscan + Grunning, // 0x1002 Used to tell preemption newstack routine to scan preempted stack. + Gscansyscall = Gscan + Gsyscall, // 0x1003 When scanning completes make is Gsyscall + Gscanwaiting = Gscan + Gwaiting, // 0x1004 When scanning completes make it Gwaiting + // Gscanmoribund_unused, // not possible + // Gscandead, // not possible + Gscanenqueue = Gscan + Genqueue, // When scanning completes make it Grunnable and put on runqueue +}; +enum +{ + // P status + Pidle, + Prunning, + Psyscall, + Pgcstop, + Pdead, +}; +enum +{ + true = 1, + false = 0, +}; +enum +{ + PtrSize = sizeof(void*), +}; +/* + * structures + */ +struct Mutex +{ + // Futex-based impl treats it as uint32 key, + // while sema-based impl as M* waitm. + // Used to be a union, but unions break precise GC. + uintptr key; +}; +struct Note +{ + // Futex-based impl treats it as uint32 key, + // while sema-based impl as M* waitm. + // Used to be a union, but unions break precise GC. + uintptr key; +}; +struct String +{ + byte* str; + intgo len; +}; +struct FuncVal +{ + void (*fn)(void); + // variable-size, fn-specific data here +}; +struct Iface +{ + Itab* tab; + void* data; +}; +struct Eface +{ + Type* type; + void* data; +}; +struct Complex64 +{ + float32 real; + float32 imag; +}; +struct Complex128 +{ + float64 real; + float64 imag; +}; + +struct Slice +{ // must not move anything + byte* array; // actual data + uintgo len; // number of elements + uintgo cap; // allocated number of elements +}; +struct Gobuf +{ + // The offsets of sp, pc, and g are known to (hard-coded in) libmach. + uintptr sp; + uintptr pc; + G* g; + void* ctxt; // this has to be a pointer so that GC scans it + uintreg ret; + uintptr lr; +}; +// Known to compiler. +// Changes here must also be made in src/cmd/gc/select.c's selecttype. +struct SudoG +{ + G* g; + uint32* selectdone; + SudoG* next; + SudoG* prev; + void* elem; // data element + int64 releasetime; + int32 nrelease; // -1 for acquire + SudoG* waitlink; // G.waiting list +}; +struct GCStats +{ + // the struct must consist of only uint64's, + // because it is casted to uint64[]. + uint64 nhandoff; + uint64 nhandoffcnt; + uint64 nprocyield; + uint64 nosyield; + uint64 nsleep; +}; + +struct LibCall +{ + uintptr fn; + uintptr n; // number of parameters + uintptr args; // parameters + uintptr r1; // return values + uintptr r2; + uintptr err; // error number +}; + +// describes how to handle callback +struct WinCallbackContext +{ + void* gobody; // Go function to call + uintptr argsize; // callback arguments size (in bytes) + uintptr restorestack; // adjust stack on return by (in bytes) (386 only) + bool cleanstack; +}; + +// Stack describes a Go execution stack. +// The bounds of the stack are exactly [lo, hi), +// with no implicit data structures on either side. +struct Stack +{ + uintptr lo; + uintptr hi; +}; + +struct G +{ + // Stack parameters. + // stack describes the actual stack memory: [stack.lo, stack.hi). + // stackguard0 is the stack pointer compared in the Go stack growth prologue. 
+ // It is stack.lo+StackGuard normally, but can be StackPreempt to trigger a preemption. + // stackguard1 is the stack pointer compared in the C stack growth prologue. + // It is stack.lo+StackGuard on g0 and gsignal stacks. + // It is ~0 on other goroutine stacks, to trigger a call to morestackc (and crash). + Stack stack; // offset known to runtime/cgo + uintptr stackguard0; // offset known to liblink + uintptr stackguard1; // offset known to liblink + + Panic* panic; // innermost panic - offset known to liblink + Defer* defer; // innermost defer + Gobuf sched; + uintptr syscallsp; // if status==Gsyscall, syscallsp = sched.sp to use during gc + uintptr syscallpc; // if status==Gsyscall, syscallpc = sched.pc to use during gc + void* param; // passed parameter on wakeup + uint32 atomicstatus; + int64 goid; + int64 waitsince; // approx time when the G become blocked + String waitreason; // if status==Gwaiting + G* schedlink; + bool issystem; // do not output in stack dump, ignore in deadlock detector + bool preempt; // preemption signal, duplicates stackguard0 = StackPreempt + bool paniconfault; // panic (instead of crash) on unexpected fault address + bool preemptscan; // preempted g does scan for GC + bool gcworkdone; // debug: cleared at begining of gc work phase cycle, set by gcphasework, tested at end of cycle + bool throwsplit; // must not split stack + int8 raceignore; // ignore race detection events + M* m; // for debuggers, but offset not hard-coded + M* lockedm; + int32 sig; + Slice writebuf; + uintptr sigcode0; + uintptr sigcode1; + uintptr sigpc; + uintptr gopc; // pc of go statement that created this goroutine + uintptr racectx; + SudoG* waiting; // sudog structures this G is waiting on (that have a valid elem ptr) + uintptr end[]; +}; + +struct M +{ + G* g0; // goroutine with scheduling stack + Gobuf morebuf; // gobuf arg to morestack + + // Fields not known to debuggers. + uint64 procid; // for debuggers, but offset not hard-coded + G* gsignal; // signal-handling G + uintptr tls[4]; // thread-local storage (for x86 extern register) + void (*mstartfn)(void); + G* curg; // current running goroutine + G* caughtsig; // goroutine running during fatal signal + P* p; // attached P for executing Go code (nil if not executing Go code) + P* nextp; + int32 id; + int32 mallocing; + int32 throwing; + int32 gcing; + int32 locks; + int32 softfloat; + int32 dying; + int32 profilehz; + int32 helpgc; + bool spinning; // M is out of work and is actively looking for work + bool blocked; // M is blocked on a Note + uint32 fastrand; + uint64 ncgocall; // number of cgo calls in total + int32 ncgo; // number of cgo calls currently in progress + CgoMal* cgomal; + Note park; + M* alllink; // on allm + M* schedlink; + uint32 machport; // Return address for Mach IPC (OS X) + MCache* mcache; + G* lockedg; + uintptr createstack[32];// Stack that created this thread. 
+ uint32 freglo[16]; // D[i] lsb and F[i] + uint32 freghi[16]; // D[i] msb and F[i+16] + uint32 fflag; // floating point compare flags + uint32 locked; // tracking for LockOSThread + M* nextwaitm; // next M waiting for lock + uintptr waitsema; // semaphore for parking on locks + uint32 waitsemacount; + uint32 waitsemalock; + GCStats gcstats; + bool needextram; + uint8 traceback; + bool (*waitunlockf)(G*, void*); + void* waitlock; + uintptr scalararg[4]; // scalar argument/return for mcall + void* ptrarg[4]; // pointer argument/return for mcall +#ifdef GOOS_windows + uintptr thread; // thread handle + // these are here because they are too large to be on the stack + // of low-level NOSPLIT functions. + LibCall libcall; + uintptr libcallpc; // for cpu profiler + uintptr libcallsp; + G* libcallg; +#endif +#ifdef GOOS_solaris + int32* perrno; // pointer to TLS errno + // these are here because they are too large to be on the stack + // of low-level NOSPLIT functions. + LibCall libcall; + struct MTs { + int64 tv_sec; + int64 tv_nsec; + } ts; + struct MScratch { + uintptr v[6]; + } scratch; +#endif +#ifdef GOOS_plan9 + int8* notesig; + byte* errstr; +#endif + uintptr end[]; +}; + +struct P +{ + Mutex lock; + + int32 id; + uint32 status; // one of Pidle/Prunning/... + P* link; + uint32 schedtick; // incremented on every scheduler call + uint32 syscalltick; // incremented on every system call + M* m; // back-link to associated M (nil if idle) + MCache* mcache; + Defer* deferpool[5]; // pool of available Defer structs of different sizes (see panic.c) + + // Cache of goroutine ids, amortizes accesses to runtime·sched.goidgen. + uint64 goidcache; + uint64 goidcacheend; + + // Queue of runnable goroutines. + uint32 runqhead; + uint32 runqtail; + G* runq[256]; + + // Available G's (status == Gdead) + G* gfree; + int32 gfreecnt; + + byte pad[64]; +}; + +enum { + // The max value of GOMAXPROCS. + // There are no fundamental restrictions on the value. + MaxGomaxprocs = 1<<8, +}; + +struct SchedT +{ + Mutex lock; + + uint64 goidgen; + + M* midle; // idle m's waiting for work + int32 nmidle; // number of idle m's waiting for work + int32 nmidlelocked; // number of locked m's waiting for work + int32 mcount; // number of m's that have been created + int32 maxmcount; // maximum number of m's allowed (or die) + + P* pidle; // idle P's + uint32 npidle; + uint32 nmspinning; + + // Global runnable queue. + G* runqhead; + G* runqtail; + int32 runqsize; + + // Global cache of dead G's. + Mutex gflock; + G* gfree; + int32 ngfree; + + uint32 gcwaiting; // gc is waiting to run + int32 stopwait; + Note stopnote; + uint32 sysmonwait; + Note sysmonnote; + uint64 lastpoll; + + int32 profilehz; // cpu profiling rate +}; + +// The m->locked word holds two pieces of state counting active calls to LockOSThread/lockOSThread. +// The low bit (LockExternal) is a boolean reporting whether any LockOSThread call is active. +// External locks are not recursive; a second lock is silently ignored. +// The upper bits of m->lockedcount record the nesting depth of calls to lockOSThread +// (counting up by LockInternal), popped by unlockOSThread (counting down by LockInternal). +// Internal locks can be recursive. For instance, a lock for cgo can occur while the main +// goroutine is holding the lock during the initialization phase. 
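A small illustration of the encoding described above may help: the low bit is a boolean for external locks, while internal locks add whole units of LockInternal, so the upper bits form a nesting counter. The constants mirror the LockExternal/LockInternal values defined just below; the variable and helper arithmetic are made up for this sketch.

package main

import "fmt"

const (
	lockExternal = 1 // low bit: some LockOSThread call from user code is active
	lockInternal = 2 // each internal lock adds one unit in the upper bits
)

func main() {
	var locked uint32

	locked |= lockExternal // external lock: a boolean, not a counter
	locked += lockInternal // one internal lock...
	locked += lockInternal // ...nested inside another

	fmt.Println("external locked:", locked&lockExternal != 0) // true
	fmt.Println("internal depth: ", locked/lockInternal)      // 2
	fmt.Println("any lock held:  ", locked != 0)              // true
}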
+enum +{ + LockExternal = 1, + LockInternal = 2, +}; + +struct SigTab +{ + int32 flags; + int8 *name; +}; +enum +{ + SigNotify = 1<<0, // let signal.Notify have signal, even if from kernel + SigKill = 1<<1, // if signal.Notify doesn't take it, exit quietly + SigThrow = 1<<2, // if signal.Notify doesn't take it, exit loudly + SigPanic = 1<<3, // if the signal is from the kernel, panic + SigDefault = 1<<4, // if the signal isn't explicitly requested, don't monitor it + SigHandling = 1<<5, // our signal handler is registered + SigIgnored = 1<<6, // the signal was ignored before we registered for it + SigGoExit = 1<<7, // cause all runtime procs to exit (only used on Plan 9). +}; + +// Layout of in-memory per-function information prepared by linker +// See http://golang.org/s/go12symtab. +// Keep in sync with linker and with ../../libmach/sym.c +// and with package debug/gosym and with symtab.go in package runtime. +struct Func +{ + uintptr entry; // start pc + int32 nameoff;// function name + + int32 args; // in/out args size + int32 frame; // legacy frame size; use pcsp if possible + + int32 pcsp; + int32 pcfile; + int32 pcln; + int32 npcdata; + int32 nfuncdata; +}; + +// layout of Itab known to compilers +// allocated in non-garbage-collected memory +struct Itab +{ + InterfaceType* inter; + Type* type; + Itab* link; + int32 bad; + int32 unused; + void (*fun[])(void); +}; + +#ifdef GOOS_nacl +enum { + NaCl = 1, +}; +#else +enum { + NaCl = 0, +}; +#endif + +#ifdef GOOS_windows +enum { + Windows = 1 +}; +#else +enum { + Windows = 0 +}; +#endif +#ifdef GOOS_solaris +enum { + Solaris = 1 +}; +#else +enum { + Solaris = 0 +}; +#endif +#ifdef GOOS_plan9 +enum { + Plan9 = 1 +}; +#else +enum { + Plan9 = 0 +}; +#endif + +// Lock-free stack node. +struct LFNode +{ + LFNode *next; + uintptr pushcnt; +}; + +// Parallel for descriptor. +struct ParFor +{ + void (*body)(ParFor*, uint32); // executed for each element + uint32 done; // number of idle threads + uint32 nthr; // total number of threads + uint32 nthrmax; // maximum number of threads + uint32 thrseq; // thread id sequencer + uint32 cnt; // iteration space [0, cnt) + void *ctx; // arbitrary user context + bool wait; // if true, wait while all threads finish processing, + // otherwise parfor may return while other threads are still working + ParForThread *thr; // array of thread descriptors + uint32 pad; // to align ParForThread.pos for 64-bit atomic operations + // stats + uint64 nsteal; + uint64 nstealcnt; + uint64 nprocyield; + uint64 nosyield; + uint64 nsleep; +}; + +// Track memory allocated by code not written in Go during a cgo call, +// so that the garbage collector can see them. +struct CgoMal +{ + CgoMal *next; + void *alloc; +}; + +// Holds variables parsed from GODEBUG env var. +struct DebugVars +{ + int32 allocfreetrace; + int32 efence; + int32 gctrace; + int32 gcdead; + int32 scheddetail; + int32 schedtrace; + int32 scavenge; +}; + +// Indicates to write barrier and sychronization task to preform. +enum +{ // Synchronization Write barrier + GCoff, // stop and start nop + GCquiesce, // stop and start nop + GCstw, // stop the ps nop + GCmark, // scan the stacks and start no white to black + GCsweep, // stop and start nop +}; + +struct ForceGCState +{ + Mutex lock; + G* g; + uint32 idle; +}; + +extern uint32 runtime·gcphase; + +/* + * defined macros + * you need super-gopher-guru privilege + * to add this list. 
+ */ +#define nelem(x) (sizeof(x)/sizeof((x)[0])) +#define nil ((void*)0) +#define offsetof(s,m) (uint32)(&(((s*)0)->m)) +#define ROUND(x, n) (((x)+(n)-1)&~(uintptr)((n)-1)) /* all-caps to mark as macro: it evaluates n twice */ + +/* + * known to compiler + */ +enum { + Structrnd = sizeof(uintreg), +}; + +byte* runtime·startup_random_data; +uint32 runtime·startup_random_data_len; + +int32 runtime·invalidptr; + +enum { + // hashinit wants this many random bytes + HashRandomBytes = 32 +}; + +uint32 runtime·readgstatus(G*); +void runtime·casgstatus(G*, uint32, uint32); +void runtime·casgstatus(G*, uint32, uint32); +uint32 runtime·casgcopystack(G*); +void runtime·quiesce(G*); +bool runtime·stopg(G*); +void runtime·restartg(G*); +void runtime·gcphasework(G*); + +/* + * deferred subroutine calls + */ +struct Defer +{ + int32 siz; + bool started; + uintptr argp; // where args were copied from + uintptr pc; + FuncVal* fn; + Panic* panic; // panic that is running defer + Defer* link; +}; + +// argp used in Defer structs when there is no argp. +#define NoArgs ((uintptr)-1) + +/* + * panics + */ +struct Panic +{ + void* argp; // pointer to arguments of deferred call run during panic; cannot move - known to liblink + Eface arg; // argument to panic + Panic* link; // link to earlier panic + bool recovered; // whether this panic is over + bool aborted; // the panic was aborted +}; + +/* + * stack traces + */ +typedef struct Stkframe Stkframe; +typedef struct BitVector BitVector; +struct Stkframe +{ + Func* fn; // function being run + uintptr pc; // program counter within fn + uintptr continpc; // program counter where execution can continue, or 0 if not + uintptr lr; // program counter at caller aka link register + uintptr sp; // stack pointer at pc + uintptr fp; // stack pointer at caller aka frame pointer + uintptr varp; // top of local variables + uintptr argp; // pointer to function arguments + uintptr arglen; // number of bytes at argp + BitVector* argmap; // force use of this argmap +}; + +enum +{ + TraceRuntimeFrames = 1<<0, // include frames for internal runtime functions. 
+ TraceTrap = 1<<1, // the initial PC, SP are from a trap, not a return PC from a call +}; +intgo runtime·gentraceback(uintptr, uintptr, uintptr, G*, intgo, uintptr*, intgo, bool(**)(Stkframe*, void*), void*, uintgo); +void runtime·tracebackdefers(G*, bool(**)(Stkframe*, void*), void*); +void runtime·traceback(uintptr pc, uintptr sp, uintptr lr, G* gp); +void runtime·tracebacktrap(uintptr pc, uintptr sp, uintptr lr, G* gp); +void runtime·tracebackothers(G*); +bool runtime·haszeroargs(uintptr pc); +bool runtime·topofstack(Func*); +enum +{ + // The maximum number of frames we print for a traceback + TracebackMaxFrames = 100, +}; + +/* + * external data + */ +extern String runtime·emptystring; +extern G** runtime·allg; +extern Slice runtime·allgs; // []*G +extern uintptr runtime·allglen; +extern G* runtime·lastg; +extern M* runtime·allm; +extern P* runtime·allp[MaxGomaxprocs+1]; +extern int32 runtime·gomaxprocs; +extern uint32 runtime·needextram; +extern uint32 runtime·panicking; +extern int8* runtime·goos; +extern int32 runtime·ncpu; +extern bool runtime·iscgo; +extern void (*runtime·sysargs)(int32, uint8**); +extern uintptr runtime·maxstring; +extern uint32 runtime·cpuid_ecx; +extern uint32 runtime·cpuid_edx; +extern DebugVars runtime·debug; +extern uintptr runtime·maxstacksize; +extern Note runtime·signote; +extern ForceGCState runtime·forcegc; +extern SchedT runtime·sched; +extern int32 runtime·newprocs; + +/* + * common functions and data + */ +int32 runtime·strcmp(byte*, byte*); +int32 runtime·strncmp(byte*, byte*, uintptr); +byte* runtime·strstr(byte*, byte*); +intgo runtime·findnull(byte*); +intgo runtime·findnullw(uint16*); +void runtime·dump(byte*, int32); +int32 runtime·runetochar(byte*, int32); +int32 runtime·charntorune(int32*, uint8*, int32); + + +/* + * This macro is used when writing C functions + * called as if they were Go functions. + * Passed the address of a result before a return statement, + * it makes sure the result has been flushed to memory + * before the return. + * + * It is difficult to write such functions portably, because + * of the varying requirements on the alignment of the + * first output value. Almost all code should write such + * functions in .goc files, where goc2c (part of cmd/dist) + * can arrange the correct alignment for the target system. + * Goc2c also takes care of conveying to the garbage collector + * which parts of the argument list are inputs vs outputs. + * + * Therefore, do NOT use this macro if at all possible. + */ +#define FLUSH(x) USED(x) + +/* + * GoOutput is a type with the same alignment requirements as the + * initial output argument from a Go function. Only for use in cases + * where using goc2c is not possible. See comment on FLUSH above. 
+ */ +typedef uint64 GoOutput; + +void runtime·gogo(Gobuf*); +void runtime·gostartcall(Gobuf*, void(*)(void), void*); +void runtime·gostartcallfn(Gobuf*, FuncVal*); +void runtime·gosave(Gobuf*); +void runtime·goargs(void); +void runtime·goenvs(void); +void runtime·goenvs_unix(void); +void* runtime·getu(void); +void runtime·throw(int8*); +bool runtime·canpanic(G*); +void runtime·prints(int8*); +void runtime·printf(int8*, ...); +void runtime·snprintf(byte*, int32, int8*, ...); +byte* runtime·mchr(byte*, byte, byte*); +int32 runtime·mcmp(byte*, byte*, uintptr); +void runtime·memmove(void*, void*, uintptr); +String runtime·catstring(String, String); +String runtime·gostring(byte*); +Slice runtime·makeStringSlice(intgo); +String runtime·gostringn(byte*, intgo); +Slice runtime·gobytes(byte*, intgo); +String runtime·gostringnocopy(byte*); +String runtime·gostringw(uint16*); +void runtime·initsig(void); +void runtime·sigenable(uint32 sig); +void runtime·sigdisable(uint32 sig); +int32 runtime·gotraceback(bool *crash); +void runtime·goroutineheader(G*); +int32 runtime·open(int8*, int32, int32); +int32 runtime·read(int32, void*, int32); +int32 runtime·write(uintptr, void*, int32); // use uintptr to accommodate windows. +int32 runtime·close(int32); +int32 runtime·mincore(void*, uintptr, byte*); +void runtime·jmpdefer(FuncVal*, uintptr); +void runtime·exit1(int32); +void runtime·ready(G*); +byte* runtime·getenv(int8*); +int32 runtime·atoi(byte*); +void runtime·newosproc(M *mp, void *stk); +void runtime·mstart(void); +G* runtime·malg(int32); +void runtime·asminit(void); +void runtime·mpreinit(M*); +void runtime·minit(void); +void runtime·unminit(void); +void runtime·signalstack(byte*, int32); +void runtime·tracebackinit(void); +void runtime·symtabinit(void); +Func* runtime·findfunc(uintptr); +int32 runtime·funcline(Func*, uintptr, String*); +int32 runtime·funcspdelta(Func*, uintptr); +int8* runtime·funcname(Func*); +int32 runtime·pcdatavalue(Func*, int32, uintptr); +void runtime·stackinit(void); +Stack runtime·stackalloc(uint32); +void runtime·stackfree(Stack); +void runtime·shrinkstack(G*); +void runtime·shrinkfinish(void); +MCache* runtime·allocmcache(void); +void runtime·freemcache(MCache*); +void runtime·mallocinit(void); +void runtime·gcinit(void); +void* runtime·mallocgc(uintptr size, Type* typ, uint32 flag); +void runtime·runpanic(Panic*); +uintptr runtime·getcallersp(void*); +int32 runtime·mcount(void); +int32 runtime·gcount(void); +void runtime·mcall(void(**)(G*)); +void runtime·onM(void(**)(void)); +void runtime·onMsignal(void(**)(void)); +uint32 runtime·fastrand1(void); +void runtime·rewindmorestack(Gobuf*); +int32 runtime·timediv(int64, int32, int32*); +int32 runtime·round2(int32 x); // round x up to a power of 2. + +// atomic operations +bool runtime·cas(uint32*, uint32, uint32); +bool runtime·cas64(uint64*, uint64, uint64); +bool runtime·casp(void**, void*, void*); +// Don't confuse with XADD x86 instruction, +// this one is actually 'addx', that is, add-and-fetch. 
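The distinction matters when reading code that uses the return value of the xadd declarations just below. Go's sync/atomic package happens to follow the same add-and-fetch convention, which gives a convenient user-level illustration; this is ordinary library code, not the runtime's xadd.

package main

import (
	"fmt"
	"sync/atomic"
)

func main() {
	var n uint32

	// Add-and-fetch: the return value is the new value, as with
	// runtime·xadd declared below.
	got := atomic.AddUint32(&n, 5)
	fmt.Println(got) // 5, not 0

	// A fetch-and-add (x86 XADD style) would have returned the old value;
	// to recover it here, subtract the delta from the result.
	old := atomic.AddUint32(&n, 3) - 3
	fmt.Println(old) // 5
}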
+uint32 runtime·xadd(uint32 volatile*, int32); +uint64 runtime·xadd64(uint64 volatile*, int64); +uint32 runtime·xchg(uint32 volatile*, uint32); +uint64 runtime·xchg64(uint64 volatile*, uint64); +void* runtime·xchgp(void* volatile*, void*); +uint32 runtime·atomicload(uint32 volatile*); +void runtime·atomicstore(uint32 volatile*, uint32); +void runtime·atomicstore64(uint64 volatile*, uint64); +uint64 runtime·atomicload64(uint64 volatile*); +void* runtime·atomicloadp(void* volatile*); +uintptr runtime·atomicloaduintptr(uintptr volatile*); +void runtime·atomicstorep(void* volatile*, void*); +void runtime·atomicstoreuintptr(uintptr volatile*, uintptr); +void runtime·atomicor8(byte volatile*, byte); + +void runtime·setg(G*); +void runtime·newextram(void); +void runtime·exit(int32); +void runtime·breakpoint(void); +void runtime·gosched_m(G*); +void runtime·schedtrace(bool); +void runtime·park(bool(*)(G*, void*), void*, String); +void runtime·parkunlock(Mutex*, String); +void runtime·tsleep(int64, String); +M* runtime·newm(void); +void runtime·goexit(void); +void runtime·asmcgocall(void (*fn)(void*), void*); +int32 runtime·asmcgocall_errno(void (*fn)(void*), void*); +void runtime·entersyscall(void); +void runtime·reentersyscall(uintptr, uintptr); +void runtime·entersyscallblock(void); +void runtime·exitsyscall(void); +G* runtime·newproc1(FuncVal*, byte*, int32, int32, void*); +bool runtime·sigsend(int32 sig); +intgo runtime·callers(intgo, uintptr*, intgo); +intgo runtime·gcallers(G*, intgo, uintptr*, intgo); +int64 runtime·nanotime(void); // monotonic time +int64 runtime·unixnanotime(void); // real time, can skip +void runtime·dopanic(int32); +void runtime·startpanic(void); +void runtime·freezetheworld(void); +void runtime·sigprof(uint8 *pc, uint8 *sp, uint8 *lr, G *gp, M *mp); +void runtime·resetcpuprofiler(int32); +void runtime·setcpuprofilerate(int32); +void runtime·usleep(uint32); +int64 runtime·cputicks(void); +int64 runtime·tickspersecond(void); +void runtime·blockevent(int64, intgo); +G* runtime·netpoll(bool); +void runtime·netpollready(G**, PollDesc*, int32); +uintptr runtime·netpollfd(PollDesc*); +void** runtime·netpolluser(PollDesc*); +bool runtime·netpollclosing(PollDesc*); +void runtime·netpolllock(PollDesc*); +void runtime·netpollunlock(PollDesc*); +void runtime·crash(void); +void runtime·parsedebugvars(void); +void* runtime·funcdata(Func*, int32); +void runtime·setmaxthreads_m(void); +G* runtime·timejump(void); +void runtime·iterate_itabs(void (**callback)(Itab*)); +void runtime·iterate_finq(void (*callback)(FuncVal*, byte*, uintptr, Type*, PtrType*)); + +#pragma varargck argpos runtime·printf 1 +#pragma varargck type "c" int32 +#pragma varargck type "d" int32 +#pragma varargck type "d" uint32 +#pragma varargck type "D" int64 +#pragma varargck type "D" uint64 +#pragma varargck type "x" int32 +#pragma varargck type "x" uint32 +#pragma varargck type "X" int64 +#pragma varargck type "X" uint64 +#pragma varargck type "p" void* +#pragma varargck type "p" uintptr +#pragma varargck type "s" int8* +#pragma varargck type "s" uint8* +#pragma varargck type "S" String + +void runtime·stoptheworld(void); +void runtime·starttheworld(void); +extern uint32 runtime·worldsema; + +/* + * mutual exclusion locks. in the uncontended case, + * as fast as spin locks (just a few user-level instructions), + * but on the contention path they sleep in the kernel. + * a zeroed Mutex is unlocked (no need to initialize each lock). 
+ */ +void runtime·lock(Mutex*); +void runtime·unlock(Mutex*); + +/* + * sleep and wakeup on one-time events. + * before any calls to notesleep or notewakeup, + * must call noteclear to initialize the Note. + * then, exactly one thread can call notesleep + * and exactly one thread can call notewakeup (once). + * once notewakeup has been called, the notesleep + * will return. future notesleep will return immediately. + * subsequent noteclear must be called only after + * previous notesleep has returned, e.g. it's disallowed + * to call noteclear straight after notewakeup. + * + * notetsleep is like notesleep but wakes up after + * a given number of nanoseconds even if the event + * has not yet happened. if a goroutine uses notetsleep to + * wake up early, it must wait to call noteclear until it + * can be sure that no other goroutine is calling + * notewakeup. + * + * notesleep/notetsleep are generally called on g0, + * notetsleepg is similar to notetsleep but is called on user g. + */ +void runtime·noteclear(Note*); +void runtime·notesleep(Note*); +void runtime·notewakeup(Note*); +bool runtime·notetsleep(Note*, int64); // false - timeout +bool runtime·notetsleepg(Note*, int64); // false - timeout + +/* + * low-level synchronization for implementing the above + */ +uintptr runtime·semacreate(void); +int32 runtime·semasleep(int64); +void runtime·semawakeup(M*); +// or +void runtime·futexsleep(uint32*, uint32, int64); +void runtime·futexwakeup(uint32*, uint32); + +/* + * Mutex-free stack. + * Initialize uint64 head to 0, compare with 0 to test for emptiness. + * The stack does not keep pointers to nodes, + * so they can be garbage collected if there are no other pointers to nodes. + */ +void runtime·lfstackpush(uint64 *head, LFNode *node); +LFNode* runtime·lfstackpop(uint64 *head); + +/* + * Parallel for over [0, n). + * body() is executed for each iteration. + * nthr - total number of worker threads. + * ctx - arbitrary user context. + * if wait=true, threads return from parfor() when all work is done; + * otherwise, threads can return while other threads are still finishing processing. + */ +ParFor* runtime·parforalloc(uint32 nthrmax); +void runtime·parforsetup(ParFor *desc, uint32 nthr, uint32 n, void *ctx, bool wait, void (*body)(ParFor*, uint32)); +void runtime·parfordo(ParFor *desc); +void runtime·parforiters(ParFor*, uintptr, uintptr*, uintptr*); + +/* + * low level C-called + */ +// for mmap, we only pass the lower 32 bits of file offset to the +// assembly routine; the higher bits (if required), should be provided +// by the assembly routine as 0. 
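The one-time-event protocol described for Note above (clear once, at most one sleeper and one wakeup, later sleeps return immediately, notetsleep adds a timeout) maps naturally onto a closed channel in ordinary Go, which may help when reading code that uses Notes. This is only a user-level analogy with made-up names; the runtime versions are built on futexes or semaphores, not channels.

package main

import (
	"fmt"
	"time"
)

// note mimics the protocol described above: clear, then at most one sleeper
// and one wakeup; after the wakeup, further sleeps return immediately.
type note struct{ done chan struct{} }

func (n *note) clear()  { n.done = make(chan struct{}) }
func (n *note) wakeup() { close(n.done) }
func (n *note) sleep()  { <-n.done }

// tsleep is the notetsleep analogue: wake after d even if the event has not
// happened; it reports whether the event (rather than the timeout) occurred.
func (n *note) tsleep(d time.Duration) bool {
	select {
	case <-n.done:
		return true
	case <-time.After(d):
		return false
	}
}

func main() {
	var n note
	n.clear()
	go func() {
		time.Sleep(10 * time.Millisecond)
		n.wakeup()
	}()
	n.sleep()
	n.sleep() // returns at once: the note stays signaled until cleared again
	fmt.Println(n.tsleep(time.Millisecond)) // true: already signaled
}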
+uint8* runtime·mmap(byte*, uintptr, int32, int32, int32, uint32); +void runtime·munmap(byte*, uintptr); +void runtime·madvise(byte*, uintptr, int32); +void runtime·memclr(byte*, uintptr); +void runtime·setcallerpc(void*, void*); +void* runtime·getcallerpc(void*); +void runtime·printbool(bool); +void runtime·printbyte(int8); +void runtime·printfloat(float64); +void runtime·printint(int64); +void runtime·printiface(Iface); +void runtime·printeface(Eface); +void runtime·printstring(String); +void runtime·printpc(void*); +void runtime·printpointer(void*); +void runtime·printuint(uint64); +void runtime·printhex(uint64); +void runtime·printslice(Slice); +void runtime·printcomplex(Complex128); + +/* + * runtime go-called + */ +void runtime·gopanic(Eface); +void runtime·panicindex(void); +void runtime·panicslice(void); +void runtime·panicdivide(void); + +/* + * runtime c-called (but written in Go) + */ +void runtime·printany(Eface); +void runtime·newTypeAssertionError(String*, String*, String*, String*, Eface*); +void runtime·fadd64c(uint64, uint64, uint64*); +void runtime·fsub64c(uint64, uint64, uint64*); +void runtime·fmul64c(uint64, uint64, uint64*); +void runtime·fdiv64c(uint64, uint64, uint64*); +void runtime·fneg64c(uint64, uint64*); +void runtime·f32to64c(uint32, uint64*); +void runtime·f64to32c(uint64, uint32*); +void runtime·fcmp64c(uint64, uint64, int32*, bool*); +void runtime·fintto64c(int64, uint64*); +void runtime·f64tointc(uint64, int64*, bool*); + +/* + * wrapped for go users + */ +float64 runtime·Inf(int32 sign); +float64 runtime·NaN(void); +float32 runtime·float32frombits(uint32 i); +uint32 runtime·float32tobits(float32 f); +float64 runtime·float64frombits(uint64 i); +uint64 runtime·float64tobits(float64 f); +float64 runtime·frexp(float64 d, int32 *ep); +bool runtime·isInf(float64 f, int32 sign); +bool runtime·isNaN(float64 f); +float64 runtime·ldexp(float64 d, int32 e); +float64 runtime·modf(float64 d, float64 *ip); +void runtime·semacquire(uint32*, bool); +void runtime·semrelease(uint32*); +int32 runtime·gomaxprocsfunc(int32 n); +void runtime·procyield(uint32); +void runtime·osyield(void); +void runtime·lockOSThread(void); +void runtime·unlockOSThread(void); + +bool runtime·showframe(Func*, G*); +void runtime·printcreatedby(G*); + +void runtime·ifaceE2I(InterfaceType*, Eface, Iface*); +bool runtime·ifaceE2I2(InterfaceType*, Eface, Iface*); +uintptr runtime·memlimit(void); + +// float.c +extern float64 runtime·nan; +extern float64 runtime·posinf; +extern float64 runtime·neginf; +extern uint64 ·nan; +extern uint64 ·posinf; +extern uint64 ·neginf; +#define ISNAN(f) ((f) != (f)) + +enum +{ + UseSpanType = 1, +}; diff --git a/src/runtime/runtime_linux_test.go b/src/runtime/runtime_linux_test.go new file mode 100644 index 000000000..5344ed205 --- /dev/null +++ b/src/runtime/runtime_linux_test.go @@ -0,0 +1,29 @@ +// Copyright 2012 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package runtime_test + +import ( + . "runtime" + "syscall" + "testing" +) + +var pid, tid int + +func init() { + // Record pid and tid of init thread for use during test. + // The call to LockOSThread is just to exercise it; + // we can't test that it does anything. + // Instead we're testing that the conditions are good + // for how it is used in init (must be on main thread). 
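The pattern whose preconditions this init is checking (pid == tid, i.e. init runs on the process's initial thread) is the usual way LockOSThread is combined with init: locking there wires the main goroutine to the main OS thread, which libraries that must run on that thread rely on. A minimal sketch of that pattern, separate from the test itself:

package main

import (
	"fmt"
	"runtime"
)

func init() {
	// Locking during init happens before any other user goroutines run,
	// so the main goroutine stays on the thread it started on.
	runtime.LockOSThread()
}

func main() {
	defer runtime.UnlockOSThread()
	fmt.Println("main goroutine is wired to its OS thread")
}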
+ pid, tid = syscall.Getpid(), syscall.Gettid() + LockOSThread() +} + +func TestLockOSThread(t *testing.T) { + if pid != tid { + t.Fatalf("pid=%d but tid=%d", pid, tid) + } +} diff --git a/src/runtime/runtime_test.go b/src/runtime/runtime_test.go new file mode 100644 index 000000000..1688364a8 --- /dev/null +++ b/src/runtime/runtime_test.go @@ -0,0 +1,249 @@ +// Copyright 2012 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package runtime_test + +import ( + "io" + "io/ioutil" + "os" + "os/exec" + . "runtime" + "runtime/debug" + "strconv" + "strings" + "testing" + "unsafe" +) + +var errf error + +func errfn() error { + return errf +} + +func errfn1() error { + return io.EOF +} + +func BenchmarkIfaceCmp100(b *testing.B) { + for i := 0; i < b.N; i++ { + for j := 0; j < 100; j++ { + if errfn() == io.EOF { + b.Fatal("bad comparison") + } + } + } +} + +func BenchmarkIfaceCmpNil100(b *testing.B) { + for i := 0; i < b.N; i++ { + for j := 0; j < 100; j++ { + if errfn1() == nil { + b.Fatal("bad comparison") + } + } + } +} + +func BenchmarkDefer(b *testing.B) { + for i := 0; i < b.N; i++ { + defer1() + } +} + +func defer1() { + defer func(x, y, z int) { + if recover() != nil || x != 1 || y != 2 || z != 3 { + panic("bad recover") + } + }(1, 2, 3) + return +} + +func BenchmarkDefer10(b *testing.B) { + for i := 0; i < b.N/10; i++ { + defer2() + } +} + +func defer2() { + for i := 0; i < 10; i++ { + defer func(x, y, z int) { + if recover() != nil || x != 1 || y != 2 || z != 3 { + panic("bad recover") + } + }(1, 2, 3) + } +} + +func BenchmarkDeferMany(b *testing.B) { + for i := 0; i < b.N; i++ { + defer func(x, y, z int) { + if recover() != nil || x != 1 || y != 2 || z != 3 { + panic("bad recover") + } + }(1, 2, 3) + } +} + +// The profiling signal handler needs to know whether it is executing runtime.gogo. +// The constant RuntimeGogoBytes in arch_*.h gives the size of the function; +// we don't have a way to obtain it from the linker (perhaps someday). +// Test that the constant matches the size determined by 'go tool nm -S'. +// The value reported will include the padding between runtime.gogo and the +// next function in memory. That's fine. +func TestRuntimeGogoBytes(t *testing.T) { + switch GOOS { + case "android", "nacl": + t.Skipf("skipping on %s", GOOS) + } + + dir, err := ioutil.TempDir("", "go-build") + if err != nil { + t.Fatalf("failed to create temp directory: %v", err) + } + defer os.RemoveAll(dir) + + out, err := exec.Command("go", "build", "-o", dir+"/hello", "../../test/helloworld.go").CombinedOutput() + if err != nil { + t.Fatalf("building hello world: %v\n%s", err, out) + } + + out, err = exec.Command("go", "tool", "nm", "-size", dir+"/hello").CombinedOutput() + if err != nil { + t.Fatalf("go tool nm: %v\n%s", err, out) + } + + for _, line := range strings.Split(string(out), "\n") { + f := strings.Fields(line) + if len(f) == 4 && f[3] == "runtime.gogo" { + size, _ := strconv.Atoi(f[1]) + if GogoBytes() != int32(size) { + t.Fatalf("RuntimeGogoBytes = %d, should be %d", GogoBytes(), size) + } + return + } + } + + t.Fatalf("go tool nm did not report size for runtime.gogo") +} + +// golang.org/issue/7063 +func TestStopCPUProfilingWithProfilerOff(t *testing.T) { + SetCPUProfileRate(0) +} + +// Addresses to test for faulting behavior. +// This is less a test of SetPanicOnFault and more a check that +// the operating system and the runtime can process these faults +// correctly. 
That is, we're indirectly testing that without SetPanicOnFault +// these would manage to turn into ordinary crashes. +// Note that these are truncated on 32-bit systems, so the bottom 32 bits +// of the larger addresses must themselves be invalid addresses. +// We might get unlucky and the OS might have mapped one of these +// addresses, but probably not: they're all in the first page, very high +// adderesses that normally an OS would reserve for itself, or malformed +// addresses. Even so, we might have to remove one or two on different +// systems. We will see. + +var faultAddrs = []uint64{ + // low addresses + 0, + 1, + 0xfff, + // high (kernel) addresses + // or else malformed. + 0xffffffffffffffff, + 0xfffffffffffff001, + 0xffffffffffff0001, + 0xfffffffffff00001, + 0xffffffffff000001, + 0xfffffffff0000001, + 0xffffffff00000001, + 0xfffffff000000001, + 0xffffff0000000001, + 0xfffff00000000001, + 0xffff000000000001, + 0xfff0000000000001, + 0xff00000000000001, + 0xf000000000000001, + 0x8000000000000001, +} + +func TestSetPanicOnFault(t *testing.T) { + // This currently results in a fault in the signal trampoline on + // dragonfly/386 - see issue 7421. + if GOOS == "dragonfly" && GOARCH == "386" { + t.Skip("skipping test on dragonfly/386") + } + + old := debug.SetPanicOnFault(true) + defer debug.SetPanicOnFault(old) + + nfault := 0 + for _, addr := range faultAddrs { + testSetPanicOnFault(t, uintptr(addr), &nfault) + } + if nfault == 0 { + t.Fatalf("none of the addresses faulted") + } +} + +func testSetPanicOnFault(t *testing.T, addr uintptr, nfault *int) { + if GOOS == "nacl" { + t.Skip("nacl doesn't seem to fault on high addresses") + } + + defer func() { + if err := recover(); err != nil { + *nfault++ + } + }() + + // The read should fault, except that sometimes we hit + // addresses that have had C or kernel pages mapped there + // readable by user code. So just log the content. + // If no addresses fault, we'll fail the test. + v := *(*byte)(unsafe.Pointer(addr)) + t.Logf("addr %#x: %#x\n", addr, v) +} + +func eqstring_generic(s1, s2 string) bool { + if len(s1) != len(s2) { + return false + } + // optimization in assembly versions: + // if s1.str == s2.str { return true } + for i := 0; i < len(s1); i++ { + if s1[i] != s2[i] { + return false + } + } + return true +} + +func TestEqString(t *testing.T) { + // This isn't really an exhaustive test of eqstring, it's + // just a convenient way of documenting (via eqstring_generic) + // what eqstring does. + s := []string{ + "", + "a", + "c", + "aaa", + "ccc", + "cccc"[:3], // same contents, different string + "1234567890", + } + for _, s1 := range s { + for _, s2 := range s { + x := s1 == s2 + y := eqstring_generic(s1, s2) + if x != y { + t.Errorf(`eqstring("%s","%s") = %t, want %t`, s1, s2, x, y) + } + } + } +} diff --git a/src/runtime/runtime_unix_test.go b/src/runtime/runtime_unix_test.go new file mode 100644 index 000000000..963de8cdb --- /dev/null +++ b/src/runtime/runtime_unix_test.go @@ -0,0 +1,56 @@ +// Copyright 2013 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// Only works on systems with syscall.Close. +// We need a fast system call to provoke the race, +// and Close(-1) is nearly universally fast. 
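The fault-address test above relies on runtime/debug.SetPanicOnFault turning an otherwise fatal memory fault into a recoverable panic. A minimal standalone sketch of that behavior follows; the chosen address is assumed to be unmapped, which, as the test's comments note, is usually but not always the case.

package main

import (
	"fmt"
	"runtime/debug"
	"unsafe"
)

func main() {
	old := debug.SetPanicOnFault(true)
	defer debug.SetPanicOnFault(old)

	defer func() {
		if err := recover(); err != nil {
			fmt.Println("recovered from fault:", err)
		}
	}()

	// Read from an address that is almost certainly unmapped; with
	// SetPanicOnFault(true) this panics instead of crashing the process.
	p := (*byte)(unsafe.Pointer(uintptr(1)))
	fmt.Println(*p)
}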
+ +// +build darwin dragonfly freebsd linux netbsd openbsd plan9 + +package runtime_test + +import ( + "runtime" + "sync" + "sync/atomic" + "syscall" + "testing" +) + +func TestGoroutineProfile(t *testing.T) { + // GoroutineProfile used to use the wrong starting sp for + // goroutines coming out of system calls, causing possible + // crashes. + defer runtime.GOMAXPROCS(runtime.GOMAXPROCS(100)) + + var stop uint32 + defer atomic.StoreUint32(&stop, 1) // in case of panic + + var wg sync.WaitGroup + for i := 0; i < 4; i++ { + wg.Add(1) + go func() { + for atomic.LoadUint32(&stop) == 0 { + syscall.Close(-1) + } + wg.Done() + }() + } + + max := 10000 + if testing.Short() { + max = 100 + } + stk := make([]runtime.StackRecord, 100) + for n := 0; n < max; n++ { + _, ok := runtime.GoroutineProfile(stk) + if !ok { + t.Fatalf("GoroutineProfile failed") + } + } + + // If the program didn't crash, we passed. + atomic.StoreUint32(&stop, 1) + wg.Wait() +} diff --git a/src/runtime/select.go b/src/runtime/select.go new file mode 100644 index 000000000..f735a71e2 --- /dev/null +++ b/src/runtime/select.go @@ -0,0 +1,651 @@ +// Copyright 2009 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package runtime + +// This file contains the implementation of Go select statements. + +import "unsafe" + +const ( + debugSelect = false +) + +var ( + chansendpc = funcPC(chansend) + chanrecvpc = funcPC(chanrecv) +) + +func selectsize(size uintptr) uintptr { + selsize := unsafe.Sizeof(_select{}) + + (size-1)*unsafe.Sizeof(_select{}.scase[0]) + + size*unsafe.Sizeof(*_select{}.lockorder) + + size*unsafe.Sizeof(*_select{}.pollorder) + return round(selsize, _Int64Align) +} + +func newselect(sel *_select, selsize int64, size int32) { + if selsize != int64(selectsize(uintptr(size))) { + print("runtime: bad select size ", selsize, ", want ", selectsize(uintptr(size)), "\n") + gothrow("bad select size") + } + sel.tcase = uint16(size) + sel.ncase = 0 + sel.lockorder = (**hchan)(add(unsafe.Pointer(&sel.scase), uintptr(size)*unsafe.Sizeof(_select{}.scase[0]))) + sel.pollorder = (*uint16)(add(unsafe.Pointer(sel.lockorder), uintptr(size)*unsafe.Sizeof(*_select{}.lockorder))) + + if debugSelect { + print("newselect s=", sel, " size=", size, "\n") + } +} + +//go:nosplit +func selectsend(sel *_select, c *hchan, elem unsafe.Pointer) (selected bool) { + // nil cases do not compete + if c != nil { + selectsendImpl(sel, c, getcallerpc(unsafe.Pointer(&sel)), elem, uintptr(unsafe.Pointer(&selected))-uintptr(unsafe.Pointer(&sel))) + } + return +} + +// cut in half to give stack a chance to split +func selectsendImpl(sel *_select, c *hchan, pc uintptr, elem unsafe.Pointer, so uintptr) { + i := sel.ncase + if i >= sel.tcase { + gothrow("selectsend: too many cases") + } + sel.ncase = i + 1 + cas := (*scase)(add(unsafe.Pointer(&sel.scase), uintptr(i)*unsafe.Sizeof(sel.scase[0]))) + + cas.pc = pc + cas._chan = c + cas.so = uint16(so) + cas.kind = _CaseSend + cas.elem = elem + + if debugSelect { + print("selectsend s=", sel, " pc=", hex(cas.pc), " chan=", cas._chan, " so=", cas.so, "\n") + } +} + +//go:nosplit +func selectrecv(sel *_select, c *hchan, elem unsafe.Pointer) (selected bool) { + // nil cases do not compete + if c != nil { + selectrecvImpl(sel, c, getcallerpc(unsafe.Pointer(&sel)), elem, nil, uintptr(unsafe.Pointer(&selected))-uintptr(unsafe.Pointer(&sel))) + } + return +} + +//go:nosplit +func selectrecv2(sel *_select, c *hchan, elem 
unsafe.Pointer, received *bool) (selected bool) { + // nil cases do not compete + if c != nil { + selectrecvImpl(sel, c, getcallerpc(unsafe.Pointer(&sel)), elem, received, uintptr(unsafe.Pointer(&selected))-uintptr(unsafe.Pointer(&sel))) + } + return +} + +func selectrecvImpl(sel *_select, c *hchan, pc uintptr, elem unsafe.Pointer, received *bool, so uintptr) { + i := sel.ncase + if i >= sel.tcase { + gothrow("selectrecv: too many cases") + } + sel.ncase = i + 1 + cas := (*scase)(add(unsafe.Pointer(&sel.scase), uintptr(i)*unsafe.Sizeof(sel.scase[0]))) + cas.pc = pc + cas._chan = c + cas.so = uint16(so) + cas.kind = _CaseRecv + cas.elem = elem + cas.receivedp = received + + if debugSelect { + print("selectrecv s=", sel, " pc=", hex(cas.pc), " chan=", cas._chan, " so=", cas.so, "\n") + } +} + +//go:nosplit +func selectdefault(sel *_select) (selected bool) { + selectdefaultImpl(sel, getcallerpc(unsafe.Pointer(&sel)), uintptr(unsafe.Pointer(&selected))-uintptr(unsafe.Pointer(&sel))) + return +} + +func selectdefaultImpl(sel *_select, callerpc uintptr, so uintptr) { + i := sel.ncase + if i >= sel.tcase { + gothrow("selectdefault: too many cases") + } + sel.ncase = i + 1 + cas := (*scase)(add(unsafe.Pointer(&sel.scase), uintptr(i)*unsafe.Sizeof(sel.scase[0]))) + cas.pc = callerpc + cas._chan = nil + cas.so = uint16(so) + cas.kind = _CaseDefault + + if debugSelect { + print("selectdefault s=", sel, " pc=", hex(cas.pc), " so=", cas.so, "\n") + } +} + +func sellock(sel *_select) { + lockslice := sliceStruct{unsafe.Pointer(sel.lockorder), int(sel.ncase), int(sel.ncase)} + lockorder := *(*[]*hchan)(unsafe.Pointer(&lockslice)) + var c *hchan + for _, c0 := range lockorder { + if c0 != nil && c0 != c { + c = c0 + lock(&c.lock) + } + } +} + +func selunlock(sel *_select) { + // We must be very careful here to not touch sel after we have unlocked + // the last lock, because sel can be freed right after the last unlock. + // Consider the following situation. + // First M calls runtime·park() in runtime·selectgo() passing the sel. + // Once runtime·park() has unlocked the last lock, another M makes + // the G that calls select runnable again and schedules it for execution. + // When the G runs on another M, it locks all the locks and frees sel. + // Now if the first M touches sel, it will access freed memory. + n := int(sel.ncase) + r := 0 + lockslice := sliceStruct{unsafe.Pointer(sel.lockorder), n, n} + lockorder := *(*[]*hchan)(unsafe.Pointer(&lockslice)) + // skip the default case + if n > 0 && lockorder[0] == nil { + r = 1 + } + for i := n - 1; i >= r; i-- { + c := lockorder[i] + if i > 0 && c == lockorder[i-1] { + continue // will unlock it on the next iteration + } + unlock(&c.lock) + } +} + +func selparkcommit(gp *g, sel *_select) bool { + selunlock(sel) + return true +} + +func block() { + gopark(nil, nil, "select (no cases)") // forever +} + +// overwrites return pc on stack to signal which case of the select +// to run, so cannot appear at the top of a split stack. +//go:nosplit +func selectgo(sel *_select) { + pc, offset := selectgoImpl(sel) + *(*bool)(add(unsafe.Pointer(&sel), uintptr(offset))) = true + setcallerpc(unsafe.Pointer(&sel), pc) +} + +// selectgoImpl returns scase.pc and scase.so for the select +// case which fired. 
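As an aside before selectgoImpl, this is the kind of source-level select statement the machinery in this file exists to serve. The runtime call sequence in the comments is a rough sketch of how the compiler lowers such a statement (newselect, then one selectsend/selectrecv per case, then selectgo); it is illustrative, not a literal transcript of the generated code.

package main

import "fmt"

func main() {
	a := make(chan int, 1)
	b := make(chan int, 1)
	b <- 42

	// Lowered roughly as:
	//   newselect(sel, selsize, 3)    // size the descriptor for three cases
	//   selectsend(sel, a, &v)        // register the send case
	//   selectrecv2(sel, b, &x, &ok)  // register the receive case
	//   selectdefault(sel)            // register the default case
	//   selectgo(sel)                 // lock, poll, park if needed, pick a case
	v := 1
	select {
	case a <- v:
		fmt.Println("sent", v)
	case x, ok := <-b:
		fmt.Println("received", x, ok)
	default:
		fmt.Println("nothing ready")
	}
}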
+func selectgoImpl(sel *_select) (uintptr, uint16) { + if debugSelect { + print("select: sel=", sel, "\n") + } + + scaseslice := sliceStruct{unsafe.Pointer(&sel.scase), int(sel.ncase), int(sel.ncase)} + scases := *(*[]scase)(unsafe.Pointer(&scaseslice)) + + var t0 int64 + if blockprofilerate > 0 { + t0 = cputicks() + for i := 0; i < int(sel.ncase); i++ { + scases[i].releasetime = -1 + } + } + + // The compiler rewrites selects that statically have + // only 0 or 1 cases plus default into simpler constructs. + // The only way we can end up with such small sel.ncase + // values here is for a larger select in which most channels + // have been nilled out. The general code handles those + // cases correctly, and they are rare enough not to bother + // optimizing (and needing to test). + + // generate permuted order + pollslice := sliceStruct{unsafe.Pointer(sel.pollorder), int(sel.ncase), int(sel.ncase)} + pollorder := *(*[]uint16)(unsafe.Pointer(&pollslice)) + for i := 0; i < int(sel.ncase); i++ { + pollorder[i] = uint16(i) + } + for i := 1; i < int(sel.ncase); i++ { + o := pollorder[i] + j := int(fastrand1()) % (i + 1) + pollorder[i] = pollorder[j] + pollorder[j] = o + } + + // sort the cases by Hchan address to get the locking order. + // simple heap sort, to guarantee n log n time and constant stack footprint. + lockslice := sliceStruct{unsafe.Pointer(sel.lockorder), int(sel.ncase), int(sel.ncase)} + lockorder := *(*[]*hchan)(unsafe.Pointer(&lockslice)) + for i := 0; i < int(sel.ncase); i++ { + j := i + c := scases[j]._chan + for j > 0 && lockorder[(j-1)/2].sortkey() < c.sortkey() { + k := (j - 1) / 2 + lockorder[j] = lockorder[k] + j = k + } + lockorder[j] = c + } + for i := int(sel.ncase) - 1; i >= 0; i-- { + c := lockorder[i] + lockorder[i] = lockorder[0] + j := 0 + for { + k := j*2 + 1 + if k >= i { + break + } + if k+1 < i && lockorder[k].sortkey() < lockorder[k+1].sortkey() { + k++ + } + if c.sortkey() < lockorder[k].sortkey() { + lockorder[j] = lockorder[k] + j = k + continue + } + break + } + lockorder[j] = c + } + /* + for i := 0; i+1 < int(sel.ncase); i++ { + if lockorder[i].sortkey() > lockorder[i+1].sortkey() { + print("i=", i, " x=", lockorder[i], " y=", lockorder[i+1], "\n") + gothrow("select: broken sort") + } + } + */ + + // lock all the channels involved in the select + sellock(sel) + + var ( + gp *g + done uint32 + sg *sudog + c *hchan + k *scase + sglist *sudog + sgnext *sudog + ) + +loop: + // pass 1 - look for something already waiting + var dfl *scase + var cas *scase + for i := 0; i < int(sel.ncase); i++ { + cas = &scases[pollorder[i]] + c = cas._chan + + switch cas.kind { + case _CaseRecv: + if c.dataqsiz > 0 { + if c.qcount > 0 { + goto asyncrecv + } + } else { + sg = c.sendq.dequeue() + if sg != nil { + goto syncrecv + } + } + if c.closed != 0 { + goto rclose + } + + case _CaseSend: + if raceenabled { + racereadpc(unsafe.Pointer(c), cas.pc, chansendpc) + } + if c.closed != 0 { + goto sclose + } + if c.dataqsiz > 0 { + if c.qcount < c.dataqsiz { + goto asyncsend + } + } else { + sg = c.recvq.dequeue() + if sg != nil { + goto syncsend + } + } + + case _CaseDefault: + dfl = cas + } + } + + if dfl != nil { + selunlock(sel) + cas = dfl + goto retc + } + + // pass 2 - enqueue on all chans + gp = getg() + done = 0 + for i := 0; i < int(sel.ncase); i++ { + cas = &scases[pollorder[i]] + c = cas._chan + sg := acquireSudog() + sg.g = gp + // Note: selectdone is adjusted for stack copies in stack.c:adjustsudogs + sg.selectdone = (*uint32)(noescape(unsafe.Pointer(&done))) + 
sg.elem = cas.elem + sg.releasetime = 0 + if t0 != 0 { + sg.releasetime = -1 + } + sg.waitlink = gp.waiting + gp.waiting = sg + + switch cas.kind { + case _CaseRecv: + c.recvq.enqueue(sg) + + case _CaseSend: + c.sendq.enqueue(sg) + } + } + + // wait for someone to wake us up + gp.param = nil + gopark(unsafe.Pointer(funcPC(selparkcommit)), unsafe.Pointer(sel), "select") + + // someone woke us up + sellock(sel) + sg = (*sudog)(gp.param) + gp.param = nil + + // pass 3 - dequeue from unsuccessful chans + // otherwise they stack up on quiet channels + // record the successful case, if any. + // We singly-linked up the SudoGs in case order, so when + // iterating through the linked list they are in reverse order. + cas = nil + sglist = gp.waiting + // Clear all selectdone and elem before unlinking from gp.waiting. + // They must be cleared before being put back into the sudog cache. + // Clear before unlinking, because if a stack copy happens after the unlink, + // they will not be updated, they will be left pointing to the old stack, + // which creates dangling pointers, which may be detected by the + // garbage collector. + for sg1 := gp.waiting; sg1 != nil; sg1 = sg1.waitlink { + sg1.selectdone = nil + sg1.elem = nil + } + gp.waiting = nil + for i := int(sel.ncase) - 1; i >= 0; i-- { + k = &scases[pollorder[i]] + if sglist.releasetime > 0 { + k.releasetime = sglist.releasetime + } + if sg == sglist { + cas = k + } else { + c = k._chan + if k.kind == _CaseSend { + c.sendq.dequeueSudoG(sglist) + } else { + c.recvq.dequeueSudoG(sglist) + } + } + sgnext = sglist.waitlink + sglist.waitlink = nil + releaseSudog(sglist) + sglist = sgnext + } + + if cas == nil { + goto loop + } + + c = cas._chan + + if c.dataqsiz > 0 { + gothrow("selectgo: shouldn't happen") + } + + if debugSelect { + print("wait-return: sel=", sel, " c=", c, " cas=", cas, " kind=", cas.kind, "\n") + } + + if cas.kind == _CaseRecv { + if cas.receivedp != nil { + *cas.receivedp = true + } + } + + if raceenabled { + if cas.kind == _CaseRecv && cas.elem != nil { + raceWriteObjectPC(c.elemtype, cas.elem, cas.pc, chanrecvpc) + } else if cas.kind == _CaseSend { + raceReadObjectPC(c.elemtype, cas.elem, cas.pc, chansendpc) + } + } + + selunlock(sel) + goto retc + +asyncrecv: + // can receive from buffer + if raceenabled { + if cas.elem != nil { + raceWriteObjectPC(c.elemtype, cas.elem, cas.pc, chanrecvpc) + } + raceacquire(chanbuf(c, c.recvx)) + racerelease(chanbuf(c, c.recvx)) + } + if cas.receivedp != nil { + *cas.receivedp = true + } + if cas.elem != nil { + memmove(cas.elem, chanbuf(c, c.recvx), uintptr(c.elemsize)) + } + memclr(chanbuf(c, c.recvx), uintptr(c.elemsize)) + c.recvx++ + if c.recvx == c.dataqsiz { + c.recvx = 0 + } + c.qcount-- + sg = c.sendq.dequeue() + if sg != nil { + gp = sg.g + selunlock(sel) + if sg.releasetime != 0 { + sg.releasetime = cputicks() + } + goready(gp) + } else { + selunlock(sel) + } + goto retc + +asyncsend: + // can send to buffer + if raceenabled { + raceacquire(chanbuf(c, c.sendx)) + racerelease(chanbuf(c, c.sendx)) + raceReadObjectPC(c.elemtype, cas.elem, cas.pc, chansendpc) + } + memmove(chanbuf(c, c.sendx), cas.elem, uintptr(c.elemsize)) + c.sendx++ + if c.sendx == c.dataqsiz { + c.sendx = 0 + } + c.qcount++ + sg = c.recvq.dequeue() + if sg != nil { + gp = sg.g + selunlock(sel) + if sg.releasetime != 0 { + sg.releasetime = cputicks() + } + goready(gp) + } else { + selunlock(sel) + } + goto retc + +syncrecv: + // can receive from sleeping sender (sg) + if raceenabled { + if cas.elem != nil { + 
raceWriteObjectPC(c.elemtype, cas.elem, cas.pc, chanrecvpc) + } + racesync(c, sg) + } + selunlock(sel) + if debugSelect { + print("syncrecv: sel=", sel, " c=", c, "\n") + } + if cas.receivedp != nil { + *cas.receivedp = true + } + if cas.elem != nil { + memmove(cas.elem, sg.elem, uintptr(c.elemsize)) + } + sg.elem = nil + gp = sg.g + gp.param = unsafe.Pointer(sg) + if sg.releasetime != 0 { + sg.releasetime = cputicks() + } + goready(gp) + goto retc + +rclose: + // read at end of closed channel + selunlock(sel) + if cas.receivedp != nil { + *cas.receivedp = false + } + if cas.elem != nil { + memclr(cas.elem, uintptr(c.elemsize)) + } + if raceenabled { + raceacquire(unsafe.Pointer(c)) + } + goto retc + +syncsend: + // can send to sleeping receiver (sg) + if raceenabled { + raceReadObjectPC(c.elemtype, cas.elem, cas.pc, chansendpc) + racesync(c, sg) + } + selunlock(sel) + if debugSelect { + print("syncsend: sel=", sel, " c=", c, "\n") + } + if sg.elem != nil { + memmove(sg.elem, cas.elem, uintptr(c.elemsize)) + } + sg.elem = nil + gp = sg.g + gp.param = unsafe.Pointer(sg) + if sg.releasetime != 0 { + sg.releasetime = cputicks() + } + goready(gp) + +retc: + if cas.releasetime > 0 { + blockevent(cas.releasetime-t0, 2) + } + return cas.pc, cas.so + +sclose: + // send on closed channel + selunlock(sel) + panic("send on closed channel") +} + +func (c *hchan) sortkey() uintptr { + // TODO(khr): if we have a moving garbage collector, we'll need to + // change this function. + return uintptr(unsafe.Pointer(c)) +} + +// A runtimeSelect is a single case passed to rselect. +// This must match ../reflect/value.go:/runtimeSelect +type runtimeSelect struct { + dir selectDir + typ unsafe.Pointer // channel type (not used here) + ch *hchan // channel + val unsafe.Pointer // ptr to data (SendDir) or ptr to receive buffer (RecvDir) +} + +// These values must match ../reflect/value.go:/SelectDir. +type selectDir int + +const ( + _ selectDir = iota + selectSend // case Chan <- Send + selectRecv // case <-Chan: + selectDefault // default +) + +func reflect_rselect(cases []runtimeSelect) (chosen int, recvOK bool) { + // flagNoScan is safe here, because all objects are also referenced from cases. + size := selectsize(uintptr(len(cases))) + sel := (*_select)(mallocgc(size, nil, flagNoScan)) + newselect(sel, int64(size), int32(len(cases))) + r := new(bool) + for i := range cases { + rc := &cases[i] + switch rc.dir { + case selectDefault: + selectdefaultImpl(sel, uintptr(i), 0) + case selectSend: + if rc.ch == nil { + break + } + selectsendImpl(sel, rc.ch, uintptr(i), rc.val, 0) + case selectRecv: + if rc.ch == nil { + break + } + selectrecvImpl(sel, rc.ch, uintptr(i), rc.val, r, 0) + } + } + + pc, _ := selectgoImpl(sel) + chosen = int(pc) + recvOK = *r + return +} + +func (q *waitq) dequeueSudoG(s *sudog) { + var prevsgp *sudog + l := &q.first + for { + sgp := *l + if sgp == nil { + return + } + if sgp == s { + *l = sgp.next + if q.last == sgp { + q.last = prevsgp + } + s.next = nil + return + } + l = &sgp.next + prevsgp = sgp + } +} diff --git a/src/runtime/sema.go b/src/runtime/sema.go new file mode 100644 index 000000000..26dbd30ea --- /dev/null +++ b/src/runtime/sema.go @@ -0,0 +1,275 @@ +// Copyright 2009 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// Semaphore implementation exposed to Go. 
+// Intended use is provide a sleep and wakeup +// primitive that can be used in the contended case +// of other synchronization primitives. +// Thus it targets the same goal as Linux's futex, +// but it has much simpler semantics. +// +// That is, don't think of these as semaphores. +// Think of them as a way to implement sleep and wakeup +// such that every sleep is paired with a single wakeup, +// even if, due to races, the wakeup happens before the sleep. +// +// See Mullender and Cox, ``Semaphores in Plan 9,'' +// http://swtch.com/semaphore.pdf + +package runtime + +import "unsafe" + +// Asynchronous semaphore for sync.Mutex. + +type semaRoot struct { + lock mutex + head *sudog + tail *sudog + nwait uint32 // Number of waiters. Read w/o the lock. +} + +// Prime to not correlate with any user patterns. +const semTabSize = 251 + +var semtable [semTabSize]struct { + root semaRoot + pad [_CacheLineSize - unsafe.Sizeof(semaRoot{})]byte +} + +// Called from sync/net packages. +func asyncsemacquire(addr *uint32) { + semacquire(addr, true) +} + +func asyncsemrelease(addr *uint32) { + semrelease(addr) +} + +// Called from runtime. +func semacquire(addr *uint32, profile bool) { + gp := getg() + if gp != gp.m.curg { + gothrow("semacquire not on the G stack") + } + + // Easy case. + if cansemacquire(addr) { + return + } + + // Harder case: + // increment waiter count + // try cansemacquire one more time, return if succeeded + // enqueue itself as a waiter + // sleep + // (waiter descriptor is dequeued by signaler) + s := acquireSudog() + root := semroot(addr) + t0 := int64(0) + s.releasetime = 0 + if profile && blockprofilerate > 0 { + t0 = cputicks() + s.releasetime = -1 + } + for { + lock(&root.lock) + // Add ourselves to nwait to disable "easy case" in semrelease. + xadd(&root.nwait, 1) + // Check cansemacquire to avoid missed wakeup. + if cansemacquire(addr) { + xadd(&root.nwait, -1) + unlock(&root.lock) + break + } + // Any semrelease after the cansemacquire knows we're waiting + // (we set nwait above), so go to sleep. + root.queue(addr, s) + goparkunlock(&root.lock, "semacquire") + if cansemacquire(addr) { + break + } + } + if s.releasetime > 0 { + blockevent(int64(s.releasetime)-t0, 3) + } + releaseSudog(s) +} + +func semrelease(addr *uint32) { + root := semroot(addr) + xadd(addr, 1) + + // Easy case: no waiters? + // This check must happen after the xadd, to avoid a missed wakeup + // (see loop in semacquire). + if atomicload(&root.nwait) == 0 { + return + } + + // Harder case: search for a waiter and wake it. + lock(&root.lock) + if atomicload(&root.nwait) == 0 { + // The count is already consumed by another goroutine, + // so no need to wake up another goroutine. 
+ unlock(&root.lock) + return + } + s := root.head + for ; s != nil; s = s.next { + if s.elem == unsafe.Pointer(addr) { + xadd(&root.nwait, -1) + root.dequeue(s) + break + } + } + unlock(&root.lock) + if s != nil { + if s.releasetime != 0 { + s.releasetime = cputicks() + } + goready(s.g) + } +} + +func semroot(addr *uint32) *semaRoot { + return &semtable[(uintptr(unsafe.Pointer(addr))>>3)%semTabSize].root +} + +func cansemacquire(addr *uint32) bool { + for { + v := atomicload(addr) + if v == 0 { + return false + } + if cas(addr, v, v-1) { + return true + } + } +} + +func (root *semaRoot) queue(addr *uint32, s *sudog) { + s.g = getg() + s.elem = unsafe.Pointer(addr) + s.next = nil + s.prev = root.tail + if root.tail != nil { + root.tail.next = s + } else { + root.head = s + } + root.tail = s +} + +func (root *semaRoot) dequeue(s *sudog) { + if s.next != nil { + s.next.prev = s.prev + } else { + root.tail = s.prev + } + if s.prev != nil { + s.prev.next = s.next + } else { + root.head = s.next + } + s.elem = nil + s.next = nil + s.prev = nil +} + +// Synchronous semaphore for sync.Cond. +type syncSema struct { + lock mutex + head *sudog + tail *sudog +} + +// Syncsemacquire waits for a pairing syncsemrelease on the same semaphore s. +func syncsemacquire(s *syncSema) { + lock(&s.lock) + if s.head != nil && s.head.nrelease > 0 { + // Have pending release, consume it. + var wake *sudog + s.head.nrelease-- + if s.head.nrelease == 0 { + wake = s.head + s.head = wake.next + if s.head == nil { + s.tail = nil + } + } + unlock(&s.lock) + if wake != nil { + wake.next = nil + goready(wake.g) + } + } else { + // Enqueue itself. + w := acquireSudog() + w.g = getg() + w.nrelease = -1 + w.next = nil + w.releasetime = 0 + t0 := int64(0) + if blockprofilerate > 0 { + t0 = cputicks() + w.releasetime = -1 + } + if s.tail == nil { + s.head = w + } else { + s.tail.next = w + } + s.tail = w + goparkunlock(&s.lock, "semacquire") + if t0 != 0 { + blockevent(int64(w.releasetime)-t0, 2) + } + releaseSudog(w) + } +} + +// Syncsemrelease waits for n pairing syncsemacquire on the same semaphore s. +func syncsemrelease(s *syncSema, n uint32) { + lock(&s.lock) + for n > 0 && s.head != nil && s.head.nrelease < 0 { + // Have pending acquire, satisfy it. + wake := s.head + s.head = wake.next + if s.head == nil { + s.tail = nil + } + if wake.releasetime != 0 { + wake.releasetime = cputicks() + } + wake.next = nil + goready(wake.g) + n-- + } + if n > 0 { + // enqueue itself + w := acquireSudog() + w.g = getg() + w.nrelease = int32(n) + w.next = nil + w.releasetime = 0 + if s.tail == nil { + s.head = w + } else { + s.tail.next = w + } + s.tail = w + goparkunlock(&s.lock, "semarelease") + releaseSudog(w) + } else { + unlock(&s.lock) + } +} + +func syncsemcheck(sz uintptr) { + if sz != unsafe.Sizeof(syncSema{}) { + print("runtime: bad syncSema size - sync=", sz, " runtime=", unsafe.Sizeof(syncSema{}), "\n") + gothrow("bad syncSema size") + } +} diff --git a/src/runtime/signal.c b/src/runtime/signal.c new file mode 100644 index 000000000..0674bfb22 --- /dev/null +++ b/src/runtime/signal.c @@ -0,0 +1,25 @@ +// Copyright 2014 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. 
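To make the contract described at the top of sema.go concrete, here is a small stand-alone sketch, written against ordinary channels rather than the runtime internals, of the sleep/wakeup pairing that semacquire and semrelease provide to the sync and net packages: every sleep is matched with exactly one wakeup, and a wakeup that races ahead of its sleep is not lost. It is an analogy for the documented behaviour, not the runtime's implementation.

package main

import "fmt"

// sema pairs sleeps with wakeups. A buffered channel remembers a wakeup
// that arrives before anyone is asleep, mirroring the comment above.
type sema chan struct{}

func newSema() sema     { return make(sema, 1) } // capacity 1 suffices for this demo
func (s sema) acquire() { <-s }                  // sleep until a wakeup is available
func (s sema) release() { s <- struct{}{} }      // deliver exactly one wakeup

func main() {
	s := newSema()
	s.release() // wakeup happens first, due to a race; it is remembered
	s.acquire() // returns immediately, consuming that earlier wakeup
	fmt.Println("sleep and wakeup paired correctly")
}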
+ +#include "runtime.h" + +void +runtime·sigenable_m(void) +{ + uint32 s; + + s = g->m->scalararg[0]; + g->m->scalararg[0] = 0; + runtime·sigenable(s); +} + +void +runtime·sigdisable_m(void) +{ + uint32 s; + + s = g->m->scalararg[0]; + g->m->scalararg[0] = 0; + runtime·sigdisable(s); +} diff --git a/src/runtime/signal_386.c b/src/runtime/signal_386.c new file mode 100644 index 000000000..30a7488bd --- /dev/null +++ b/src/runtime/signal_386.c @@ -0,0 +1,122 @@ +// Copyright 2013 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// +build darwin dragonfly freebsd linux nacl netbsd openbsd + +#include "runtime.h" +#include "defs_GOOS_GOARCH.h" +#include "os_GOOS.h" +#include "signal_GOOS_GOARCH.h" +#include "signals_GOOS.h" + +void +runtime·dumpregs(Siginfo *info, void *ctxt) +{ + USED(info); + USED(ctxt); + + runtime·printf("eax %x\n", SIG_EAX(info, ctxt)); + runtime·printf("ebx %x\n", SIG_EBX(info, ctxt)); + runtime·printf("ecx %x\n", SIG_ECX(info, ctxt)); + runtime·printf("edx %x\n", SIG_EDX(info, ctxt)); + runtime·printf("edi %x\n", SIG_EDI(info, ctxt)); + runtime·printf("esi %x\n", SIG_ESI(info, ctxt)); + runtime·printf("ebp %x\n", SIG_EBP(info, ctxt)); + runtime·printf("esp %x\n", SIG_ESP(info, ctxt)); + runtime·printf("eip %x\n", SIG_EIP(info, ctxt)); + runtime·printf("eflags %x\n", SIG_EFLAGS(info, ctxt)); + runtime·printf("cs %x\n", SIG_CS(info, ctxt)); + runtime·printf("fs %x\n", SIG_FS(info, ctxt)); + runtime·printf("gs %x\n", SIG_GS(info, ctxt)); +} + +void +runtime·sighandler(int32 sig, Siginfo *info, void *ctxt, G *gp) +{ + uintptr *sp; + SigTab *t; + bool crash; + + if(sig == SIGPROF) { + runtime·sigprof((byte*)SIG_EIP(info, ctxt), (byte*)SIG_ESP(info, ctxt), nil, gp, g->m); + return; + } + + t = &runtime·sigtab[sig]; + if(SIG_CODE0(info, ctxt) != SI_USER && (t->flags & SigPanic)) { + // Make it look like a call to the signal func. + // Have to pass arguments out of band since + // augmenting the stack frame would break + // the unwinding code. + gp->sig = sig; + gp->sigcode0 = SIG_CODE0(info, ctxt); + gp->sigcode1 = SIG_CODE1(info, ctxt); + gp->sigpc = SIG_EIP(info, ctxt); + +#ifdef GOOS_darwin + // Work around Leopard bug that doesn't set FPE_INTDIV. + // Look at instruction to see if it is a divide. + // Not necessary in Snow Leopard (si_code will be != 0). + if(sig == SIGFPE && gp->sigcode0 == 0) { + byte *pc; + pc = (byte*)gp->sigpc; + if(pc[0] == 0x66) // 16-bit instruction prefix + pc++; + if(pc[0] == 0xF6 || pc[0] == 0xF7) + gp->sigcode0 = FPE_INTDIV; + } +#endif + + // Only push runtime·sigpanic if eip != 0. + // If eip == 0, probably panicked because of a + // call to a nil func. Not pushing that onto sp will + // make the trace look like a call to runtime·sigpanic instead. + // (Otherwise the trace will end at runtime·sigpanic and we + // won't get to see who faulted.) 
+ if(SIG_EIP(info, ctxt) != 0) { + sp = (uintptr*)SIG_ESP(info, ctxt); + *--sp = SIG_EIP(info, ctxt); + SIG_ESP(info, ctxt) = (uintptr)sp; + } + SIG_EIP(info, ctxt) = (uintptr)runtime·sigpanic; + return; + } + + if(SIG_CODE0(info, ctxt) == SI_USER || (t->flags & SigNotify)) + if(runtime·sigsend(sig)) + return; + if(t->flags & SigKill) + runtime·exit(2); + if(!(t->flags & SigThrow)) + return; + + g->m->throwing = 1; + g->m->caughtsig = gp; + runtime·startpanic(); + + if(sig < 0 || sig >= NSIG) + runtime·printf("Signal %d\n", sig); + else + runtime·printf("%s\n", runtime·sigtab[sig].name); + + runtime·printf("PC=%x\n", SIG_EIP(info, ctxt)); + if(g->m->lockedg != nil && g->m->ncgo > 0 && gp == g->m->g0) { + runtime·printf("signal arrived during cgo execution\n"); + gp = g->m->lockedg; + } + runtime·printf("\n"); + + if(runtime·gotraceback(&crash)){ + runtime·goroutineheader(gp); + runtime·tracebacktrap(SIG_EIP(info, ctxt), SIG_ESP(info, ctxt), 0, gp); + runtime·tracebackothers(gp); + runtime·printf("\n"); + runtime·dumpregs(info, ctxt); + } + + if(crash) + runtime·crash(); + + runtime·exit(2); +} diff --git a/src/runtime/signal_amd64x.c b/src/runtime/signal_amd64x.c new file mode 100644 index 000000000..feb4afcce --- /dev/null +++ b/src/runtime/signal_amd64x.c @@ -0,0 +1,156 @@ +// Copyright 2013 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// +build amd64 amd64p32 +// +build darwin dragonfly freebsd linux nacl netbsd openbsd solaris + +#include "runtime.h" +#include "defs_GOOS_GOARCH.h" +#include "os_GOOS.h" +#include "signal_GOOS_GOARCH.h" +#include "signals_GOOS.h" + +void +runtime·dumpregs(Siginfo *info, void *ctxt) +{ + USED(info); + USED(ctxt); + + runtime·printf("rax %X\n", SIG_RAX(info, ctxt)); + runtime·printf("rbx %X\n", SIG_RBX(info, ctxt)); + runtime·printf("rcx %X\n", SIG_RCX(info, ctxt)); + runtime·printf("rdx %X\n", SIG_RDX(info, ctxt)); + runtime·printf("rdi %X\n", SIG_RDI(info, ctxt)); + runtime·printf("rsi %X\n", SIG_RSI(info, ctxt)); + runtime·printf("rbp %X\n", SIG_RBP(info, ctxt)); + runtime·printf("rsp %X\n", SIG_RSP(info, ctxt)); + runtime·printf("r8 %X\n", SIG_R8(info, ctxt) ); + runtime·printf("r9 %X\n", SIG_R9(info, ctxt) ); + runtime·printf("r10 %X\n", SIG_R10(info, ctxt)); + runtime·printf("r11 %X\n", SIG_R11(info, ctxt)); + runtime·printf("r12 %X\n", SIG_R12(info, ctxt)); + runtime·printf("r13 %X\n", SIG_R13(info, ctxt)); + runtime·printf("r14 %X\n", SIG_R14(info, ctxt)); + runtime·printf("r15 %X\n", SIG_R15(info, ctxt)); + runtime·printf("rip %X\n", SIG_RIP(info, ctxt)); + runtime·printf("rflags %X\n", SIG_RFLAGS(info, ctxt)); + runtime·printf("cs %X\n", SIG_CS(info, ctxt)); + runtime·printf("fs %X\n", SIG_FS(info, ctxt)); + runtime·printf("gs %X\n", SIG_GS(info, ctxt)); +} + +void +runtime·sighandler(int32 sig, Siginfo *info, void *ctxt, G *gp) +{ + uintptr *sp; + SigTab *t; + bool crash; + + if(sig == SIGPROF) { + runtime·sigprof((byte*)SIG_RIP(info, ctxt), (byte*)SIG_RSP(info, ctxt), nil, gp, g->m); + return; + } + +#ifdef GOOS_darwin + // x86-64 has 48-bit virtual addresses. The top 16 bits must echo bit 47. + // The hardware delivers a different kind of fault for a malformed address + // than it does for an attempt to access a valid but unmapped address. + // OS X 10.9.2 mishandles the malformed address case, making it look like + // a user-generated signal (like someone ran kill -SEGV ourpid). 
+ // We pass user-generated signals to os/signal, or else ignore them. + // Doing that here - and returning to the faulting code - results in an + // infinite loop. It appears the best we can do is rewrite what the kernel + // delivers into something more like the truth. The address used below + // has very little chance of being the one that caused the fault, but it is + // malformed, it is clearly not a real pointer, and if it does get printed + // in real life, people will probably search for it and find this code. + // There are no Google hits for b01dfacedebac1e or 0xb01dfacedebac1e + // as I type this comment. + if(sig == SIGSEGV && SIG_CODE0(info, ctxt) == SI_USER) { + SIG_CODE0(info, ctxt) = SI_USER+1; + info->si_addr = (void*)(uintptr)0xb01dfacedebac1eULL; + } +#endif + + t = &runtime·sigtab[sig]; + if(SIG_CODE0(info, ctxt) != SI_USER && (t->flags & SigPanic)) { + // Make it look like a call to the signal func. + // Have to pass arguments out of band since + // augmenting the stack frame would break + // the unwinding code. + gp->sig = sig; + gp->sigcode0 = SIG_CODE0(info, ctxt); + gp->sigcode1 = SIG_CODE1(info, ctxt); + gp->sigpc = SIG_RIP(info, ctxt); + +#ifdef GOOS_darwin + // Work around Leopard bug that doesn't set FPE_INTDIV. + // Look at instruction to see if it is a divide. + // Not necessary in Snow Leopard (si_code will be != 0). + if(sig == SIGFPE && gp->sigcode0 == 0) { + byte *pc; + pc = (byte*)gp->sigpc; + if((pc[0]&0xF0) == 0x40) // 64-bit REX prefix + pc++; + else if(pc[0] == 0x66) // 16-bit instruction prefix + pc++; + if(pc[0] == 0xF6 || pc[0] == 0xF7) + gp->sigcode0 = FPE_INTDIV; + } +#endif + + // Only push runtime·sigpanic if rip != 0. + // If rip == 0, probably panicked because of a + // call to a nil func. Not pushing that onto sp will + // make the trace look like a call to runtime·sigpanic instead. + // (Otherwise the trace will end at runtime·sigpanic and we + // won't get to see who faulted.) 
+ if(SIG_RIP(info, ctxt) != 0) { + sp = (uintptr*)SIG_RSP(info, ctxt); + if(sizeof(uintreg) > sizeof(uintptr)) + *--sp = 0; + *--sp = SIG_RIP(info, ctxt); + SIG_RSP(info, ctxt) = (uintptr)sp; + } + SIG_RIP(info, ctxt) = (uintptr)runtime·sigpanic; + return; + } + + if(SIG_CODE0(info, ctxt) == SI_USER || (t->flags & SigNotify)) + if(runtime·sigsend(sig)) + return; + if(t->flags & SigKill) + runtime·exit(2); + if(!(t->flags & SigThrow)) + return; + + g->m->throwing = 1; + g->m->caughtsig = gp; + runtime·startpanic(); + + if(sig < 0 || sig >= NSIG) + runtime·printf("Signal %d\n", sig); + else + runtime·printf("%s\n", runtime·sigtab[sig].name); + + runtime·printf("PC=%X\n", SIG_RIP(info, ctxt)); + if(g->m->lockedg != nil && g->m->ncgo > 0 && gp == g->m->g0) { + runtime·printf("signal arrived during cgo execution\n"); + gp = g->m->lockedg; + } + runtime·printf("\n"); + + if(runtime·gotraceback(&crash)){ + runtime·goroutineheader(gp); + runtime·tracebacktrap(SIG_RIP(info, ctxt), SIG_RSP(info, ctxt), 0, gp); + runtime·tracebackothers(gp); + runtime·printf("\n"); + runtime·dumpregs(info, ctxt); + } + + if(crash) + runtime·crash(); + + runtime·exit(2); +} diff --git a/src/runtime/signal_android_386.h b/src/runtime/signal_android_386.h new file mode 100644 index 000000000..2a1bb4b3e --- /dev/null +++ b/src/runtime/signal_android_386.h @@ -0,0 +1 @@ +#include "signal_linux_386.h" diff --git a/src/runtime/signal_android_arm.h b/src/runtime/signal_android_arm.h new file mode 100644 index 000000000..8a05e21e5 --- /dev/null +++ b/src/runtime/signal_android_arm.h @@ -0,0 +1 @@ +#include "signal_linux_arm.h" diff --git a/src/runtime/signal_arm.c b/src/runtime/signal_arm.c new file mode 100644 index 000000000..afad5e7d1 --- /dev/null +++ b/src/runtime/signal_arm.c @@ -0,0 +1,121 @@ +// Copyright 2009 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. 
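The x86 handlers above rewrite the saved PC and SP so that the faulting goroutine appears to have called runtime·sigpanic; from Go code the visible effect is that a bad memory access turns into an ordinary runtime error panic with a full traceback. A minimal illustration of that user-visible behaviour (nothing here is runtime-internal):

package main

import (
	"fmt"
	"runtime"
)

// deref converts the panic produced by a nil dereference back into an error.
func deref(p *int) (n int, err error) {
	defer func() {
		// The SIGSEGV raised by *p is redirected to sigpanic by the signal
		// handler and surfaces here as a recoverable runtime.Error.
		if r := recover(); r != nil {
			if re, ok := r.(runtime.Error); ok {
				err = fmt.Errorf("%v", re)
				return
			}
			panic(r) // not a runtime fault; re-raise
		}
	}()
	return *p, nil
}

func main() {
	if _, err := deref(nil); err != nil {
		fmt.Println("recovered:", err)
	}
}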
+ +// +build darwin dragonfly freebsd linux nacl netbsd openbsd + +#include "runtime.h" +#include "defs_GOOS_GOARCH.h" +#include "os_GOOS.h" +#include "signal_GOOS_GOARCH.h" +#include "signals_GOOS.h" + +void +runtime·dumpregs(Siginfo *info, void *ctxt) +{ + USED(info); + USED(ctxt); + + runtime·printf("trap %x\n", SIG_TRAP(info, ctxt)); + runtime·printf("error %x\n", SIG_ERROR(info, ctxt)); + runtime·printf("oldmask %x\n", SIG_OLDMASK(info, ctxt)); + runtime·printf("r0 %x\n", SIG_R0(info, ctxt)); + runtime·printf("r1 %x\n", SIG_R1(info, ctxt)); + runtime·printf("r2 %x\n", SIG_R2(info, ctxt)); + runtime·printf("r3 %x\n", SIG_R3(info, ctxt)); + runtime·printf("r4 %x\n", SIG_R4(info, ctxt)); + runtime·printf("r5 %x\n", SIG_R5(info, ctxt)); + runtime·printf("r6 %x\n", SIG_R6(info, ctxt)); + runtime·printf("r7 %x\n", SIG_R7(info, ctxt)); + runtime·printf("r8 %x\n", SIG_R8(info, ctxt)); + runtime·printf("r9 %x\n", SIG_R9(info, ctxt)); + runtime·printf("r10 %x\n", SIG_R10(info, ctxt)); + runtime·printf("fp %x\n", SIG_FP(info, ctxt)); + runtime·printf("ip %x\n", SIG_IP(info, ctxt)); + runtime·printf("sp %x\n", SIG_SP(info, ctxt)); + runtime·printf("lr %x\n", SIG_LR(info, ctxt)); + runtime·printf("pc %x\n", SIG_PC(info, ctxt)); + runtime·printf("cpsr %x\n", SIG_CPSR(info, ctxt)); + runtime·printf("fault %x\n", SIG_FAULT(info, ctxt)); +} + +void +runtime·sighandler(int32 sig, Siginfo *info, void *ctxt, G *gp) +{ + SigTab *t; + bool crash; + + if(sig == SIGPROF) { + runtime·sigprof((uint8*)SIG_PC(info, ctxt), (uint8*)SIG_SP(info, ctxt), (uint8*)SIG_LR(info, ctxt), gp, g->m); + return; + } + + t = &runtime·sigtab[sig]; + if(SIG_CODE0(info, ctxt) != SI_USER && (t->flags & SigPanic)) { + // Make it look like a call to the signal func. + // Have to pass arguments out of band since + // augmenting the stack frame would break + // the unwinding code. + gp->sig = sig; + gp->sigcode0 = SIG_CODE0(info, ctxt); + gp->sigcode1 = SIG_FAULT(info, ctxt); + gp->sigpc = SIG_PC(info, ctxt); + + // We arrange lr, and pc to pretend the panicking + // function calls sigpanic directly. + // Always save LR to stack so that panics in leaf + // functions are correctly handled. This smashes + // the stack frame but we're not going back there + // anyway. + SIG_SP(info, ctxt) -= 4; + *(uint32*)SIG_SP(info, ctxt) = SIG_LR(info, ctxt); + // Don't bother saving PC if it's zero, which is + // probably a call to a nil func: the old link register + // is more useful in the stack trace. 
+ if(gp->sigpc != 0) + SIG_LR(info, ctxt) = gp->sigpc; + // In case we are panicking from external C code + SIG_R10(info, ctxt) = (uintptr)gp; + SIG_PC(info, ctxt) = (uintptr)runtime·sigpanic; + return; + } + + if(SIG_CODE0(info, ctxt) == SI_USER || (t->flags & SigNotify)) + if(runtime·sigsend(sig)) + return; + if(t->flags & SigKill) + runtime·exit(2); + if(!(t->flags & SigThrow)) + return; + + g->m->throwing = 1; + g->m->caughtsig = gp; + if(runtime·panicking) // traceback already printed + runtime·exit(2); + runtime·panicking = 1; + + if(sig < 0 || sig >= NSIG) + runtime·printf("Signal %d\n", sig); + else + runtime·printf("%s\n", runtime·sigtab[sig].name); + + runtime·printf("PC=%x\n", SIG_PC(info, ctxt)); + if(g->m->lockedg != nil && g->m->ncgo > 0 && gp == g->m->g0) { + runtime·printf("signal arrived during cgo execution\n"); + gp = g->m->lockedg; + } + runtime·printf("\n"); + + if(runtime·gotraceback(&crash)){ + runtime·goroutineheader(gp); + runtime·tracebacktrap(SIG_PC(info, ctxt), SIG_SP(info, ctxt), SIG_LR(info, ctxt), gp); + runtime·tracebackothers(gp); + runtime·printf("\n"); + runtime·dumpregs(info, ctxt); + } + + if(crash) + runtime·crash(); + + runtime·exit(2); +} diff --git a/src/runtime/signal_darwin_386.h b/src/runtime/signal_darwin_386.h new file mode 100644 index 000000000..5459e10a1 --- /dev/null +++ b/src/runtime/signal_darwin_386.h @@ -0,0 +1,23 @@ +// Copyright 2013 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#define SIG_REGS(ctxt) (((Ucontext*)(ctxt))->uc_mcontext->ss) + +#define SIG_EAX(info, ctxt) (SIG_REGS(ctxt).eax) +#define SIG_EBX(info, ctxt) (SIG_REGS(ctxt).ebx) +#define SIG_ECX(info, ctxt) (SIG_REGS(ctxt).ecx) +#define SIG_EDX(info, ctxt) (SIG_REGS(ctxt).edx) +#define SIG_EDI(info, ctxt) (SIG_REGS(ctxt).edi) +#define SIG_ESI(info, ctxt) (SIG_REGS(ctxt).esi) +#define SIG_EBP(info, ctxt) (SIG_REGS(ctxt).ebp) +#define SIG_ESP(info, ctxt) (SIG_REGS(ctxt).esp) +#define SIG_EIP(info, ctxt) (SIG_REGS(ctxt).eip) +#define SIG_EFLAGS(info, ctxt) (SIG_REGS(ctxt).eflags) + +#define SIG_CS(info, ctxt) (SIG_REGS(ctxt).cs) +#define SIG_FS(info, ctxt) (SIG_REGS(ctxt).fs) +#define SIG_GS(info, ctxt) (SIG_REGS(ctxt).gs) + +#define SIG_CODE0(info, ctxt) ((info)->si_code) +#define SIG_CODE1(info, ctxt) ((uintptr)(info)->si_addr) diff --git a/src/runtime/signal_darwin_amd64.h b/src/runtime/signal_darwin_amd64.h new file mode 100644 index 000000000..e3da6de3a --- /dev/null +++ b/src/runtime/signal_darwin_amd64.h @@ -0,0 +1,31 @@ +// Copyright 2013 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. 
+ +#define SIG_REGS(ctxt) (((Ucontext*)(ctxt))->uc_mcontext->ss) + +#define SIG_RAX(info, ctxt) (SIG_REGS(ctxt).rax) +#define SIG_RBX(info, ctxt) (SIG_REGS(ctxt).rbx) +#define SIG_RCX(info, ctxt) (SIG_REGS(ctxt).rcx) +#define SIG_RDX(info, ctxt) (SIG_REGS(ctxt).rdx) +#define SIG_RDI(info, ctxt) (SIG_REGS(ctxt).rdi) +#define SIG_RSI(info, ctxt) (SIG_REGS(ctxt).rsi) +#define SIG_RBP(info, ctxt) (SIG_REGS(ctxt).rbp) +#define SIG_RSP(info, ctxt) (SIG_REGS(ctxt).rsp) +#define SIG_R8(info, ctxt) (SIG_REGS(ctxt).r8) +#define SIG_R9(info, ctxt) (SIG_REGS(ctxt).r9) +#define SIG_R10(info, ctxt) (SIG_REGS(ctxt).r10) +#define SIG_R11(info, ctxt) (SIG_REGS(ctxt).r11) +#define SIG_R12(info, ctxt) (SIG_REGS(ctxt).r12) +#define SIG_R13(info, ctxt) (SIG_REGS(ctxt).r13) +#define SIG_R14(info, ctxt) (SIG_REGS(ctxt).r14) +#define SIG_R15(info, ctxt) (SIG_REGS(ctxt).r15) +#define SIG_RIP(info, ctxt) (SIG_REGS(ctxt).rip) +#define SIG_RFLAGS(info, ctxt) (SIG_REGS(ctxt).rflags) + +#define SIG_CS(info, ctxt) (SIG_REGS(ctxt).cs) +#define SIG_FS(info, ctxt) (SIG_REGS(ctxt).fs) +#define SIG_GS(info, ctxt) (SIG_REGS(ctxt).gs) + +#define SIG_CODE0(info, ctxt) ((info)->si_code) +#define SIG_CODE1(info, ctxt) ((uintptr)(info)->si_addr) diff --git a/src/runtime/signal_dragonfly_386.h b/src/runtime/signal_dragonfly_386.h new file mode 100644 index 000000000..a24f1ee96 --- /dev/null +++ b/src/runtime/signal_dragonfly_386.h @@ -0,0 +1,23 @@ +// Copyright 2013 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#define SIG_REGS(ctxt) (((Ucontext*)(ctxt))->uc_mcontext) + +#define SIG_EAX(info, ctxt) (SIG_REGS(ctxt).mc_eax) +#define SIG_EBX(info, ctxt) (SIG_REGS(ctxt).mc_ebx) +#define SIG_ECX(info, ctxt) (SIG_REGS(ctxt).mc_ecx) +#define SIG_EDX(info, ctxt) (SIG_REGS(ctxt).mc_edx) +#define SIG_EDI(info, ctxt) (SIG_REGS(ctxt).mc_edi) +#define SIG_ESI(info, ctxt) (SIG_REGS(ctxt).mc_esi) +#define SIG_EBP(info, ctxt) (SIG_REGS(ctxt).mc_ebp) +#define SIG_ESP(info, ctxt) (SIG_REGS(ctxt).mc_esp) +#define SIG_EIP(info, ctxt) (SIG_REGS(ctxt).mc_eip) +#define SIG_EFLAGS(info, ctxt) (SIG_REGS(ctxt).mc_eflags) + +#define SIG_CS(info, ctxt) (SIG_REGS(ctxt).mc_cs) +#define SIG_FS(info, ctxt) (SIG_REGS(ctxt).mc_fs) +#define SIG_GS(info, ctxt) (SIG_REGS(ctxt).mc_gs) + +#define SIG_CODE0(info, ctxt) ((info)->si_code) +#define SIG_CODE1(info, ctxt) ((uintptr)(info)->si_addr) diff --git a/src/runtime/signal_dragonfly_amd64.h b/src/runtime/signal_dragonfly_amd64.h new file mode 100644 index 000000000..5b4f97782 --- /dev/null +++ b/src/runtime/signal_dragonfly_amd64.h @@ -0,0 +1,31 @@ +// Copyright 2013 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. 
+ +#define SIG_REGS(ctxt) (((Ucontext*)(ctxt))->uc_mcontext) + +#define SIG_RAX(info, ctxt) (SIG_REGS(ctxt).mc_rax) +#define SIG_RBX(info, ctxt) (SIG_REGS(ctxt).mc_rbx) +#define SIG_RCX(info, ctxt) (SIG_REGS(ctxt).mc_rcx) +#define SIG_RDX(info, ctxt) (SIG_REGS(ctxt).mc_rdx) +#define SIG_RDI(info, ctxt) (SIG_REGS(ctxt).mc_rdi) +#define SIG_RSI(info, ctxt) (SIG_REGS(ctxt).mc_rsi) +#define SIG_RBP(info, ctxt) (SIG_REGS(ctxt).mc_rbp) +#define SIG_RSP(info, ctxt) (SIG_REGS(ctxt).mc_rsp) +#define SIG_R8(info, ctxt) (SIG_REGS(ctxt).mc_r8) +#define SIG_R9(info, ctxt) (SIG_REGS(ctxt).mc_r9) +#define SIG_R10(info, ctxt) (SIG_REGS(ctxt).mc_r10) +#define SIG_R11(info, ctxt) (SIG_REGS(ctxt).mc_r11) +#define SIG_R12(info, ctxt) (SIG_REGS(ctxt).mc_r12) +#define SIG_R13(info, ctxt) (SIG_REGS(ctxt).mc_r13) +#define SIG_R14(info, ctxt) (SIG_REGS(ctxt).mc_r14) +#define SIG_R15(info, ctxt) (SIG_REGS(ctxt).mc_r15) +#define SIG_RIP(info, ctxt) (SIG_REGS(ctxt).mc_rip) +#define SIG_RFLAGS(info, ctxt) (SIG_REGS(ctxt).mc_rflags) + +#define SIG_CS(info, ctxt) (SIG_REGS(ctxt).mc_cs) +#define SIG_FS(info, ctxt) (SIG_REGS(ctxt).mc_ss) +#define SIG_GS(info, ctxt) (SIG_REGS(ctxt).mc_ss) + +#define SIG_CODE0(info, ctxt) ((info)->si_code) +#define SIG_CODE1(info, ctxt) ((uintptr)(info)->si_addr) diff --git a/src/runtime/signal_freebsd_386.h b/src/runtime/signal_freebsd_386.h new file mode 100644 index 000000000..a24f1ee96 --- /dev/null +++ b/src/runtime/signal_freebsd_386.h @@ -0,0 +1,23 @@ +// Copyright 2013 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#define SIG_REGS(ctxt) (((Ucontext*)(ctxt))->uc_mcontext) + +#define SIG_EAX(info, ctxt) (SIG_REGS(ctxt).mc_eax) +#define SIG_EBX(info, ctxt) (SIG_REGS(ctxt).mc_ebx) +#define SIG_ECX(info, ctxt) (SIG_REGS(ctxt).mc_ecx) +#define SIG_EDX(info, ctxt) (SIG_REGS(ctxt).mc_edx) +#define SIG_EDI(info, ctxt) (SIG_REGS(ctxt).mc_edi) +#define SIG_ESI(info, ctxt) (SIG_REGS(ctxt).mc_esi) +#define SIG_EBP(info, ctxt) (SIG_REGS(ctxt).mc_ebp) +#define SIG_ESP(info, ctxt) (SIG_REGS(ctxt).mc_esp) +#define SIG_EIP(info, ctxt) (SIG_REGS(ctxt).mc_eip) +#define SIG_EFLAGS(info, ctxt) (SIG_REGS(ctxt).mc_eflags) + +#define SIG_CS(info, ctxt) (SIG_REGS(ctxt).mc_cs) +#define SIG_FS(info, ctxt) (SIG_REGS(ctxt).mc_fs) +#define SIG_GS(info, ctxt) (SIG_REGS(ctxt).mc_gs) + +#define SIG_CODE0(info, ctxt) ((info)->si_code) +#define SIG_CODE1(info, ctxt) ((uintptr)(info)->si_addr) diff --git a/src/runtime/signal_freebsd_amd64.h b/src/runtime/signal_freebsd_amd64.h new file mode 100644 index 000000000..7d35b7f85 --- /dev/null +++ b/src/runtime/signal_freebsd_amd64.h @@ -0,0 +1,31 @@ +// Copyright 2013 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. 
+ +#define SIG_REGS(ctxt) (((Ucontext*)(ctxt))->uc_mcontext) + +#define SIG_RAX(info, ctxt) (SIG_REGS(ctxt).mc_rax) +#define SIG_RBX(info, ctxt) (SIG_REGS(ctxt).mc_rbx) +#define SIG_RCX(info, ctxt) (SIG_REGS(ctxt).mc_rcx) +#define SIG_RDX(info, ctxt) (SIG_REGS(ctxt).mc_rdx) +#define SIG_RDI(info, ctxt) (SIG_REGS(ctxt).mc_rdi) +#define SIG_RSI(info, ctxt) (SIG_REGS(ctxt).mc_rsi) +#define SIG_RBP(info, ctxt) (SIG_REGS(ctxt).mc_rbp) +#define SIG_RSP(info, ctxt) (SIG_REGS(ctxt).mc_rsp) +#define SIG_R8(info, ctxt) (SIG_REGS(ctxt).mc_r8) +#define SIG_R9(info, ctxt) (SIG_REGS(ctxt).mc_r9) +#define SIG_R10(info, ctxt) (SIG_REGS(ctxt).mc_r10) +#define SIG_R11(info, ctxt) (SIG_REGS(ctxt).mc_r11) +#define SIG_R12(info, ctxt) (SIG_REGS(ctxt).mc_r12) +#define SIG_R13(info, ctxt) (SIG_REGS(ctxt).mc_r13) +#define SIG_R14(info, ctxt) (SIG_REGS(ctxt).mc_r14) +#define SIG_R15(info, ctxt) (SIG_REGS(ctxt).mc_r15) +#define SIG_RIP(info, ctxt) (SIG_REGS(ctxt).mc_rip) +#define SIG_RFLAGS(info, ctxt) (SIG_REGS(ctxt).mc_rflags) + +#define SIG_CS(info, ctxt) (SIG_REGS(ctxt).mc_cs) +#define SIG_FS(info, ctxt) (SIG_REGS(ctxt).mc_fs) +#define SIG_GS(info, ctxt) (SIG_REGS(ctxt).mc_gs) + +#define SIG_CODE0(info, ctxt) ((info)->si_code) +#define SIG_CODE1(info, ctxt) ((uintptr)(info)->si_addr) diff --git a/src/runtime/signal_freebsd_arm.h b/src/runtime/signal_freebsd_arm.h new file mode 100644 index 000000000..87a45aa27 --- /dev/null +++ b/src/runtime/signal_freebsd_arm.h @@ -0,0 +1,28 @@ +// Copyright 2013 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#define SIG_REGS(ctxt) (((Ucontext*)(ctxt))->uc_mcontext) + +#define SIG_R0(info, ctxt) (SIG_REGS(ctxt).__gregs[0]) +#define SIG_R1(info, ctxt) (SIG_REGS(ctxt).__gregs[1]) +#define SIG_R2(info, ctxt) (SIG_REGS(ctxt).__gregs[2]) +#define SIG_R3(info, ctxt) (SIG_REGS(ctxt).__gregs[3]) +#define SIG_R4(info, ctxt) (SIG_REGS(ctxt).__gregs[4]) +#define SIG_R5(info, ctxt) (SIG_REGS(ctxt).__gregs[5]) +#define SIG_R6(info, ctxt) (SIG_REGS(ctxt).__gregs[6]) +#define SIG_R7(info, ctxt) (SIG_REGS(ctxt).__gregs[7]) +#define SIG_R8(info, ctxt) (SIG_REGS(ctxt).__gregs[8]) +#define SIG_R9(info, ctxt) (SIG_REGS(ctxt).__gregs[9]) +#define SIG_R10(info, ctxt) (SIG_REGS(ctxt).__gregs[10]) +#define SIG_FP(info, ctxt) (SIG_REGS(ctxt).__gregs[11]) +#define SIG_IP(info, ctxt) (SIG_REGS(ctxt).__gregs[12]) +#define SIG_SP(info, ctxt) (SIG_REGS(ctxt).__gregs[13]) +#define SIG_LR(info, ctxt) (SIG_REGS(ctxt).__gregs[14]) +#define SIG_PC(info, ctxt) (SIG_REGS(ctxt).__gregs[15]) +#define SIG_CPSR(info, ctxt) (SIG_REGS(ctxt).__gregs[16]) +#define SIG_FAULT(info, ctxt) ((uintptr)(info)->si_addr) +#define SIG_TRAP(info, ctxt) (0) +#define SIG_ERROR(info, ctxt) (0) +#define SIG_OLDMASK(info, ctxt) (0) +#define SIG_CODE0(info, ctxt) ((uintptr)(info)->si_code) diff --git a/src/runtime/signal_linux_386.h b/src/runtime/signal_linux_386.h new file mode 100644 index 000000000..f77f1c9d5 --- /dev/null +++ b/src/runtime/signal_linux_386.h @@ -0,0 +1,24 @@ +// Copyright 2013 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. 
+ +#define SIG_REGS(ctxt) (*((Sigcontext*)&((Ucontext*)(ctxt))->uc_mcontext)) + +#define SIG_EAX(info, ctxt) (SIG_REGS(ctxt).eax) +#define SIG_EBX(info, ctxt) (SIG_REGS(ctxt).ebx) +#define SIG_ECX(info, ctxt) (SIG_REGS(ctxt).ecx) +#define SIG_EDX(info, ctxt) (SIG_REGS(ctxt).edx) +#define SIG_EDI(info, ctxt) (SIG_REGS(ctxt).edi) +#define SIG_ESI(info, ctxt) (SIG_REGS(ctxt).esi) +#define SIG_EBP(info, ctxt) (SIG_REGS(ctxt).ebp) +#define SIG_ESP(info, ctxt) (SIG_REGS(ctxt).esp) +#define SIG_EIP(info, ctxt) (SIG_REGS(ctxt).eip) +#define SIG_EFLAGS(info, ctxt) (SIG_REGS(ctxt).eflags) + +#define SIG_CS(info, ctxt) (SIG_REGS(ctxt).cs) +#define SIG_FS(info, ctxt) (SIG_REGS(ctxt).fs) +#define SIG_GS(info, ctxt) (SIG_REGS(ctxt).gs) + +#define SIG_CODE0(info, ctxt) ((info)->si_code) +#define SIG_CODE1(info, ctxt) (((uintptr*)(info))[2]) + diff --git a/src/runtime/signal_linux_amd64.h b/src/runtime/signal_linux_amd64.h new file mode 100644 index 000000000..5a9a3e5da --- /dev/null +++ b/src/runtime/signal_linux_amd64.h @@ -0,0 +1,32 @@ +// Copyright 2013 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#define SIG_REGS(ctxt) (*((Sigcontext*)&((Ucontext*)(ctxt))->uc_mcontext)) + +#define SIG_RAX(info, ctxt) (SIG_REGS(ctxt).rax) +#define SIG_RBX(info, ctxt) (SIG_REGS(ctxt).rbx) +#define SIG_RCX(info, ctxt) (SIG_REGS(ctxt).rcx) +#define SIG_RDX(info, ctxt) (SIG_REGS(ctxt).rdx) +#define SIG_RDI(info, ctxt) (SIG_REGS(ctxt).rdi) +#define SIG_RSI(info, ctxt) (SIG_REGS(ctxt).rsi) +#define SIG_RBP(info, ctxt) (SIG_REGS(ctxt).rbp) +#define SIG_RSP(info, ctxt) (SIG_REGS(ctxt).rsp) +#define SIG_R8(info, ctxt) (SIG_REGS(ctxt).r8) +#define SIG_R9(info, ctxt) (SIG_REGS(ctxt).r9) +#define SIG_R10(info, ctxt) (SIG_REGS(ctxt).r10) +#define SIG_R11(info, ctxt) (SIG_REGS(ctxt).r11) +#define SIG_R12(info, ctxt) (SIG_REGS(ctxt).r12) +#define SIG_R13(info, ctxt) (SIG_REGS(ctxt).r13) +#define SIG_R14(info, ctxt) (SIG_REGS(ctxt).r14) +#define SIG_R15(info, ctxt) (SIG_REGS(ctxt).r15) +#define SIG_RIP(info, ctxt) (SIG_REGS(ctxt).rip) +#define SIG_RFLAGS(info, ctxt) ((uint64)SIG_REGS(ctxt).eflags) + +#define SIG_CS(info, ctxt) ((uint64)SIG_REGS(ctxt).cs) +#define SIG_FS(info, ctxt) ((uint64)SIG_REGS(ctxt).fs) +#define SIG_GS(info, ctxt) ((uint64)SIG_REGS(ctxt).gs) + +#define SIG_CODE0(info, ctxt) ((info)->si_code) +#define SIG_CODE1(info, ctxt) (((uintptr*)(info))[2]) + diff --git a/src/runtime/signal_linux_arm.h b/src/runtime/signal_linux_arm.h new file mode 100644 index 000000000..a674c0d57 --- /dev/null +++ b/src/runtime/signal_linux_arm.h @@ -0,0 +1,28 @@ +// Copyright 2013 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. 
+ +#define SIG_REGS(ctxt) (*((Sigcontext*)&((Ucontext*)(ctxt))->uc_mcontext)) + +#define SIG_R0(info, ctxt) (SIG_REGS(ctxt).arm_r0) +#define SIG_R1(info, ctxt) (SIG_REGS(ctxt).arm_r1) +#define SIG_R2(info, ctxt) (SIG_REGS(ctxt).arm_r2) +#define SIG_R3(info, ctxt) (SIG_REGS(ctxt).arm_r3) +#define SIG_R4(info, ctxt) (SIG_REGS(ctxt).arm_r4) +#define SIG_R5(info, ctxt) (SIG_REGS(ctxt).arm_r5) +#define SIG_R6(info, ctxt) (SIG_REGS(ctxt).arm_r6) +#define SIG_R7(info, ctxt) (SIG_REGS(ctxt).arm_r7) +#define SIG_R8(info, ctxt) (SIG_REGS(ctxt).arm_r8) +#define SIG_R9(info, ctxt) (SIG_REGS(ctxt).arm_r9) +#define SIG_R10(info, ctxt) (SIG_REGS(ctxt).arm_r10) +#define SIG_FP(info, ctxt) (SIG_REGS(ctxt).arm_fp) +#define SIG_IP(info, ctxt) (SIG_REGS(ctxt).arm_ip) +#define SIG_SP(info, ctxt) (SIG_REGS(ctxt).arm_sp) +#define SIG_LR(info, ctxt) (SIG_REGS(ctxt).arm_lr) +#define SIG_PC(info, ctxt) (SIG_REGS(ctxt).arm_pc) +#define SIG_CPSR(info, ctxt) (SIG_REGS(ctxt).arm_cpsr) +#define SIG_FAULT(info, ctxt) (SIG_REGS(ctxt).fault_address) +#define SIG_TRAP(info, ctxt) (SIG_REGS(ctxt).trap_no) +#define SIG_ERROR(info, ctxt) (SIG_REGS(ctxt).error_code) +#define SIG_OLDMASK(info, ctxt) (SIG_REGS(ctxt).oldmask) +#define SIG_CODE0(info, ctxt) ((uintptr)(info)->si_code) diff --git a/src/runtime/signal_nacl_386.h b/src/runtime/signal_nacl_386.h new file mode 100644 index 000000000..c9481b5f4 --- /dev/null +++ b/src/runtime/signal_nacl_386.h @@ -0,0 +1,23 @@ +// Copyright 2013 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#define SIG_REGS(ctxt) (((ExcContext*)(ctxt))->regs) + +#define SIG_EAX(info, ctxt) (SIG_REGS(ctxt).eax) +#define SIG_EBX(info, ctxt) (SIG_REGS(ctxt).ebx) +#define SIG_ECX(info, ctxt) (SIG_REGS(ctxt).ecx) +#define SIG_EDX(info, ctxt) (SIG_REGS(ctxt).edx) +#define SIG_EDI(info, ctxt) (SIG_REGS(ctxt).edi) +#define SIG_ESI(info, ctxt) (SIG_REGS(ctxt).esi) +#define SIG_EBP(info, ctxt) (SIG_REGS(ctxt).ebp) +#define SIG_ESP(info, ctxt) (SIG_REGS(ctxt).esp) +#define SIG_EIP(info, ctxt) (SIG_REGS(ctxt).eip) +#define SIG_EFLAGS(info, ctxt) (SIG_REGS(ctxt).eflags) + +#define SIG_CS(info, ctxt) (~0) +#define SIG_FS(info, ctxt) (~0) +#define SIG_GS(info, ctxt) (~0) + +#define SIG_CODE0(info, ctxt) (~0) +#define SIG_CODE1(info, ctxt) (0) diff --git a/src/runtime/signal_nacl_amd64p32.h b/src/runtime/signal_nacl_amd64p32.h new file mode 100644 index 000000000..f62305cb5 --- /dev/null +++ b/src/runtime/signal_nacl_amd64p32.h @@ -0,0 +1,31 @@ +// Copyright 2013 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. 
+ +#define SIG_REGS(ctxt) (((ExcContext*)(ctxt))->regs.regs64) + +#define SIG_RAX(info, ctxt) (SIG_REGS(ctxt).rax) +#define SIG_RBX(info, ctxt) (SIG_REGS(ctxt).rbx) +#define SIG_RCX(info, ctxt) (SIG_REGS(ctxt).rcx) +#define SIG_RDX(info, ctxt) (SIG_REGS(ctxt).rdx) +#define SIG_RDI(info, ctxt) (SIG_REGS(ctxt).rdi) +#define SIG_RSI(info, ctxt) (SIG_REGS(ctxt).rsi) +#define SIG_RBP(info, ctxt) (SIG_REGS(ctxt).rbp) +#define SIG_RSP(info, ctxt) (SIG_REGS(ctxt).rsp) +#define SIG_R8(info, ctxt) (SIG_REGS(ctxt).r8) +#define SIG_R9(info, ctxt) (SIG_REGS(ctxt).r9) +#define SIG_R10(info, ctxt) (SIG_REGS(ctxt).r10) +#define SIG_R11(info, ctxt) (SIG_REGS(ctxt).r11) +#define SIG_R12(info, ctxt) (SIG_REGS(ctxt).r12) +#define SIG_R13(info, ctxt) (SIG_REGS(ctxt).r13) +#define SIG_R14(info, ctxt) (SIG_REGS(ctxt).r14) +#define SIG_R15(info, ctxt) (SIG_REGS(ctxt).r15) +#define SIG_RIP(info, ctxt) (SIG_REGS(ctxt).rip) +#define SIG_RFLAGS(info, ctxt) (SIG_REGS(ctxt).rflags) + +#define SIG_CS(info, ctxt) (~0) +#define SIG_FS(info, ctxt) (~0) +#define SIG_GS(info, ctxt) (~0) + +#define SIG_CODE0(info, ctxt) (~0) +#define SIG_CODE1(info, ctxt) (0) diff --git a/src/runtime/signal_nacl_arm.h b/src/runtime/signal_nacl_arm.h new file mode 100644 index 000000000..e5bbb211d --- /dev/null +++ b/src/runtime/signal_nacl_arm.h @@ -0,0 +1,28 @@ +// Copyright 2014 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#define SIG_REGS(ctxt) (((ExcContext*)(ctxt))->regs) + +#define SIG_R0(info, ctxt) (SIG_REGS(ctxt).r0) +#define SIG_R1(info, ctxt) (SIG_REGS(ctxt).r1) +#define SIG_R2(info, ctxt) (SIG_REGS(ctxt).r2) +#define SIG_R3(info, ctxt) (SIG_REGS(ctxt).r3) +#define SIG_R4(info, ctxt) (SIG_REGS(ctxt).r4) +#define SIG_R5(info, ctxt) (SIG_REGS(ctxt).r5) +#define SIG_R6(info, ctxt) (SIG_REGS(ctxt).r6) +#define SIG_R7(info, ctxt) (SIG_REGS(ctxt).r7) +#define SIG_R8(info, ctxt) (SIG_REGS(ctxt).r8) +#define SIG_R9(info, ctxt) (SIG_REGS(ctxt).r9) +#define SIG_R10(info, ctxt) (SIG_REGS(ctxt).r10) +#define SIG_FP(info, ctxt) (SIG_REGS(ctxt).r11) +#define SIG_IP(info, ctxt) (SIG_REGS(ctxt).r12) +#define SIG_SP(info, ctxt) (SIG_REGS(ctxt).sp) +#define SIG_LR(info, ctxt) (SIG_REGS(ctxt).lr) +#define SIG_PC(info, ctxt) (SIG_REGS(ctxt).pc) +#define SIG_CPSR(info, ctxt) (SIG_REGS(ctxt).cpsr) +#define SIG_FAULT(info, ctxt) (~0) +#define SIG_TRAP(info, ctxt) (~0) +#define SIG_ERROR(info, ctxt) (~0) +#define SIG_OLDMASK(info, ctxt) (~0) +#define SIG_CODE0(info, ctxt) (~0) diff --git a/src/runtime/signal_netbsd_386.h b/src/runtime/signal_netbsd_386.h new file mode 100644 index 000000000..d5a8a0c4b --- /dev/null +++ b/src/runtime/signal_netbsd_386.h @@ -0,0 +1,23 @@ +// Copyright 2013 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. 
+ +#define SIG_REGS(ctxt) (((UcontextT*)(ctxt))->uc_mcontext) + +#define SIG_EAX(info, ctxt) (SIG_REGS(ctxt).__gregs[REG_EAX]) +#define SIG_EBX(info, ctxt) (SIG_REGS(ctxt).__gregs[REG_EBX]) +#define SIG_ECX(info, ctxt) (SIG_REGS(ctxt).__gregs[REG_ECX]) +#define SIG_EDX(info, ctxt) (SIG_REGS(ctxt).__gregs[REG_EDX]) +#define SIG_EDI(info, ctxt) (SIG_REGS(ctxt).__gregs[REG_EDI]) +#define SIG_ESI(info, ctxt) (SIG_REGS(ctxt).__gregs[REG_ESI]) +#define SIG_EBP(info, ctxt) (SIG_REGS(ctxt).__gregs[REG_EBP]) +#define SIG_ESP(info, ctxt) (SIG_REGS(ctxt).__gregs[REG_UESP]) +#define SIG_EIP(info, ctxt) (SIG_REGS(ctxt).__gregs[REG_EIP]) +#define SIG_EFLAGS(info, ctxt) (SIG_REGS(ctxt).__gregs[REG_EFL]) + +#define SIG_CS(info, ctxt) (SIG_REGS(ctxt).__gregs[REG_CS]) +#define SIG_FS(info, ctxt) (SIG_REGS(ctxt).__gregs[REG_FS]) +#define SIG_GS(info, ctxt) (SIG_REGS(ctxt).__gregs[REG_GS]) + +#define SIG_CODE0(info, ctxt) ((info)->_code) +#define SIG_CODE1(info, ctxt) (*(uintptr*)&(info)->_reason[0]) diff --git a/src/runtime/signal_netbsd_amd64.h b/src/runtime/signal_netbsd_amd64.h new file mode 100644 index 000000000..7ec4cd98c --- /dev/null +++ b/src/runtime/signal_netbsd_amd64.h @@ -0,0 +1,31 @@ +// Copyright 2013 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#define SIG_REGS(ctxt) (((UcontextT*)(ctxt))->uc_mcontext) + +#define SIG_RAX(info, ctxt) (SIG_REGS(ctxt).__gregs[REG_RAX]) +#define SIG_RBX(info, ctxt) (SIG_REGS(ctxt).__gregs[REG_RBX]) +#define SIG_RCX(info, ctxt) (SIG_REGS(ctxt).__gregs[REG_RCX]) +#define SIG_RDX(info, ctxt) (SIG_REGS(ctxt).__gregs[REG_RDX]) +#define SIG_RDI(info, ctxt) (SIG_REGS(ctxt).__gregs[REG_RDI]) +#define SIG_RSI(info, ctxt) (SIG_REGS(ctxt).__gregs[REG_RSI]) +#define SIG_RBP(info, ctxt) (SIG_REGS(ctxt).__gregs[REG_RBP]) +#define SIG_RSP(info, ctxt) (SIG_REGS(ctxt).__gregs[REG_RSP]) +#define SIG_R8(info, ctxt) (SIG_REGS(ctxt).__gregs[REG_R8]) +#define SIG_R9(info, ctxt) (SIG_REGS(ctxt).__gregs[REG_R9]) +#define SIG_R10(info, ctxt) (SIG_REGS(ctxt).__gregs[REG_R10]) +#define SIG_R11(info, ctxt) (SIG_REGS(ctxt).__gregs[REG_R11]) +#define SIG_R12(info, ctxt) (SIG_REGS(ctxt).__gregs[REG_R12]) +#define SIG_R13(info, ctxt) (SIG_REGS(ctxt).__gregs[REG_R13]) +#define SIG_R14(info, ctxt) (SIG_REGS(ctxt).__gregs[REG_R14]) +#define SIG_R15(info, ctxt) (SIG_REGS(ctxt).__gregs[REG_R15]) +#define SIG_RIP(info, ctxt) (SIG_REGS(ctxt).__gregs[REG_RIP]) +#define SIG_RFLAGS(info, ctxt) (SIG_REGS(ctxt).__gregs[REG_RFLAGS]) + +#define SIG_CS(info, ctxt) (SIG_REGS(ctxt).__gregs[REG_CS]) +#define SIG_FS(info, ctxt) (SIG_REGS(ctxt).__gregs[REG_FS]) +#define SIG_GS(info, ctxt) (SIG_REGS(ctxt).__gregs[REG_GS]) + +#define SIG_CODE0(info, ctxt) ((info)->_code) +#define SIG_CODE1(info, ctxt) (*(uintptr*)&(info)->_reason[0]) diff --git a/src/runtime/signal_netbsd_arm.h b/src/runtime/signal_netbsd_arm.h new file mode 100644 index 000000000..12f5827a6 --- /dev/null +++ b/src/runtime/signal_netbsd_arm.h @@ -0,0 +1,30 @@ +// Copyright 2013 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. 
+ +#define SIG_REGS(ctxt) (((UcontextT*)(ctxt))->uc_mcontext) + +#define SIG_R0(info, ctxt) (SIG_REGS(ctxt).__gregs[REG_R0]) +#define SIG_R1(info, ctxt) (SIG_REGS(ctxt).__gregs[REG_R1]) +#define SIG_R2(info, ctxt) (SIG_REGS(ctxt).__gregs[REG_R2]) +#define SIG_R3(info, ctxt) (SIG_REGS(ctxt).__gregs[REG_R3]) +#define SIG_R4(info, ctxt) (SIG_REGS(ctxt).__gregs[REG_R4]) +#define SIG_R5(info, ctxt) (SIG_REGS(ctxt).__gregs[REG_R5]) +#define SIG_R6(info, ctxt) (SIG_REGS(ctxt).__gregs[REG_R6]) +#define SIG_R7(info, ctxt) (SIG_REGS(ctxt).__gregs[REG_R7]) +#define SIG_R8(info, ctxt) (SIG_REGS(ctxt).__gregs[REG_R8]) +#define SIG_R9(info, ctxt) (SIG_REGS(ctxt).__gregs[REG_R9]) +#define SIG_R10(info, ctxt) (SIG_REGS(ctxt).__gregs[REG_R10]) +#define SIG_FP(info, ctxt) (SIG_REGS(ctxt).__gregs[REG_R11]) +#define SIG_IP(info, ctxt) (SIG_REGS(ctxt).__gregs[REG_R12]) +#define SIG_SP(info, ctxt) (SIG_REGS(ctxt).__gregs[REG_R13]) +#define SIG_LR(info, ctxt) (SIG_REGS(ctxt).__gregs[REG_R14]) +#define SIG_PC(info, ctxt) (SIG_REGS(ctxt).__gregs[REG_R15]) +#define SIG_CPSR(info, ctxt) (SIG_REGS(ctxt).__gregs[REG_CPSR]) +#define SIG_FAULT(info, ctxt) (*(uintptr*)&(info)->_reason[0]) +#define SIG_TRAP(info, ctxt) (0) +#define SIG_ERROR(info, ctxt) (0) +#define SIG_OLDMASK(info, ctxt) (0) + +#define SIG_CODE0(info, ctxt) ((info)->_code) +#define SIG_CODE1(info, ctxt) (*(uintptr*)&(info)->_reason[0]) diff --git a/src/runtime/signal_openbsd_386.h b/src/runtime/signal_openbsd_386.h new file mode 100644 index 000000000..6742db8d4 --- /dev/null +++ b/src/runtime/signal_openbsd_386.h @@ -0,0 +1,23 @@ +// Copyright 2013 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#define SIG_REGS(ctxt) (*(Sigcontext*)(ctxt)) + +#define SIG_EAX(info, ctxt) (SIG_REGS(ctxt).sc_eax) +#define SIG_EBX(info, ctxt) (SIG_REGS(ctxt).sc_ebx) +#define SIG_ECX(info, ctxt) (SIG_REGS(ctxt).sc_ecx) +#define SIG_EDX(info, ctxt) (SIG_REGS(ctxt).sc_edx) +#define SIG_EDI(info, ctxt) (SIG_REGS(ctxt).sc_edi) +#define SIG_ESI(info, ctxt) (SIG_REGS(ctxt).sc_esi) +#define SIG_EBP(info, ctxt) (SIG_REGS(ctxt).sc_ebp) +#define SIG_ESP(info, ctxt) (SIG_REGS(ctxt).sc_esp) +#define SIG_EIP(info, ctxt) (SIG_REGS(ctxt).sc_eip) +#define SIG_EFLAGS(info, ctxt) (SIG_REGS(ctxt).sc_eflags) + +#define SIG_CS(info, ctxt) (SIG_REGS(ctxt).sc_cs) +#define SIG_FS(info, ctxt) (SIG_REGS(ctxt).sc_fs) +#define SIG_GS(info, ctxt) (SIG_REGS(ctxt).sc_gs) + +#define SIG_CODE0(info, ctxt) ((info)->si_code) +#define SIG_CODE1(info, ctxt) (*(uintptr*)((byte*)info + 12)) diff --git a/src/runtime/signal_openbsd_amd64.h b/src/runtime/signal_openbsd_amd64.h new file mode 100644 index 000000000..b46a5dfa6 --- /dev/null +++ b/src/runtime/signal_openbsd_amd64.h @@ -0,0 +1,31 @@ +// Copyright 2013 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. 
+ +#define SIG_REGS(ctxt) (*(Sigcontext*)(ctxt)) + +#define SIG_RAX(info, ctxt) (SIG_REGS(ctxt).sc_rax) +#define SIG_RBX(info, ctxt) (SIG_REGS(ctxt).sc_rbx) +#define SIG_RCX(info, ctxt) (SIG_REGS(ctxt).sc_rcx) +#define SIG_RDX(info, ctxt) (SIG_REGS(ctxt).sc_rdx) +#define SIG_RDI(info, ctxt) (SIG_REGS(ctxt).sc_rdi) +#define SIG_RSI(info, ctxt) (SIG_REGS(ctxt).sc_rsi) +#define SIG_RBP(info, ctxt) (SIG_REGS(ctxt).sc_rbp) +#define SIG_RSP(info, ctxt) (SIG_REGS(ctxt).sc_rsp) +#define SIG_R8(info, ctxt) (SIG_REGS(ctxt).sc_r8) +#define SIG_R9(info, ctxt) (SIG_REGS(ctxt).sc_r9) +#define SIG_R10(info, ctxt) (SIG_REGS(ctxt).sc_r10) +#define SIG_R11(info, ctxt) (SIG_REGS(ctxt).sc_r11) +#define SIG_R12(info, ctxt) (SIG_REGS(ctxt).sc_r12) +#define SIG_R13(info, ctxt) (SIG_REGS(ctxt).sc_r13) +#define SIG_R14(info, ctxt) (SIG_REGS(ctxt).sc_r14) +#define SIG_R15(info, ctxt) (SIG_REGS(ctxt).sc_r15) +#define SIG_RIP(info, ctxt) (SIG_REGS(ctxt).sc_rip) +#define SIG_RFLAGS(info, ctxt) (SIG_REGS(ctxt).sc_rflags) + +#define SIG_CS(info, ctxt) (SIG_REGS(ctxt).sc_cs) +#define SIG_FS(info, ctxt) (SIG_REGS(ctxt).sc_fs) +#define SIG_GS(info, ctxt) (SIG_REGS(ctxt).sc_gs) + +#define SIG_CODE0(info, ctxt) ((info)->si_code) +#define SIG_CODE1(info, ctxt) (*(uintptr*)((byte*)(info) + 16)) diff --git a/src/runtime/signal_solaris_amd64.h b/src/runtime/signal_solaris_amd64.h new file mode 100644 index 000000000..c2e0a1549 --- /dev/null +++ b/src/runtime/signal_solaris_amd64.h @@ -0,0 +1,31 @@ +// Copyright 2014 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#define SIG_REGS(ctxt) (((Ucontext*)(ctxt))->uc_mcontext) + +#define SIG_RAX(info, ctxt) (SIG_REGS(ctxt).gregs[REG_RAX]) +#define SIG_RBX(info, ctxt) (SIG_REGS(ctxt).gregs[REG_RBX]) +#define SIG_RCX(info, ctxt) (SIG_REGS(ctxt).gregs[REG_RCX]) +#define SIG_RDX(info, ctxt) (SIG_REGS(ctxt).gregs[REG_RDX]) +#define SIG_RDI(info, ctxt) (SIG_REGS(ctxt).gregs[REG_RDI]) +#define SIG_RSI(info, ctxt) (SIG_REGS(ctxt).gregs[REG_RSI]) +#define SIG_RBP(info, ctxt) (SIG_REGS(ctxt).gregs[REG_RBP]) +#define SIG_RSP(info, ctxt) (SIG_REGS(ctxt).gregs[REG_RSP]) +#define SIG_R8(info, ctxt) (SIG_REGS(ctxt).gregs[REG_R8]) +#define SIG_R9(info, ctxt) (SIG_REGS(ctxt).gregs[REG_R9]) +#define SIG_R10(info, ctxt) (SIG_REGS(ctxt).gregs[REG_R10]) +#define SIG_R11(info, ctxt) (SIG_REGS(ctxt).gregs[REG_R11]) +#define SIG_R12(info, ctxt) (SIG_REGS(ctxt).gregs[REG_R12]) +#define SIG_R13(info, ctxt) (SIG_REGS(ctxt).gregs[REG_R13]) +#define SIG_R14(info, ctxt) (SIG_REGS(ctxt).gregs[REG_R14]) +#define SIG_R15(info, ctxt) (SIG_REGS(ctxt).gregs[REG_R15]) +#define SIG_RIP(info, ctxt) (SIG_REGS(ctxt).gregs[REG_RIP]) +#define SIG_RFLAGS(info, ctxt) (SIG_REGS(ctxt).gregs[REG_RFLAGS]) + +#define SIG_CS(info, ctxt) (SIG_REGS(ctxt).gregs[REG_CS]) +#define SIG_FS(info, ctxt) (SIG_REGS(ctxt).gregs[REG_FS]) +#define SIG_GS(info, ctxt) (SIG_REGS(ctxt).gregs[REG_GS]) + +#define SIG_CODE0(info, ctxt) ((info)->si_code) +#define SIG_CODE1(info, ctxt) (*(uintptr*)&(info)->__data[0]) diff --git a/src/runtime/signal_unix.c b/src/runtime/signal_unix.c new file mode 100644 index 000000000..0e33ece49 --- /dev/null +++ b/src/runtime/signal_unix.c @@ -0,0 +1,119 @@ +// Copyright 2012 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. 
+ +// +build darwin dragonfly freebsd linux netbsd openbsd solaris + +#include "runtime.h" +#include "defs_GOOS_GOARCH.h" +#include "os_GOOS.h" +#include "signal_unix.h" + +extern SigTab runtime·sigtab[]; + +void +runtime·initsig(void) +{ + int32 i; + SigTab *t; + + // First call: basic setup. + for(i = 0; i<NSIG; i++) { + t = &runtime·sigtab[i]; + if((t->flags == 0) || (t->flags & SigDefault)) + continue; + + // For some signals, we respect an inherited SIG_IGN handler + // rather than insist on installing our own default handler. + // Even these signals can be fetched using the os/signal package. + switch(i) { + case SIGHUP: + case SIGINT: + if(runtime·getsig(i) == SIG_IGN) { + t->flags = SigNotify | SigIgnored; + continue; + } + } + + t->flags |= SigHandling; + runtime·setsig(i, runtime·sighandler, true); + } +} + +void +runtime·sigenable(uint32 sig) +{ + SigTab *t; + + if(sig >= NSIG) + return; + + t = &runtime·sigtab[sig]; + if((t->flags & SigNotify) && !(t->flags & SigHandling)) { + t->flags |= SigHandling; + if(runtime·getsig(sig) == SIG_IGN) + t->flags |= SigIgnored; + runtime·setsig(sig, runtime·sighandler, true); + } +} + +void +runtime·sigdisable(uint32 sig) +{ + SigTab *t; + + if(sig >= NSIG) + return; + + t = &runtime·sigtab[sig]; + if((t->flags & SigNotify) && (t->flags & SigHandling)) { + t->flags &= ~SigHandling; + if(t->flags & SigIgnored) + runtime·setsig(sig, SIG_IGN, true); + else + runtime·setsig(sig, SIG_DFL, true); + } +} + +void +runtime·resetcpuprofiler(int32 hz) +{ + Itimerval it; + + runtime·memclr((byte*)&it, sizeof it); + if(hz == 0) { + runtime·setitimer(ITIMER_PROF, &it, nil); + } else { + it.it_interval.tv_sec = 0; + it.it_interval.tv_usec = 1000000 / hz; + it.it_value = it.it_interval; + runtime·setitimer(ITIMER_PROF, &it, nil); + } + g->m->profilehz = hz; +} + +void +runtime·sigpipe(void) +{ + runtime·setsig(SIGPIPE, SIG_DFL, false); + runtime·raise(SIGPIPE); +} + +void +runtime·crash(void) +{ +#ifdef GOOS_darwin + // OS X core dumps are linear dumps of the mapped memory, + // from the first virtual byte to the last, with zeros in the gaps. + // Because of the way we arrange the address space on 64-bit systems, + // this means the OS X core file will be >128 GB and even on a zippy + // workstation can take OS X well over an hour to write (uninterruptible). + // Save users from making that mistake. + if(sizeof(void*) == 8) + return; +#endif + + runtime·unblocksignals(); + runtime·setsig(SIGABRT, SIG_DFL, false); + runtime·raise(SIGABRT); +} diff --git a/src/runtime/signal_unix.go b/src/runtime/signal_unix.go new file mode 100644 index 000000000..ba77b6e7b --- /dev/null +++ b/src/runtime/signal_unix.go @@ -0,0 +1,13 @@ +// Copyright 2012 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// +build darwin dragonfly freebsd linux netbsd openbsd solaris + +package runtime + +func sigpipe() + +func os_sigpipe() { + onM(sigpipe) +} diff --git a/src/runtime/signal_unix.h b/src/runtime/signal_unix.h new file mode 100644 index 000000000..2d84a0186 --- /dev/null +++ b/src/runtime/signal_unix.h @@ -0,0 +1,14 @@ +// Copyright 2013 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. 
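One detail from signal_unix.c above worth calling out: resetcpuprofiler arms ITIMER_PROF with an interval of 1000000/hz microseconds, so the kernel delivers SIGPROF at the requested sampling rate. A stand-alone sketch of that arithmetic (assuming the usual 100 Hz rate requested by runtime/pprof; illustrative only):

package main

import "fmt"

func main() {
	hz := int32(100) // typical CPU-profiling rate requested by runtime/pprof
	usec := 1000000 / hz
	fmt.Printf("SIGPROF every %d microseconds (%d samples/sec)\n", usec, 1000000/usec)
	// prints: SIGPROF every 10000 microseconds (100 samples/sec)
}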
+ +#define SIG_DFL ((void*)0) +#define SIG_IGN ((void*)1) + +typedef void GoSighandler(int32, Siginfo*, void*, G*); +void runtime·setsig(int32, GoSighandler*, bool); +GoSighandler* runtime·getsig(int32); + +void runtime·sighandler(int32 sig, Siginfo *info, void *context, G *gp); +void runtime·raise(int32); + diff --git a/src/runtime/signals_android.h b/src/runtime/signals_android.h new file mode 100644 index 000000000..5140d8a18 --- /dev/null +++ b/src/runtime/signals_android.h @@ -0,0 +1 @@ +#include "signals_linux.h" diff --git a/src/runtime/signals_darwin.h b/src/runtime/signals_darwin.h new file mode 100644 index 000000000..8761e1bd9 --- /dev/null +++ b/src/runtime/signals_darwin.h @@ -0,0 +1,53 @@ +// Copyright 2009 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#include "textflag.h" + +#define N SigNotify +#define K SigKill +#define T SigThrow +#define P SigPanic +#define D SigDefault + +#pragma dataflag NOPTR +SigTab runtime·sigtab[] = { + /* 0 */ 0, "SIGNONE: no trap", + /* 1 */ N+K, "SIGHUP: terminal line hangup", + /* 2 */ N+K, "SIGINT: interrupt", + /* 3 */ N+T, "SIGQUIT: quit", + /* 4 */ T, "SIGILL: illegal instruction", + /* 5 */ T, "SIGTRAP: trace trap", + /* 6 */ N+T, "SIGABRT: abort", + /* 7 */ T, "SIGEMT: emulate instruction executed", + /* 8 */ P, "SIGFPE: floating-point exception", + /* 9 */ 0, "SIGKILL: kill", + /* 10 */ P, "SIGBUS: bus error", + /* 11 */ P, "SIGSEGV: segmentation violation", + /* 12 */ T, "SIGSYS: bad system call", + /* 13 */ N, "SIGPIPE: write to broken pipe", + /* 14 */ N, "SIGALRM: alarm clock", + /* 15 */ N+K, "SIGTERM: termination", + /* 16 */ N, "SIGURG: urgent condition on socket", + /* 17 */ 0, "SIGSTOP: stop", + /* 18 */ N+D, "SIGTSTP: keyboard stop", + /* 19 */ 0, "SIGCONT: continue after stop", + /* 20 */ N, "SIGCHLD: child status has changed", + /* 21 */ N+D, "SIGTTIN: background read from tty", + /* 22 */ N+D, "SIGTTOU: background write to tty", + /* 23 */ N, "SIGIO: i/o now possible", + /* 24 */ N, "SIGXCPU: cpu limit exceeded", + /* 25 */ N, "SIGXFSZ: file size limit exceeded", + /* 26 */ N, "SIGVTALRM: virtual alarm clock", + /* 27 */ N, "SIGPROF: profiling alarm clock", + /* 28 */ N, "SIGWINCH: window size change", + /* 29 */ N, "SIGINFO: status request from keyboard", + /* 30 */ N, "SIGUSR1: user-defined signal 1", + /* 31 */ N, "SIGUSR2: user-defined signal 2", +}; + +#undef N +#undef K +#undef T +#undef P +#undef D diff --git a/src/runtime/signals_dragonfly.h b/src/runtime/signals_dragonfly.h new file mode 100644 index 000000000..07343a766 --- /dev/null +++ b/src/runtime/signals_dragonfly.h @@ -0,0 +1,54 @@ +// Copyright 2009 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. 
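The sigtab tables in these signals_*.h files feed initsig above. Broadly: N (SigNotify) lets the signal reach a channel registered with os/signal, K (SigKill) means an unhandled occurrence terminates the program, T (SigThrow) makes the runtime dump state and exit, P (SigPanic) is converted into a Go panic, and D (SigDefault) leaves the OS default behavior in place. SIGINT, for example, is N+K: fatal by default, but delivered to the program once someone is listening. A minimal sketch using only the public os/signal API:

package main

import (
	"fmt"
	"os"
	"os/signal"
	"syscall"
)

func main() {
	c := make(chan os.Signal, 1)
	signal.Notify(c, syscall.SIGINT) // SIGINT is N+K in sigtab: notify if asked, kill otherwise
	fmt.Println("waiting for Ctrl-C...")
	fmt.Println("got", <-c) // delivered here instead of killing the process
}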
+ +#include "textflag.h" + +#define N SigNotify +#define K SigKill +#define T SigThrow +#define P SigPanic +#define D SigDefault + +#pragma dataflag NOPTR +SigTab runtime·sigtab[] = { + /* 0 */ 0, "SIGNONE: no trap", + /* 1 */ N+K, "SIGHUP: terminal line hangup", + /* 2 */ N+K, "SIGINT: interrupt", + /* 3 */ N+T, "SIGQUIT: quit", + /* 4 */ T, "SIGILL: illegal instruction", + /* 5 */ T, "SIGTRAP: trace trap", + /* 6 */ N+T, "SIGABRT: abort", + /* 7 */ T, "SIGEMT: emulate instruction executed", + /* 8 */ P, "SIGFPE: floating-point exception", + /* 9 */ 0, "SIGKILL: kill", + /* 10 */ P, "SIGBUS: bus error", + /* 11 */ P, "SIGSEGV: segmentation violation", + /* 12 */ T, "SIGSYS: bad system call", + /* 13 */ N, "SIGPIPE: write to broken pipe", + /* 14 */ N, "SIGALRM: alarm clock", + /* 15 */ N+K, "SIGTERM: termination", + /* 16 */ N, "SIGURG: urgent condition on socket", + /* 17 */ 0, "SIGSTOP: stop", + /* 18 */ N+D, "SIGTSTP: keyboard stop", + /* 19 */ 0, "SIGCONT: continue after stop", + /* 20 */ N, "SIGCHLD: child status has changed", + /* 21 */ N+D, "SIGTTIN: background read from tty", + /* 22 */ N+D, "SIGTTOU: background write to tty", + /* 23 */ N, "SIGIO: i/o now possible", + /* 24 */ N, "SIGXCPU: cpu limit exceeded", + /* 25 */ N, "SIGXFSZ: file size limit exceeded", + /* 26 */ N, "SIGVTALRM: virtual alarm clock", + /* 27 */ N, "SIGPROF: profiling alarm clock", + /* 28 */ N, "SIGWINCH: window size change", + /* 29 */ N, "SIGINFO: status request from keyboard", + /* 30 */ N, "SIGUSR1: user-defined signal 1", + /* 31 */ N, "SIGUSR2: user-defined signal 2", + /* 32 */ N, "SIGTHR: reserved", +}; + +#undef N +#undef K +#undef T +#undef P +#undef D diff --git a/src/runtime/signals_freebsd.h b/src/runtime/signals_freebsd.h new file mode 100644 index 000000000..39e0a947e --- /dev/null +++ b/src/runtime/signals_freebsd.h @@ -0,0 +1,54 @@ +// Copyright 2009 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. 
+ +#include "textflag.h" + +#define N SigNotify +#define K SigKill +#define T SigThrow +#define P SigPanic +#define D SigDefault + +#pragma dataflag NOPTR +SigTab runtime·sigtab[] = { + /* 0 */ 0, "SIGNONE: no trap", + /* 1 */ N+K, "SIGHUP: terminal line hangup", + /* 2 */ N+K, "SIGINT: interrupt", + /* 3 */ N+T, "SIGQUIT: quit", + /* 4 */ T, "SIGILL: illegal instruction", + /* 5 */ T, "SIGTRAP: trace trap", + /* 6 */ N+T, "SIGABRT: abort", + /* 7 */ T, "SIGEMT: emulate instruction executed", + /* 8 */ P, "SIGFPE: floating-point exception", + /* 9 */ 0, "SIGKILL: kill", + /* 10 */ P, "SIGBUS: bus error", + /* 11 */ P, "SIGSEGV: segmentation violation", + /* 12 */ N, "SIGSYS: bad system call", + /* 13 */ N, "SIGPIPE: write to broken pipe", + /* 14 */ N, "SIGALRM: alarm clock", + /* 15 */ N+K, "SIGTERM: termination", + /* 16 */ N, "SIGURG: urgent condition on socket", + /* 17 */ 0, "SIGSTOP: stop", + /* 18 */ N+D, "SIGTSTP: keyboard stop", + /* 19 */ 0, "SIGCONT: continue after stop", + /* 20 */ N, "SIGCHLD: child status has changed", + /* 21 */ N+D, "SIGTTIN: background read from tty", + /* 22 */ N+D, "SIGTTOU: background write to tty", + /* 23 */ N, "SIGIO: i/o now possible", + /* 24 */ N, "SIGXCPU: cpu limit exceeded", + /* 25 */ N, "SIGXFSZ: file size limit exceeded", + /* 26 */ N, "SIGVTALRM: virtual alarm clock", + /* 27 */ N, "SIGPROF: profiling alarm clock", + /* 28 */ N, "SIGWINCH: window size change", + /* 29 */ N, "SIGINFO: status request from keyboard", + /* 30 */ N, "SIGUSR1: user-defined signal 1", + /* 31 */ N, "SIGUSR2: user-defined signal 2", + /* 32 */ N, "SIGTHR: reserved", +}; + +#undef N +#undef K +#undef T +#undef P +#undef D diff --git a/src/runtime/signals_linux.h b/src/runtime/signals_linux.h new file mode 100644 index 000000000..374107609 --- /dev/null +++ b/src/runtime/signals_linux.h @@ -0,0 +1,86 @@ +// Copyright 2009 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. 
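In the Linux table below, SIGSEGV and SIGBUS carry the P (SigPanic) flag; together with the sigpanic routine further down in this patch, that is what turns a small-address fault into an ordinary, recoverable runtime panic rather than a crash. A minimal illustration in plain Go:

package main

import "fmt"

func main() {
	defer func() {
		// The fault below surfaces as a runtime error that recover can catch.
		fmt.Println("recovered:", recover())
	}()
	var p *int
	fmt.Println(*p) // nil dereference: typically SIGSEGV at address 0, well under 0x1000
}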
+ +#include "textflag.h" + +#define N SigNotify +#define K SigKill +#define T SigThrow +#define P SigPanic +#define D SigDefault + +#pragma dataflag NOPTR +SigTab runtime·sigtab[] = { + /* 0 */ 0, "SIGNONE: no trap", + /* 1 */ N+K, "SIGHUP: terminal line hangup", + /* 2 */ N+K, "SIGINT: interrupt", + /* 3 */ N+T, "SIGQUIT: quit", + /* 4 */ T, "SIGILL: illegal instruction", + /* 5 */ T, "SIGTRAP: trace trap", + /* 6 */ N+T, "SIGABRT: abort", + /* 7 */ P, "SIGBUS: bus error", + /* 8 */ P, "SIGFPE: floating-point exception", + /* 9 */ 0, "SIGKILL: kill", + /* 10 */ N, "SIGUSR1: user-defined signal 1", + /* 11 */ P, "SIGSEGV: segmentation violation", + /* 12 */ N, "SIGUSR2: user-defined signal 2", + /* 13 */ N, "SIGPIPE: write to broken pipe", + /* 14 */ N, "SIGALRM: alarm clock", + /* 15 */ N+K, "SIGTERM: termination", + /* 16 */ T, "SIGSTKFLT: stack fault", + /* 17 */ N, "SIGCHLD: child status has changed", + /* 18 */ 0, "SIGCONT: continue", + /* 19 */ 0, "SIGSTOP: stop, unblockable", + /* 20 */ N+D, "SIGTSTP: keyboard stop", + /* 21 */ N+D, "SIGTTIN: background read from tty", + /* 22 */ N+D, "SIGTTOU: background write to tty", + /* 23 */ N, "SIGURG: urgent condition on socket", + /* 24 */ N, "SIGXCPU: cpu limit exceeded", + /* 25 */ N, "SIGXFSZ: file size limit exceeded", + /* 26 */ N, "SIGVTALRM: virtual alarm clock", + /* 27 */ N, "SIGPROF: profiling alarm clock", + /* 28 */ N, "SIGWINCH: window size change", + /* 29 */ N, "SIGIO: i/o now possible", + /* 30 */ N, "SIGPWR: power failure restart", + /* 31 */ N, "SIGSYS: bad system call", + /* 32 */ 0, "signal 32", /* SIGCANCEL; see issue 6997 */ + /* 33 */ 0, "signal 33", /* SIGSETXID; see issue 3871 */ + /* 34 */ N, "signal 34", + /* 35 */ N, "signal 35", + /* 36 */ N, "signal 36", + /* 37 */ N, "signal 37", + /* 38 */ N, "signal 38", + /* 39 */ N, "signal 39", + /* 40 */ N, "signal 40", + /* 41 */ N, "signal 41", + /* 42 */ N, "signal 42", + /* 43 */ N, "signal 43", + /* 44 */ N, "signal 44", + /* 45 */ N, "signal 45", + /* 46 */ N, "signal 46", + /* 47 */ N, "signal 47", + /* 48 */ N, "signal 48", + /* 49 */ N, "signal 49", + /* 50 */ N, "signal 50", + /* 51 */ N, "signal 51", + /* 52 */ N, "signal 52", + /* 53 */ N, "signal 53", + /* 54 */ N, "signal 54", + /* 55 */ N, "signal 55", + /* 56 */ N, "signal 56", + /* 57 */ N, "signal 57", + /* 58 */ N, "signal 58", + /* 59 */ N, "signal 59", + /* 60 */ N, "signal 60", + /* 61 */ N, "signal 61", + /* 62 */ N, "signal 62", + /* 63 */ N, "signal 63", + /* 64 */ N, "signal 64", +}; + +#undef N +#undef K +#undef T +#undef P +#undef D diff --git a/src/runtime/signals_nacl.h b/src/runtime/signals_nacl.h new file mode 100644 index 000000000..8761e1bd9 --- /dev/null +++ b/src/runtime/signals_nacl.h @@ -0,0 +1,53 @@ +// Copyright 2009 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. 
+ +#include "textflag.h" + +#define N SigNotify +#define K SigKill +#define T SigThrow +#define P SigPanic +#define D SigDefault + +#pragma dataflag NOPTR +SigTab runtime·sigtab[] = { + /* 0 */ 0, "SIGNONE: no trap", + /* 1 */ N+K, "SIGHUP: terminal line hangup", + /* 2 */ N+K, "SIGINT: interrupt", + /* 3 */ N+T, "SIGQUIT: quit", + /* 4 */ T, "SIGILL: illegal instruction", + /* 5 */ T, "SIGTRAP: trace trap", + /* 6 */ N+T, "SIGABRT: abort", + /* 7 */ T, "SIGEMT: emulate instruction executed", + /* 8 */ P, "SIGFPE: floating-point exception", + /* 9 */ 0, "SIGKILL: kill", + /* 10 */ P, "SIGBUS: bus error", + /* 11 */ P, "SIGSEGV: segmentation violation", + /* 12 */ T, "SIGSYS: bad system call", + /* 13 */ N, "SIGPIPE: write to broken pipe", + /* 14 */ N, "SIGALRM: alarm clock", + /* 15 */ N+K, "SIGTERM: termination", + /* 16 */ N, "SIGURG: urgent condition on socket", + /* 17 */ 0, "SIGSTOP: stop", + /* 18 */ N+D, "SIGTSTP: keyboard stop", + /* 19 */ 0, "SIGCONT: continue after stop", + /* 20 */ N, "SIGCHLD: child status has changed", + /* 21 */ N+D, "SIGTTIN: background read from tty", + /* 22 */ N+D, "SIGTTOU: background write to tty", + /* 23 */ N, "SIGIO: i/o now possible", + /* 24 */ N, "SIGXCPU: cpu limit exceeded", + /* 25 */ N, "SIGXFSZ: file size limit exceeded", + /* 26 */ N, "SIGVTALRM: virtual alarm clock", + /* 27 */ N, "SIGPROF: profiling alarm clock", + /* 28 */ N, "SIGWINCH: window size change", + /* 29 */ N, "SIGINFO: status request from keyboard", + /* 30 */ N, "SIGUSR1: user-defined signal 1", + /* 31 */ N, "SIGUSR2: user-defined signal 2", +}; + +#undef N +#undef K +#undef T +#undef P +#undef D diff --git a/src/runtime/signals_netbsd.h b/src/runtime/signals_netbsd.h new file mode 100644 index 000000000..950a2fe62 --- /dev/null +++ b/src/runtime/signals_netbsd.h @@ -0,0 +1,54 @@ +// Copyright 2009 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. 
+ +#include "textflag.h" + +#define N SigNotify +#define K SigKill +#define T SigThrow +#define P SigPanic +#define D SigDefault + +#pragma dataflag NOPTR +SigTab runtime·sigtab[] = { + /* 0 */ 0, "SIGNONE: no trap", + /* 1 */ N+K, "SIGHUP: terminal line hangup", + /* 2 */ N+K, "SIGINT: interrupt", + /* 3 */ N+T, "SIGQUIT: quit", + /* 4 */ T, "SIGILL: illegal instruction", + /* 5 */ T, "SIGTRAP: trace trap", + /* 6 */ N+T, "SIGABRT: abort", + /* 7 */ T, "SIGEMT: emulate instruction executed", + /* 8 */ P, "SIGFPE: floating-point exception", + /* 9 */ 0, "SIGKILL: kill", + /* 10 */ P, "SIGBUS: bus error", + /* 11 */ P, "SIGSEGV: segmentation violation", + /* 12 */ T, "SIGSYS: bad system call", + /* 13 */ N, "SIGPIPE: write to broken pipe", + /* 14 */ N, "SIGALRM: alarm clock", + /* 15 */ N+K, "SIGTERM: termination", + /* 16 */ N, "SIGURG: urgent condition on socket", + /* 17 */ 0, "SIGSTOP: stop", + /* 18 */ N+D, "SIGTSTP: keyboard stop", + /* 19 */ 0, "SIGCONT: continue after stop", + /* 20 */ N, "SIGCHLD: child status has changed", + /* 21 */ N+D, "SIGTTIN: background read from tty", + /* 22 */ N+D, "SIGTTOU: background write to tty", + /* 23 */ N, "SIGIO: i/o now possible", + /* 24 */ N, "SIGXCPU: cpu limit exceeded", + /* 25 */ N, "SIGXFSZ: file size limit exceeded", + /* 26 */ N, "SIGVTALRM: virtual alarm clock", + /* 27 */ N, "SIGPROF: profiling alarm clock", + /* 28 */ N, "SIGWINCH: window size change", + /* 29 */ N, "SIGINFO: status request from keyboard", + /* 30 */ N, "SIGUSR1: user-defined signal 1", + /* 31 */ N, "SIGUSR2: user-defined signal 2", + /* 32 */ N, "SIGTHR: reserved", +}; + +#undef N +#undef K +#undef T +#undef P +#undef D diff --git a/src/runtime/signals_openbsd.h b/src/runtime/signals_openbsd.h new file mode 100644 index 000000000..950a2fe62 --- /dev/null +++ b/src/runtime/signals_openbsd.h @@ -0,0 +1,54 @@ +// Copyright 2009 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. 
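SIGQUIT is marked N+T throughout these tables: if nothing has claimed it via os/signal, the runtime treats it as SigThrow, dumping every goroutine's stack before exiting, which is why Ctrl-\ is a convenient way to inspect a hung program. A tiny Unix-only demonstration:

package main

import "syscall"

func main() {
	// With no os/signal listener registered, the N+T entry for SIGQUIT makes
	// the runtime print all goroutine stacks and exit with a nonzero status.
	syscall.Kill(syscall.Getpid(), syscall.SIGQUIT)
	select {} // park until the signal arrives
}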
+ +#include "textflag.h" + +#define N SigNotify +#define K SigKill +#define T SigThrow +#define P SigPanic +#define D SigDefault + +#pragma dataflag NOPTR +SigTab runtime·sigtab[] = { + /* 0 */ 0, "SIGNONE: no trap", + /* 1 */ N+K, "SIGHUP: terminal line hangup", + /* 2 */ N+K, "SIGINT: interrupt", + /* 3 */ N+T, "SIGQUIT: quit", + /* 4 */ T, "SIGILL: illegal instruction", + /* 5 */ T, "SIGTRAP: trace trap", + /* 6 */ N+T, "SIGABRT: abort", + /* 7 */ T, "SIGEMT: emulate instruction executed", + /* 8 */ P, "SIGFPE: floating-point exception", + /* 9 */ 0, "SIGKILL: kill", + /* 10 */ P, "SIGBUS: bus error", + /* 11 */ P, "SIGSEGV: segmentation violation", + /* 12 */ T, "SIGSYS: bad system call", + /* 13 */ N, "SIGPIPE: write to broken pipe", + /* 14 */ N, "SIGALRM: alarm clock", + /* 15 */ N+K, "SIGTERM: termination", + /* 16 */ N, "SIGURG: urgent condition on socket", + /* 17 */ 0, "SIGSTOP: stop", + /* 18 */ N+D, "SIGTSTP: keyboard stop", + /* 19 */ 0, "SIGCONT: continue after stop", + /* 20 */ N, "SIGCHLD: child status has changed", + /* 21 */ N+D, "SIGTTIN: background read from tty", + /* 22 */ N+D, "SIGTTOU: background write to tty", + /* 23 */ N, "SIGIO: i/o now possible", + /* 24 */ N, "SIGXCPU: cpu limit exceeded", + /* 25 */ N, "SIGXFSZ: file size limit exceeded", + /* 26 */ N, "SIGVTALRM: virtual alarm clock", + /* 27 */ N, "SIGPROF: profiling alarm clock", + /* 28 */ N, "SIGWINCH: window size change", + /* 29 */ N, "SIGINFO: status request from keyboard", + /* 30 */ N, "SIGUSR1: user-defined signal 1", + /* 31 */ N, "SIGUSR2: user-defined signal 2", + /* 32 */ N, "SIGTHR: reserved", +}; + +#undef N +#undef K +#undef T +#undef P +#undef D diff --git a/src/runtime/signals_plan9.h b/src/runtime/signals_plan9.h new file mode 100644 index 000000000..4ee8e542c --- /dev/null +++ b/src/runtime/signals_plan9.h @@ -0,0 +1,63 @@ +// Copyright 2011 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#include "textflag.h" + +#define N SigNotify +#define K SigKill +#define T SigThrow +#define P SigPanic +#define E SigGoExit + +// Incoming notes are compared against this table using strncmp, so the +// order matters: longer patterns must appear before their prefixes. +// There are #defined SIG constants in os_plan9.h for the table index of +// some of these. +// +// If you add entries to this table, you must respect the prefix ordering +// and also update the constant values is os_plan9.h. + +#pragma dataflag NOPTR +SigTab runtime·sigtab[] = { + // Traps that we cannot be recovered. + T, "sys: trap: debug exception", + T, "sys: trap: invalid opcode", + + // We can recover from some memory errors in runtime·sigpanic. + P, "sys: trap: fault read addr", // SIGRFAULT + P, "sys: trap: fault write addr", // SIGWFAULT + + // We can also recover from math errors. + P, "sys: trap: divide error", // SIGINTDIV + P, "sys: fp:", // SIGFLOAT + + // All other traps are normally handled as if they were marked SigThrow. + // We mark them SigPanic here so that debug.SetPanicOnFault will work. + P, "sys: trap:", // SIGTRAP + + // Writes to a closed pipe can be handled if desired, otherwise they're ignored. + N, "sys: write on closed pipe", + + // Other system notes are more serious and cannot be recovered. + T, "sys:", + + // Issued to all other procs when calling runtime·exit. + E, "go: exit ", + + // Kill is sent by external programs to cause an exit. 
+ K, "kill", + + // Interrupts can be handled if desired, otherwise they cause an exit. + N+K, "interrupt", + N+K, "hangup", + + // Alarms can be handled if desired, otherwise they're ignored. + N, "alarm", +}; + +#undef N +#undef K +#undef T +#undef P +#undef E diff --git a/src/runtime/signals_solaris.h b/src/runtime/signals_solaris.h new file mode 100644 index 000000000..1f0a65ea6 --- /dev/null +++ b/src/runtime/signals_solaris.h @@ -0,0 +1,97 @@ +// Copyright 2014 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#include "textflag.h" + +#define N SigNotify +#define K SigKill +#define T SigThrow +#define P SigPanic +#define D SigDefault + +#pragma dataflag NOPTR +SigTab runtime·sigtab[] = { + /* 0 */ 0, "SIGNONE: no trap", + /* 1 */ N+K, "SIGHUP: hangup", + /* 2 */ N+K, "SIGINT: interrupt (rubout)", + /* 3 */ N+T, "SIGQUIT: quit (ASCII FS)", + /* 4 */ T, "SIGILL: illegal instruction (not reset when caught)", + /* 5 */ T, "SIGTRAP: trace trap (not reset when caught)", + /* 6 */ N+T, "SIGABRT: used by abort, replace SIGIOT in the future", + /* 7 */ T, "SIGEMT: EMT instruction", + /* 8 */ P, "SIGFPE: floating point exception", + /* 9 */ 0, "SIGKILL: kill (cannot be caught or ignored)", + /* 10 */ P, "SIGBUS: bus error", + /* 11 */ P, "SIGSEGV: segmentation violation", + /* 12 */ T, "SIGSYS: bad argument to system call", + /* 13 */ N, "SIGPIPE: write on a pipe with no one to read it", + /* 14 */ N, "SIGALRM: alarm clock", + /* 15 */ N+K, "SIGTERM: software termination signal from kill", + /* 16 */ N, "SIGUSR1: user defined signal 1", + /* 17 */ N, "SIGUSR2: user defined signal 2", + /* 18 */ N, "SIGCLD: child status change", + /* 18 */ N, "SIGCHLD: child status change alias (POSIX)", + /* 19 */ N, "SIGPWR: power-fail restart", + /* 20 */ N, "SIGWINCH: window size change", + /* 21 */ N, "SIGURG: urgent socket condition", + /* 22 */ N, "SIGPOLL: pollable event occured", + /* 23 */ N+D, "SIGSTOP: stop (cannot be caught or ignored)", + /* 24 */ 0, "SIGTSTP: user stop requested from tty", + /* 25 */ 0, "SIGCONT: stopped process has been continued", + /* 26 */ N+D, "SIGTTIN: background tty read attempted", + /* 27 */ N+D, "SIGTTOU: background tty write attempted", + /* 28 */ N, "SIGVTALRM: virtual timer expired", + /* 29 */ N, "SIGPROF: profiling timer expired", + /* 30 */ N, "SIGXCPU: exceeded cpu limit", + /* 31 */ N, "SIGXFSZ: exceeded file size limit", + /* 32 */ N, "SIGWAITING: reserved signal no longer used by", + /* 33 */ N, "SIGLWP: reserved signal no longer used by", + /* 34 */ N, "SIGFREEZE: special signal used by CPR", + /* 35 */ N, "SIGTHAW: special signal used by CPR", + /* 36 */ 0, "SIGCANCEL: reserved signal for thread cancellation", + /* 37 */ N, "SIGLOST: resource lost (eg, record-lock lost)", + /* 38 */ N, "SIGXRES: resource control exceeded", + /* 39 */ N, "SIGJVM1: reserved signal for Java Virtual Machine", + /* 40 */ N, "SIGJVM2: reserved signal for Java Virtual Machine", + + /* TODO(aram): what should be do about these signals? D or N? is this set static? 
*/ + /* 41 */ N, "real time signal", + /* 42 */ N, "real time signal", + /* 43 */ N, "real time signal", + /* 44 */ N, "real time signal", + /* 45 */ N, "real time signal", + /* 46 */ N, "real time signal", + /* 47 */ N, "real time signal", + /* 48 */ N, "real time signal", + /* 49 */ N, "real time signal", + /* 50 */ N, "real time signal", + /* 51 */ N, "real time signal", + /* 52 */ N, "real time signal", + /* 53 */ N, "real time signal", + /* 54 */ N, "real time signal", + /* 55 */ N, "real time signal", + /* 56 */ N, "real time signal", + /* 57 */ N, "real time signal", + /* 58 */ N, "real time signal", + /* 59 */ N, "real time signal", + /* 60 */ N, "real time signal", + /* 61 */ N, "real time signal", + /* 62 */ N, "real time signal", + /* 63 */ N, "real time signal", + /* 64 */ N, "real time signal", + /* 65 */ N, "real time signal", + /* 66 */ N, "real time signal", + /* 67 */ N, "real time signal", + /* 68 */ N, "real time signal", + /* 69 */ N, "real time signal", + /* 70 */ N, "real time signal", + /* 71 */ N, "real time signal", + /* 72 */ N, "real time signal", +}; + +#undef N +#undef K +#undef T +#undef P +#undef D diff --git a/src/runtime/signals_windows.h b/src/runtime/signals_windows.h new file mode 100644 index 000000000..6943714b0 --- /dev/null +++ b/src/runtime/signals_windows.h @@ -0,0 +1,3 @@ +// Copyright 2011 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. diff --git a/src/runtime/sigpanic_unix.go b/src/runtime/sigpanic_unix.go new file mode 100644 index 000000000..68079859b --- /dev/null +++ b/src/runtime/sigpanic_unix.go @@ -0,0 +1,40 @@ +// Copyright 2014 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// +build darwin dragonfly freebsd linux netbsd openbsd solaris + +package runtime + +func signame(int32) *byte + +func sigpanic() { + g := getg() + if !canpanic(g) { + gothrow("unexpected signal during runtime execution") + } + + switch g.sig { + case _SIGBUS: + if g.sigcode0 == _BUS_ADRERR && g.sigcode1 < 0x1000 || g.paniconfault { + panicmem() + } + print("unexpected fault address ", hex(g.sigcode1), "\n") + gothrow("fault") + case _SIGSEGV: + if (g.sigcode0 == 0 || g.sigcode0 == _SEGV_MAPERR || g.sigcode0 == _SEGV_ACCERR) && g.sigcode1 < 0x1000 || g.paniconfault { + panicmem() + } + print("unexpected fault address ", hex(g.sigcode1), "\n") + gothrow("fault") + case _SIGFPE: + switch g.sigcode0 { + case _FPE_INTDIV: + panicdivide() + case _FPE_INTOVF: + panicoverflow() + } + panicfloat() + } + panic(errorString(gostringnocopy(signame(g.sig)))) +} diff --git a/src/runtime/sigqueue.go b/src/runtime/sigqueue.go new file mode 100644 index 000000000..2d9c24d2d --- /dev/null +++ b/src/runtime/sigqueue.go @@ -0,0 +1,173 @@ +// Copyright 2009 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// This file implements runtime support for signal handling. +// +// Most synchronization primitives are not available from +// the signal handler (it cannot block, allocate memory, or use locks) +// so the handler communicates with a processing goroutine +// via struct sig, below. +// +// sigsend is called by the signal handler to queue a new signal. +// signal_recv is called by the Go program to receive a newly queued signal. 
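// (In practice the consumer is the os/signal package: its background loop
// calls signal_recv and fans each delivered signal out to the channels
// registered with signal.Notify.)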
+// Synchronization between sigsend and signal_recv is based on the sig.state +// variable. It can be in 3 states: sigIdle, sigReceiving and sigSending. +// sigReceiving means that signal_recv is blocked on sig.Note and there are no +// new pending signals. +// sigSending means that sig.mask *may* contain new pending signals, +// signal_recv can't be blocked in this state. +// sigIdle means that there are no new pending signals and signal_recv is not blocked. +// Transitions between states are done atomically with CAS. +// When signal_recv is unblocked, it resets sig.Note and rechecks sig.mask. +// If several sigsends and signal_recv execute concurrently, it can lead to +// unnecessary rechecks of sig.mask, but it cannot lead to missed signals +// nor deadlocks. + +package runtime + +import "unsafe" + +var sig struct { + note note + mask [(_NSIG + 31) / 32]uint32 + wanted [(_NSIG + 31) / 32]uint32 + recv [(_NSIG + 31) / 32]uint32 + state uint32 + inuse bool +} + +const ( + sigIdle = iota + sigReceiving + sigSending +) + +// Called from sighandler to send a signal back out of the signal handling thread. +// Reports whether the signal was sent. If not, the caller typically crashes the program. +func sigsend(s int32) bool { + bit := uint32(1) << uint(s&31) + if !sig.inuse || s < 0 || int(s) >= 32*len(sig.wanted) || sig.wanted[s/32]&bit == 0 { + return false + } + + // Add signal to outgoing queue. + for { + mask := sig.mask[s/32] + if mask&bit != 0 { + return true // signal already in queue + } + if cas(&sig.mask[s/32], mask, mask|bit) { + break + } + } + + // Notify receiver that queue has new bit. +Send: + for { + switch atomicload(&sig.state) { + default: + gothrow("sigsend: inconsistent state") + case sigIdle: + if cas(&sig.state, sigIdle, sigSending) { + break Send + } + case sigSending: + // notification already pending + break Send + case sigReceiving: + if cas(&sig.state, sigReceiving, sigIdle) { + notewakeup(&sig.note) + break Send + } + } + } + + return true +} + +// Called to receive the next queued signal. +// Must only be called from a single goroutine at a time. +func signal_recv() uint32 { + for { + // Serve any signals from local copy. + for i := uint32(0); i < _NSIG; i++ { + if sig.recv[i/32]&(1<<(i&31)) != 0 { + sig.recv[i/32] &^= 1 << (i & 31) + return i + } + } + + // Wait for updates to be available from signal sender. + Receive: + for { + switch atomicload(&sig.state) { + default: + gothrow("signal_recv: inconsistent state") + case sigIdle: + if cas(&sig.state, sigIdle, sigReceiving) { + notetsleepg(&sig.note, -1) + noteclear(&sig.note) + break Receive + } + case sigSending: + if cas(&sig.state, sigSending, sigIdle) { + break Receive + } + } + } + + // Incorporate updates from sender into local copy. + for i := range sig.mask { + sig.recv[i] = xchg(&sig.mask[i], 0) + } + } +} + +// Must only be called from a single goroutine at a time. +func signal_enable(s uint32) { + if !sig.inuse { + // The first call to signal_enable is for us + // to use for initialization. It does not pass + // signal information in m. + sig.inuse = true // enable reception of signals; cannot disable + noteclear(&sig.note) + return + } + + if int(s) >= len(sig.wanted)*32 { + return + } + sig.wanted[s/32] |= 1 << (s & 31) + sigenable_go(s) +} + +// Must only be called from a single goroutine at a time. +func signal_disable(s uint32) { + if int(s) >= len(sig.wanted)*32 { + return + } + sig.wanted[s/32] &^= 1 << (s & 31) + sigdisable_go(s) +} + +// This runs on a foreign stack, without an m or a g. 
No stack split. +//go:nosplit +func badsignal(sig uintptr) { + cgocallback(unsafe.Pointer(funcPC(sigsend)), noescape(unsafe.Pointer(&sig)), unsafe.Sizeof(sig)) +} + +func sigenable_m() +func sigdisable_m() + +func sigenable_go(s uint32) { + g := getg() + g.m.scalararg[0] = uintptr(s) + onM(sigenable_m) +} + +func sigdisable_go(s uint32) { + g := getg() + g.m.scalararg[0] = uintptr(s) + onM(sigdisable_m) +} diff --git a/src/runtime/slice.go b/src/runtime/slice.go new file mode 100644 index 000000000..171087d7f --- /dev/null +++ b/src/runtime/slice.go @@ -0,0 +1,139 @@ +// Copyright 2009 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package runtime + +import ( + "unsafe" +) + +type sliceStruct struct { + array unsafe.Pointer + len int + cap int +} + +// TODO: take uintptrs instead of int64s? +func makeslice(t *slicetype, len64 int64, cap64 int64) sliceStruct { + // NOTE: The len > MaxMem/elemsize check here is not strictly necessary, + // but it produces a 'len out of range' error instead of a 'cap out of range' error + // when someone does make([]T, bignumber). 'cap out of range' is true too, + // but since the cap is only being supplied implicitly, saying len is clearer. + // See issue 4085. + len := int(len64) + if len64 < 0 || int64(len) != len64 || t.elem.size > 0 && uintptr(len) > maxmem/uintptr(t.elem.size) { + panic(errorString("makeslice: len out of range")) + } + cap := int(cap64) + if cap < len || int64(cap) != cap64 || t.elem.size > 0 && uintptr(cap) > maxmem/uintptr(t.elem.size) { + panic(errorString("makeslice: cap out of range")) + } + p := newarray(t.elem, uintptr(cap)) + return sliceStruct{p, len, cap} +} + +// TODO: take uintptr instead of int64? 
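// growslice (below) is the allocation path behind the append built-in once
// the backing array is full. It chooses a new capacity of at least old.cap+n:
// the capacity is doubled while old.len is under 1024 and grown by roughly
// 25% per step beyond that, then the byte size is rounded up to a malloc
// size class by goroundupsize before the existing elements are copied over.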
+func growslice(t *slicetype, old sliceStruct, n int64) sliceStruct { + if n < 1 { + panic(errorString("growslice: invalid n")) + } + + cap64 := int64(old.cap) + n + cap := int(cap64) + + if int64(cap) != cap64 || cap < old.cap || t.elem.size > 0 && uintptr(cap) > maxmem/uintptr(t.elem.size) { + panic(errorString("growslice: cap out of range")) + } + + if raceenabled { + callerpc := getcallerpc(unsafe.Pointer(&t)) + racereadrangepc(old.array, uintptr(old.len*int(t.elem.size)), callerpc, funcPC(growslice)) + } + + et := t.elem + if et.size == 0 { + return sliceStruct{old.array, old.len, cap} + } + + newcap := old.cap + if newcap+newcap < cap { + newcap = cap + } else { + for { + if old.len < 1024 { + newcap += newcap + } else { + newcap += newcap / 4 + } + if newcap >= cap { + break + } + } + } + + if uintptr(newcap) >= maxmem/uintptr(et.size) { + panic(errorString("growslice: cap out of range")) + } + lenmem := uintptr(old.len) * uintptr(et.size) + capmem := goroundupsize(uintptr(newcap) * uintptr(et.size)) + newcap = int(capmem / uintptr(et.size)) + var p unsafe.Pointer + if et.kind&kindNoPointers != 0 { + p = rawmem(capmem) + memclr(add(p, lenmem), capmem-lenmem) + } else { + // Note: can't use rawmem (which avoids zeroing of memory), because then GC can scan unitialized memory + p = newarray(et, uintptr(newcap)) + } + memmove(p, old.array, lenmem) + + return sliceStruct{p, old.len, newcap} +} + +func slicecopy(to sliceStruct, fm sliceStruct, width uintptr) int { + if fm.len == 0 || to.len == 0 || width == 0 { + return 0 + } + + n := fm.len + if to.len < n { + n = to.len + } + + if raceenabled { + callerpc := getcallerpc(unsafe.Pointer(&to)) + pc := funcPC(slicecopy) + racewriterangepc(to.array, uintptr(n*int(width)), callerpc, pc) + racereadrangepc(fm.array, uintptr(n*int(width)), callerpc, pc) + } + + size := uintptr(n) * width + if size == 1 { // common case worth about 2x to do here + // TODO: is this still worth it with new memmove impl? + *(*byte)(to.array) = *(*byte)(fm.array) // known to be a byte pointer + } else { + memmove(to.array, fm.array, size) + } + return int(n) +} + +func slicestringcopy(to []byte, fm string) int { + if len(fm) == 0 || len(to) == 0 { + return 0 + } + + n := len(fm) + if len(to) < n { + n = len(to) + } + + if raceenabled { + callerpc := getcallerpc(unsafe.Pointer(&to)) + pc := funcPC(slicestringcopy) + racewriterangepc(unsafe.Pointer(&to[0]), uintptr(n), callerpc, pc) + } + + memmove(unsafe.Pointer(&to[0]), unsafe.Pointer((*stringStruct)(unsafe.Pointer(&fm)).str), uintptr(n)) + return n +} diff --git a/src/runtime/softfloat64.go b/src/runtime/softfloat64.go new file mode 100644 index 000000000..4fcf8f269 --- /dev/null +++ b/src/runtime/softfloat64.go @@ -0,0 +1,498 @@ +// Copyright 2010 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// Software IEEE754 64-bit floating point. +// Only referred to (and thus linked in) by arm port +// and by tests in this directory. 
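As a quick sanity check on the constants defined just below (an illustrative snippet, not taken from this file): with expbits64 = 11 and mantbits64 = 52, the exponent bias comes out to -1023, and inf64/nan64 are the familiar IEEE 754 bit patterns.

package main

import "fmt"

func main() {
	const (
		mantbits64 = 52
		expbits64  = 11
		bias64     = -1<<(expbits64-1) + 1
	)
	inf64 := (uint64(1)<<expbits64 - 1) << mantbits64
	fmt.Println(bias64)          // -1023
	fmt.Printf("%#x\n", inf64)   // 0x7ff0000000000000
	fmt.Printf("%#x\n", inf64+1) // 0x7ff0000000000001 (nan64)
}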
+ +package runtime + +const ( + mantbits64 uint = 52 + expbits64 uint = 11 + bias64 = -1<<(expbits64-1) + 1 + + nan64 uint64 = (1<<expbits64-1)<<mantbits64 + 1 + inf64 uint64 = (1<<expbits64 - 1) << mantbits64 + neg64 uint64 = 1 << (expbits64 + mantbits64) + + mantbits32 uint = 23 + expbits32 uint = 8 + bias32 = -1<<(expbits32-1) + 1 + + nan32 uint32 = (1<<expbits32-1)<<mantbits32 + 1 + inf32 uint32 = (1<<expbits32 - 1) << mantbits32 + neg32 uint32 = 1 << (expbits32 + mantbits32) +) + +func funpack64(f uint64) (sign, mant uint64, exp int, inf, nan bool) { + sign = f & (1 << (mantbits64 + expbits64)) + mant = f & (1<<mantbits64 - 1) + exp = int(f>>mantbits64) & (1<<expbits64 - 1) + + switch exp { + case 1<<expbits64 - 1: + if mant != 0 { + nan = true + return + } + inf = true + return + + case 0: + // denormalized + if mant != 0 { + exp += bias64 + 1 + for mant < 1<<mantbits64 { + mant <<= 1 + exp-- + } + } + + default: + // add implicit top bit + mant |= 1 << mantbits64 + exp += bias64 + } + return +} + +func funpack32(f uint32) (sign, mant uint32, exp int, inf, nan bool) { + sign = f & (1 << (mantbits32 + expbits32)) + mant = f & (1<<mantbits32 - 1) + exp = int(f>>mantbits32) & (1<<expbits32 - 1) + + switch exp { + case 1<<expbits32 - 1: + if mant != 0 { + nan = true + return + } + inf = true + return + + case 0: + // denormalized + if mant != 0 { + exp += bias32 + 1 + for mant < 1<<mantbits32 { + mant <<= 1 + exp-- + } + } + + default: + // add implicit top bit + mant |= 1 << mantbits32 + exp += bias32 + } + return +} + +func fpack64(sign, mant uint64, exp int, trunc uint64) uint64 { + mant0, exp0, trunc0 := mant, exp, trunc + if mant == 0 { + return sign + } + for mant < 1<<mantbits64 { + mant <<= 1 + exp-- + } + for mant >= 4<<mantbits64 { + trunc |= mant & 1 + mant >>= 1 + exp++ + } + if mant >= 2<<mantbits64 { + if mant&1 != 0 && (trunc != 0 || mant&2 != 0) { + mant++ + if mant >= 4<<mantbits64 { + mant >>= 1 + exp++ + } + } + mant >>= 1 + exp++ + } + if exp >= 1<<expbits64-1+bias64 { + return sign ^ inf64 + } + if exp < bias64+1 { + if exp < bias64-int(mantbits64) { + return sign | 0 + } + // repeat expecting denormal + mant, exp, trunc = mant0, exp0, trunc0 + for exp < bias64 { + trunc |= mant & 1 + mant >>= 1 + exp++ + } + if mant&1 != 0 && (trunc != 0 || mant&2 != 0) { + mant++ + } + mant >>= 1 + exp++ + if mant < 1<<mantbits64 { + return sign | mant + } + } + return sign | uint64(exp-bias64)<<mantbits64 | mant&(1<<mantbits64-1) +} + +func fpack32(sign, mant uint32, exp int, trunc uint32) uint32 { + mant0, exp0, trunc0 := mant, exp, trunc + if mant == 0 { + return sign + } + for mant < 1<<mantbits32 { + mant <<= 1 + exp-- + } + for mant >= 4<<mantbits32 { + trunc |= mant & 1 + mant >>= 1 + exp++ + } + if mant >= 2<<mantbits32 { + if mant&1 != 0 && (trunc != 0 || mant&2 != 0) { + mant++ + if mant >= 4<<mantbits32 { + mant >>= 1 + exp++ + } + } + mant >>= 1 + exp++ + } + if exp >= 1<<expbits32-1+bias32 { + return sign ^ inf32 + } + if exp < bias32+1 { + if exp < bias32-int(mantbits32) { + return sign | 0 + } + // repeat expecting denormal + mant, exp, trunc = mant0, exp0, trunc0 + for exp < bias32 { + trunc |= mant & 1 + mant >>= 1 + exp++ + } + if mant&1 != 0 && (trunc != 0 || mant&2 != 0) { + mant++ + } + mant >>= 1 + exp++ + if mant < 1<<mantbits32 { + return sign | mant + } + } + return sign | uint32(exp-bias32)<<mantbits32 | mant&(1<<mantbits32-1) +} + +func fadd64(f, g uint64) uint64 { + fs, fm, fe, fi, fn := funpack64(f) + gs, gm, ge, gi, gn := funpack64(g) + + // Special 
cases. + switch { + case fn || gn: // NaN + x or x + NaN = NaN + return nan64 + + case fi && gi && fs != gs: // +Inf + -Inf or -Inf + +Inf = NaN + return nan64 + + case fi: // ±Inf + g = ±Inf + return f + + case gi: // f + ±Inf = ±Inf + return g + + case fm == 0 && gm == 0 && fs != 0 && gs != 0: // -0 + -0 = -0 + return f + + case fm == 0: // 0 + g = g but 0 + -0 = +0 + if gm == 0 { + g ^= gs + } + return g + + case gm == 0: // f + 0 = f + return f + + } + + if fe < ge || fe == ge && fm < gm { + f, g, fs, fm, fe, gs, gm, ge = g, f, gs, gm, ge, fs, fm, fe + } + + shift := uint(fe - ge) + fm <<= 2 + gm <<= 2 + trunc := gm & (1<<shift - 1) + gm >>= shift + if fs == gs { + fm += gm + } else { + fm -= gm + if trunc != 0 { + fm-- + } + } + if fm == 0 { + fs = 0 + } + return fpack64(fs, fm, fe-2, trunc) +} + +func fsub64(f, g uint64) uint64 { + return fadd64(f, fneg64(g)) +} + +func fneg64(f uint64) uint64 { + return f ^ (1 << (mantbits64 + expbits64)) +} + +func fmul64(f, g uint64) uint64 { + fs, fm, fe, fi, fn := funpack64(f) + gs, gm, ge, gi, gn := funpack64(g) + + // Special cases. + switch { + case fn || gn: // NaN * g or f * NaN = NaN + return nan64 + + case fi && gi: // Inf * Inf = Inf (with sign adjusted) + return f ^ gs + + case fi && gm == 0, fm == 0 && gi: // 0 * Inf = Inf * 0 = NaN + return nan64 + + case fm == 0: // 0 * x = 0 (with sign adjusted) + return f ^ gs + + case gm == 0: // x * 0 = 0 (with sign adjusted) + return g ^ fs + } + + // 53-bit * 53-bit = 107- or 108-bit + lo, hi := mullu(fm, gm) + shift := mantbits64 - 1 + trunc := lo & (1<<shift - 1) + mant := hi<<(64-shift) | lo>>shift + return fpack64(fs^gs, mant, fe+ge-1, trunc) +} + +func fdiv64(f, g uint64) uint64 { + fs, fm, fe, fi, fn := funpack64(f) + gs, gm, ge, gi, gn := funpack64(g) + + // Special cases. + switch { + case fn || gn: // NaN / g = f / NaN = NaN + return nan64 + + case fi && gi: // ±Inf / ±Inf = NaN + return nan64 + + case !fi && !gi && fm == 0 && gm == 0: // 0 / 0 = NaN + return nan64 + + case fi, !gi && gm == 0: // Inf / g = f / 0 = Inf + return fs ^ gs ^ inf64 + + case gi, fm == 0: // f / Inf = 0 / g = Inf + return fs ^ gs ^ 0 + } + _, _, _, _ = fi, fn, gi, gn + + // 53-bit<<54 / 53-bit = 53- or 54-bit. + shift := mantbits64 + 2 + q, r := divlu(fm>>(64-shift), fm<<shift, gm) + return fpack64(fs^gs, q, fe-ge-2, r) +} + +func f64to32(f uint64) uint32 { + fs, fm, fe, fi, fn := funpack64(f) + if fn { + return nan32 + } + fs32 := uint32(fs >> 32) + if fi { + return fs32 ^ inf32 + } + const d = mantbits64 - mantbits32 - 1 + return fpack32(fs32, uint32(fm>>d), fe-1, uint32(fm&(1<<d-1))) +} + +func f32to64(f uint32) uint64 { + const d = mantbits64 - mantbits32 + fs, fm, fe, fi, fn := funpack32(f) + if fn { + return nan64 + } + fs64 := uint64(fs) << 32 + if fi { + return fs64 ^ inf64 + } + return fpack64(fs64, uint64(fm)<<d, fe, 0) +} + +func fcmp64(f, g uint64) (cmp int, isnan bool) { + fs, fm, _, fi, fn := funpack64(f) + gs, gm, _, gi, gn := funpack64(g) + + switch { + case fn, gn: // flag NaN + return 0, true + + case !fi && !gi && fm == 0 && gm == 0: // ±0 == ±0 + return 0, false + + case fs > gs: // f < 0, g > 0 + return -1, false + + case fs < gs: // f > 0, g < 0 + return +1, false + + // Same sign, not NaN. + // Can compare encodings directly now. + // Reverse for sign. 
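// (For two non-negative values the raw encodings order the same way as the
// numbers, so the bits can be compared directly; when both sign bits are set
// the encoding order is the reverse of the numeric order, hence the flipped
// comparisons below.)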
+ case fs == 0 && f < g, fs != 0 && f > g: + return -1, false + + case fs == 0 && f > g, fs != 0 && f < g: + return +1, false + } + + // f == g + return 0, false +} + +func f64toint(f uint64) (val int64, ok bool) { + fs, fm, fe, fi, fn := funpack64(f) + + switch { + case fi, fn: // NaN + return 0, false + + case fe < -1: // f < 0.5 + return 0, false + + case fe > 63: // f >= 2^63 + if fs != 0 && fm == 0 { // f == -2^63 + return -1 << 63, true + } + if fs != 0 { + return 0, false + } + return 0, false + } + + for fe > int(mantbits64) { + fe-- + fm <<= 1 + } + for fe < int(mantbits64) { + fe++ + fm >>= 1 + } + val = int64(fm) + if fs != 0 { + val = -val + } + return val, true +} + +func fintto64(val int64) (f uint64) { + fs := uint64(val) & (1 << 63) + mant := uint64(val) + if fs != 0 { + mant = -mant + } + return fpack64(fs, mant, int(mantbits64), 0) +} + +// 64x64 -> 128 multiply. +// adapted from hacker's delight. +func mullu(u, v uint64) (lo, hi uint64) { + const ( + s = 32 + mask = 1<<s - 1 + ) + u0 := u & mask + u1 := u >> s + v0 := v & mask + v1 := v >> s + w0 := u0 * v0 + t := u1*v0 + w0>>s + w1 := t & mask + w2 := t >> s + w1 += u0 * v1 + return u * v, u1*v1 + w2 + w1>>s +} + +// 128/64 -> 64 quotient, 64 remainder. +// adapted from hacker's delight +func divlu(u1, u0, v uint64) (q, r uint64) { + const b = 1 << 32 + + if u1 >= v { + return 1<<64 - 1, 1<<64 - 1 + } + + // s = nlz(v); v <<= s + s := uint(0) + for v&(1<<63) == 0 { + s++ + v <<= 1 + } + + vn1 := v >> 32 + vn0 := v & (1<<32 - 1) + un32 := u1<<s | u0>>(64-s) + un10 := u0 << s + un1 := un10 >> 32 + un0 := un10 & (1<<32 - 1) + q1 := un32 / vn1 + rhat := un32 - q1*vn1 + +again1: + if q1 >= b || q1*vn0 > b*rhat+un1 { + q1-- + rhat += vn1 + if rhat < b { + goto again1 + } + } + + un21 := un32*b + un1 - q1*v + q0 := un21 / vn1 + rhat = un21 - q0*vn1 + +again2: + if q0 >= b || q0*vn0 > b*rhat+un0 { + q0-- + rhat += vn1 + if rhat < b { + goto again2 + } + } + + return q1*b + q0, (un21*b + un0 - q0*v) >> s +} + +// callable from C + +func fadd64c(f, g uint64, ret *uint64) { *ret = fadd64(f, g) } +func fsub64c(f, g uint64, ret *uint64) { *ret = fsub64(f, g) } +func fmul64c(f, g uint64, ret *uint64) { *ret = fmul64(f, g) } +func fdiv64c(f, g uint64, ret *uint64) { *ret = fdiv64(f, g) } +func fneg64c(f uint64, ret *uint64) { *ret = fneg64(f) } +func f32to64c(f uint32, ret *uint64) { *ret = f32to64(f) } +func f64to32c(f uint64, ret *uint32) { *ret = f64to32(f) } +func fcmp64c(f, g uint64, ret *int, retnan *bool) { *ret, *retnan = fcmp64(f, g) } +func fintto64c(val int64, ret *uint64) { *ret = fintto64(val) } +func f64tointc(f uint64, ret *int64, retok *bool) { *ret, *retok = f64toint(f) } diff --git a/src/runtime/softfloat64_test.go b/src/runtime/softfloat64_test.go new file mode 100644 index 000000000..df63010fb --- /dev/null +++ b/src/runtime/softfloat64_test.go @@ -0,0 +1,198 @@ +// Copyright 2010 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package runtime_test + +import ( + "math" + "math/rand" + . 
"runtime" + "testing" +) + +// turn uint64 op into float64 op +func fop(f func(x, y uint64) uint64) func(x, y float64) float64 { + return func(x, y float64) float64 { + bx := math.Float64bits(x) + by := math.Float64bits(y) + return math.Float64frombits(f(bx, by)) + } +} + +func add(x, y float64) float64 { return x + y } +func sub(x, y float64) float64 { return x - y } +func mul(x, y float64) float64 { return x * y } +func div(x, y float64) float64 { return x / y } + +func TestFloat64(t *testing.T) { + base := []float64{ + 0, + math.Copysign(0, -1), + -1, + 1, + math.NaN(), + math.Inf(+1), + math.Inf(-1), + 0.1, + 1.5, + 1.9999999999999998, // all 1s mantissa + 1.3333333333333333, // 1.010101010101... + 1.1428571428571428, // 1.001001001001... + 1.112536929253601e-308, // first normal + 2, + 4, + 8, + 16, + 32, + 64, + 128, + 256, + 3, + 12, + 1234, + 123456, + -0.1, + -1.5, + -1.9999999999999998, + -1.3333333333333333, + -1.1428571428571428, + -2, + -3, + 1e-200, + 1e-300, + 1e-310, + 5e-324, + 1e-105, + 1e-305, + 1e+200, + 1e+306, + 1e+307, + 1e+308, + } + all := make([]float64, 200) + copy(all, base) + for i := len(base); i < len(all); i++ { + all[i] = rand.NormFloat64() + } + + test(t, "+", add, fop(Fadd64), all) + test(t, "-", sub, fop(Fsub64), all) + if GOARCH != "386" { // 386 is not precise! + test(t, "*", mul, fop(Fmul64), all) + test(t, "/", div, fop(Fdiv64), all) + } +} + +// 64 -hw-> 32 -hw-> 64 +func trunc32(f float64) float64 { + return float64(float32(f)) +} + +// 64 -sw->32 -hw-> 64 +func to32sw(f float64) float64 { + return float64(math.Float32frombits(F64to32(math.Float64bits(f)))) +} + +// 64 -hw->32 -sw-> 64 +func to64sw(f float64) float64 { + return math.Float64frombits(F32to64(math.Float32bits(float32(f)))) +} + +// float64 -hw-> int64 -hw-> float64 +func hwint64(f float64) float64 { + return float64(int64(f)) +} + +// float64 -hw-> int32 -hw-> float64 +func hwint32(f float64) float64 { + return float64(int32(f)) +} + +// float64 -sw-> int64 -hw-> float64 +func toint64sw(f float64) float64 { + i, ok := F64toint(math.Float64bits(f)) + if !ok { + // There's no right answer for out of range. + // Match the hardware to pass the test. + i = int64(f) + } + return float64(i) +} + +// float64 -hw-> int64 -sw-> float64 +func fromint64sw(f float64) float64 { + return math.Float64frombits(Fintto64(int64(f))) +} + +var nerr int + +func err(t *testing.T, format string, args ...interface{}) { + t.Errorf(format, args...) + + // cut errors off after a while. + // otherwise we spend all our time + // allocating memory to hold the + // formatted output. 
+ if nerr++; nerr >= 10 { + t.Fatal("too many errors") + } +} + +func test(t *testing.T, op string, hw, sw func(float64, float64) float64, all []float64) { + for _, f := range all { + for _, g := range all { + h := hw(f, g) + s := sw(f, g) + if !same(h, s) { + err(t, "%g %s %g = sw %g, hw %g\n", f, op, g, s, h) + } + testu(t, "to32", trunc32, to32sw, h) + testu(t, "to64", trunc32, to64sw, h) + testu(t, "toint64", hwint64, toint64sw, h) + testu(t, "fromint64", hwint64, fromint64sw, h) + testcmp(t, f, h) + testcmp(t, h, f) + testcmp(t, g, h) + testcmp(t, h, g) + } + } +} + +func testu(t *testing.T, op string, hw, sw func(float64) float64, v float64) { + h := hw(v) + s := sw(v) + if !same(h, s) { + err(t, "%s %g = sw %g, hw %g\n", op, v, s, h) + } +} + +func hwcmp(f, g float64) (cmp int, isnan bool) { + switch { + case f < g: + return -1, false + case f > g: + return +1, false + case f == g: + return 0, false + } + return 0, true // must be NaN +} + +func testcmp(t *testing.T, f, g float64) { + hcmp, hisnan := hwcmp(f, g) + scmp, sisnan := Fcmp64(math.Float64bits(f), math.Float64bits(g)) + if hcmp != scmp || hisnan != sisnan { + err(t, "cmp(%g, %g) = sw %v, %v, hw %v, %v\n", f, g, scmp, sisnan, hcmp, hisnan) + } +} + +func same(f, g float64) bool { + if math.IsNaN(f) && math.IsNaN(g) { + return true + } + if math.Copysign(1, f) != math.Copysign(1, g) { + return false + } + return f == g +} diff --git a/src/runtime/softfloat_arm.c b/src/runtime/softfloat_arm.c new file mode 100644 index 000000000..3f3f33a19 --- /dev/null +++ b/src/runtime/softfloat_arm.c @@ -0,0 +1,687 @@ +// Copyright 2009 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// Software floating point interpretaton of ARM 7500 FP instructions. +// The interpretation is not bit compatible with the 7500. +// It uses true little-endian doubles, while the 7500 used mixed-endian. 
+ +#include "runtime.h" +#include "textflag.h" + +#define CPSR 14 +#define FLAGS_N (1U << 31) +#define FLAGS_Z (1U << 30) +#define FLAGS_C (1U << 29) +#define FLAGS_V (1U << 28) + +void runtime·abort(void); +void runtime·sqrtC(uint64, uint64*); + +static uint32 trace = 0; + +static void +fabort(void) +{ + if (1) { + runtime·printf("Unsupported floating point instruction\n"); + runtime·abort(); + } +} + +static void +putf(uint32 reg, uint32 val) +{ + g->m->freglo[reg] = val; +} + +static void +putd(uint32 reg, uint64 val) +{ + g->m->freglo[reg] = (uint32)val; + g->m->freghi[reg] = (uint32)(val>>32); +} + +static uint64 +getd(uint32 reg) +{ + return (uint64)g->m->freglo[reg] | ((uint64)g->m->freghi[reg]<<32); +} + +static void +fprint(void) +{ + uint32 i; + for (i = 0; i < 16; i++) { + runtime·printf("\tf%d:\t%X %X\n", i, g->m->freghi[i], g->m->freglo[i]); + } +} + +static uint32 +d2f(uint64 d) +{ + uint32 x; + + runtime·f64to32c(d, &x); + return x; +} + +static uint64 +f2d(uint32 f) +{ + uint64 x; + + runtime·f32to64c(f, &x); + return x; +} + +static uint32 +fstatus(bool nan, int32 cmp) +{ + if(nan) + return FLAGS_C | FLAGS_V; + if(cmp == 0) + return FLAGS_Z | FLAGS_C; + if(cmp < 0) + return FLAGS_N; + return FLAGS_C; +} + +// conditions array record the required CPSR cond field for the +// first 5 pairs of conditional execution opcodes +// higher 4 bits are must set, lower 4 bits are must clear +#pragma dataflag NOPTR +static const uint8 conditions[10/2] = { + [0/2] = (FLAGS_Z >> 24) | 0, // 0: EQ (Z set), 1: NE (Z clear) + [2/2] = (FLAGS_C >> 24) | 0, // 2: CS/HS (C set), 3: CC/LO (C clear) + [4/2] = (FLAGS_N >> 24) | 0, // 4: MI (N set), 5: PL (N clear) + [6/2] = (FLAGS_V >> 24) | 0, // 6: VS (V set), 7: VC (V clear) + [8/2] = (FLAGS_C >> 24) | + (FLAGS_Z >> 28), // 8: HI (C set and Z clear), 9: LS (C clear and Z set) +}; + +#define FAULT (0x80000000U) // impossible PC offset + +// returns number of words that the fp instruction +// is occupying, 0 if next instruction isn't float. +static uint32 +stepflt(uint32 *pc, uint32 *regs) +{ + uint32 i, opc, regd, regm, regn, cpsr; + int32 delta; + uint32 *addr; + uint64 uval; + int64 sval; + bool nan, ok; + int32 cmp; + M *m; + + // m is locked in vlop_arm.s, so g->m cannot change during this function call, + // so caching it in a local variable is safe. 
+ m = g->m; + i = *pc; + + if(trace) + runtime·printf("stepflt %p %x (cpsr %x)\n", pc, i, regs[CPSR] >> 28); + + opc = i >> 28; + if(opc == 14) // common case first + goto execute; + cpsr = regs[CPSR] >> 28; + switch(opc) { + case 0: case 1: case 2: case 3: case 4: + case 5: case 6: case 7: case 8: case 9: + if(((cpsr & (conditions[opc/2] >> 4)) == (conditions[opc/2] >> 4)) && + ((cpsr & (conditions[opc/2] & 0xf)) == 0)) { + if(opc & 1) return 1; + } else { + if(!(opc & 1)) return 1; + } + break; + case 10: // GE (N == V) + case 11: // LT (N != V) + if((cpsr & (FLAGS_N >> 28)) == (cpsr & (FLAGS_V >> 28))) { + if(opc & 1) return 1; + } else { + if(!(opc & 1)) return 1; + } + break; + case 12: // GT (N == V and Z == 0) + case 13: // LE (N != V or Z == 1) + if((cpsr & (FLAGS_N >> 28)) == (cpsr & (FLAGS_V >> 28)) && + (cpsr & (FLAGS_Z >> 28)) == 0) { + if(opc & 1) return 1; + } else { + if(!(opc & 1)) return 1; + } + break; + case 14: // AL + break; + case 15: // shouldn't happen + return 0; + } + if(trace) + runtime·printf("conditional %x (cpsr %x) pass\n", opc, cpsr); + i = (0xeU << 28) | (i & 0xfffffff); + +execute: + // special cases + if((i&0xfffff000) == 0xe59fb000) { + // load r11 from pc-relative address. + // might be part of a floating point move + // (or might not, but no harm in simulating + // one instruction too many). + addr = (uint32*)((uint8*)pc + (i&0xfff) + 8); + regs[11] = addr[0]; + + if(trace) + runtime·printf("*** cpu R[%d] = *(%p) %x\n", + 11, addr, regs[11]); + return 1; + } + if(i == 0xe08bb00d) { + // add sp to r11. + // might be part of a large stack offset address + // (or might not, but again no harm done). + regs[11] += regs[13]; + + if(trace) + runtime·printf("*** cpu R[%d] += R[%d] %x\n", + 11, 13, regs[11]); + return 1; + } + if(i == 0xeef1fa10) { + regs[CPSR] = (regs[CPSR]&0x0fffffff) | m->fflag; + + if(trace) + runtime·printf("*** fpsr R[CPSR] = F[CPSR] %x\n", regs[CPSR]); + return 1; + } + if((i&0xff000000) == 0xea000000) { + // unconditional branch + // can happen in the middle of floating point + // if the linker decides it is time to lay down + // a sequence of instruction stream constants. + delta = i&0xffffff; + delta = (delta<<8) >> 8; // sign extend + + if(trace) + runtime·printf("*** cpu PC += %x\n", (delta+2)*4); + return delta+2; + } + + goto stage1; + +stage1: // load/store regn is cpureg, regm is 8bit offset + regd = i>>12 & 0xf; + regn = i>>16 & 0xf; + regm = (i & 0xff) << 2; // PLUS or MINUS ?? 
+ + switch(i & 0xfff00f00) { + default: + goto stage2; + + case 0xed900a00: // single load + addr = (uint32*)(regs[regn] + regm); + if((uintptr)addr < 4096) { + if(trace) + runtime·printf("*** load @%p => fault\n", addr); + return FAULT; + } + m->freglo[regd] = addr[0]; + + if(trace) + runtime·printf("*** load F[%d] = %x\n", + regd, m->freglo[regd]); + break; + + case 0xed900b00: // double load + addr = (uint32*)(regs[regn] + regm); + if((uintptr)addr < 4096) { + if(trace) + runtime·printf("*** double load @%p => fault\n", addr); + return FAULT; + } + m->freglo[regd] = addr[0]; + m->freghi[regd] = addr[1]; + + if(trace) + runtime·printf("*** load D[%d] = %x-%x\n", + regd, m->freghi[regd], m->freglo[regd]); + break; + + case 0xed800a00: // single store + addr = (uint32*)(regs[regn] + regm); + if((uintptr)addr < 4096) { + if(trace) + runtime·printf("*** store @%p => fault\n", addr); + return FAULT; + } + addr[0] = m->freglo[regd]; + + if(trace) + runtime·printf("*** *(%p) = %x\n", + addr, addr[0]); + break; + + case 0xed800b00: // double store + addr = (uint32*)(regs[regn] + regm); + if((uintptr)addr < 4096) { + if(trace) + runtime·printf("*** double store @%p => fault\n", addr); + return FAULT; + } + addr[0] = m->freglo[regd]; + addr[1] = m->freghi[regd]; + + if(trace) + runtime·printf("*** *(%p) = %x-%x\n", + addr, addr[1], addr[0]); + break; + } + return 1; + +stage2: // regd, regm, regn are 4bit variables + regm = i>>0 & 0xf; + switch(i & 0xfff00ff0) { + default: + goto stage3; + + case 0xf3000110: // veor + m->freglo[regd] = m->freglo[regm]^m->freglo[regn]; + m->freghi[regd] = m->freghi[regm]^m->freghi[regn]; + + if(trace) + runtime·printf("*** veor D[%d] = %x-%x\n", + regd, m->freghi[regd], m->freglo[regd]); + break; + + case 0xeeb00b00: // D[regd] = const(regn,regm) + regn = (regn<<4) | regm; + regm = 0x40000000UL; + if(regn & 0x80) + regm |= 0x80000000UL; + if(regn & 0x40) + regm ^= 0x7fc00000UL; + regm |= (regn & 0x3f) << 16; + m->freglo[regd] = 0; + m->freghi[regd] = regm; + + if(trace) + runtime·printf("*** immed D[%d] = %x-%x\n", + regd, m->freghi[regd], m->freglo[regd]); + break; + + case 0xeeb00a00: // F[regd] = const(regn,regm) + regn = (regn<<4) | regm; + regm = 0x40000000UL; + if(regn & 0x80) + regm |= 0x80000000UL; + if(regn & 0x40) + regm ^= 0x7e000000UL; + regm |= (regn & 0x3f) << 19; + m->freglo[regd] = regm; + + if(trace) + runtime·printf("*** immed D[%d] = %x\n", + regd, m->freglo[regd]); + break; + + case 0xee300b00: // D[regd] = D[regn]+D[regm] + runtime·fadd64c(getd(regn), getd(regm), &uval); + putd(regd, uval); + + if(trace) + runtime·printf("*** add D[%d] = D[%d]+D[%d] %x-%x\n", + regd, regn, regm, m->freghi[regd], m->freglo[regd]); + break; + + case 0xee300a00: // F[regd] = F[regn]+F[regm] + runtime·fadd64c(f2d(m->freglo[regn]), f2d(m->freglo[regm]), &uval); + m->freglo[regd] = d2f(uval); + + if(trace) + runtime·printf("*** add F[%d] = F[%d]+F[%d] %x\n", + regd, regn, regm, m->freglo[regd]); + break; + + case 0xee300b40: // D[regd] = D[regn]-D[regm] + runtime·fsub64c(getd(regn), getd(regm), &uval); + putd(regd, uval); + + if(trace) + runtime·printf("*** sub D[%d] = D[%d]-D[%d] %x-%x\n", + regd, regn, regm, m->freghi[regd], m->freglo[regd]); + break; + + case 0xee300a40: // F[regd] = F[regn]-F[regm] + runtime·fsub64c(f2d(m->freglo[regn]), f2d(m->freglo[regm]), &uval); + m->freglo[regd] = d2f(uval); + + if(trace) + runtime·printf("*** sub F[%d] = F[%d]-F[%d] %x\n", + regd, regn, regm, m->freglo[regd]); + break; + + case 0xee200b00: // D[regd] = D[regn]*D[regm] + 
runtime·fmul64c(getd(regn), getd(regm), &uval); + putd(regd, uval); + + if(trace) + runtime·printf("*** mul D[%d] = D[%d]*D[%d] %x-%x\n", + regd, regn, regm, m->freghi[regd], m->freglo[regd]); + break; + + case 0xee200a00: // F[regd] = F[regn]*F[regm] + runtime·fmul64c(f2d(m->freglo[regn]), f2d(m->freglo[regm]), &uval); + m->freglo[regd] = d2f(uval); + + if(trace) + runtime·printf("*** mul F[%d] = F[%d]*F[%d] %x\n", + regd, regn, regm, m->freglo[regd]); + break; + + case 0xee800b00: // D[regd] = D[regn]/D[regm] + runtime·fdiv64c(getd(regn), getd(regm), &uval); + putd(regd, uval); + + if(trace) + runtime·printf("*** div D[%d] = D[%d]/D[%d] %x-%x\n", + regd, regn, regm, m->freghi[regd], m->freglo[regd]); + break; + + case 0xee800a00: // F[regd] = F[regn]/F[regm] + runtime·fdiv64c(f2d(m->freglo[regn]), f2d(m->freglo[regm]), &uval); + m->freglo[regd] = d2f(uval); + + if(trace) + runtime·printf("*** div F[%d] = F[%d]/F[%d] %x\n", + regd, regn, regm, m->freglo[regd]); + break; + + case 0xee000b10: // S[regn] = R[regd] (MOVW) (regm ignored) + m->freglo[regn] = regs[regd]; + + if(trace) + runtime·printf("*** cpy S[%d] = R[%d] %x\n", + regn, regd, m->freglo[regn]); + break; + + case 0xee100b10: // R[regd] = S[regn] (MOVW) (regm ignored) + regs[regd] = m->freglo[regn]; + + if(trace) + runtime·printf("*** cpy R[%d] = S[%d] %x\n", + regd, regn, regs[regd]); + break; + } + return 1; + +stage3: // regd, regm are 4bit variables + switch(i & 0xffff0ff0) { + default: + goto done; + + case 0xeeb00a40: // F[regd] = F[regm] (MOVF) + m->freglo[regd] = m->freglo[regm]; + + if(trace) + runtime·printf("*** F[%d] = F[%d] %x\n", + regd, regm, m->freglo[regd]); + break; + + case 0xeeb00b40: // D[regd] = D[regm] (MOVD) + m->freglo[regd] = m->freglo[regm]; + m->freghi[regd] = m->freghi[regm]; + + if(trace) + runtime·printf("*** D[%d] = D[%d] %x-%x\n", + regd, regm, m->freghi[regd], m->freglo[regd]); + break; + + case 0xeeb10bc0: // D[regd] = sqrt D[regm] + runtime·sqrtC(getd(regm), &uval); + putd(regd, uval); + + if(trace) + runtime·printf("*** D[%d] = sqrt D[%d] %x-%x\n", + regd, regm, m->freghi[regd], m->freglo[regd]); + break; + + case 0xeeb00bc0: // D[regd] = abs D[regm] + m->freglo[regd] = m->freglo[regm]; + m->freghi[regd] = m->freghi[regm] & ((1<<31)-1); + + if(trace) + runtime·printf("*** D[%d] = abs D[%d] %x-%x\n", + regd, regm, m->freghi[regd], m->freglo[regd]); + break; + + case 0xeeb00ac0: // F[regd] = abs F[regm] + m->freglo[regd] = m->freglo[regm] & ((1<<31)-1); + + if(trace) + runtime·printf("*** F[%d] = abs F[%d] %x\n", + regd, regm, m->freglo[regd]); + break; + + case 0xeeb40bc0: // D[regd] :: D[regm] (CMPD) + runtime·fcmp64c(getd(regd), getd(regm), &cmp, &nan); + m->fflag = fstatus(nan, cmp); + + if(trace) + runtime·printf("*** cmp D[%d]::D[%d] %x\n", + regd, regm, m->fflag); + break; + + case 0xeeb40ac0: // F[regd] :: F[regm] (CMPF) + runtime·fcmp64c(f2d(m->freglo[regd]), f2d(m->freglo[regm]), &cmp, &nan); + m->fflag = fstatus(nan, cmp); + + if(trace) + runtime·printf("*** cmp F[%d]::F[%d] %x\n", + regd, regm, m->fflag); + break; + + case 0xeeb70ac0: // D[regd] = F[regm] (MOVFD) + putd(regd, f2d(m->freglo[regm])); + + if(trace) + runtime·printf("*** f2d D[%d]=F[%d] %x-%x\n", + regd, regm, m->freghi[regd], m->freglo[regd]); + break; + + case 0xeeb70bc0: // F[regd] = D[regm] (MOVDF) + m->freglo[regd] = d2f(getd(regm)); + + if(trace) + runtime·printf("*** d2f F[%d]=D[%d] %x-%x\n", + regd, regm, m->freghi[regd], m->freglo[regd]); + break; + + case 0xeebd0ac0: // S[regd] = F[regm] (MOVFW) + 
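+ // The conversion cases below go through runtime·f64tointc; values that
+ // fail the conversion or do not fit the destination width are forced
+ // to 0 instead of trapping.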
runtime·f64tointc(f2d(m->freglo[regm]), &sval, &ok); + if(!ok || (int32)sval != sval) + sval = 0; + m->freglo[regd] = sval; + + if(trace) + runtime·printf("*** fix S[%d]=F[%d] %x\n", + regd, regm, m->freglo[regd]); + break; + + case 0xeebc0ac0: // S[regd] = F[regm] (MOVFW.U) + runtime·f64tointc(f2d(m->freglo[regm]), &sval, &ok); + if(!ok || (uint32)sval != sval) + sval = 0; + m->freglo[regd] = sval; + + if(trace) + runtime·printf("*** fix unsigned S[%d]=F[%d] %x\n", + regd, regm, m->freglo[regd]); + break; + + case 0xeebd0bc0: // S[regd] = D[regm] (MOVDW) + runtime·f64tointc(getd(regm), &sval, &ok); + if(!ok || (int32)sval != sval) + sval = 0; + m->freglo[regd] = sval; + + if(trace) + runtime·printf("*** fix S[%d]=D[%d] %x\n", + regd, regm, m->freglo[regd]); + break; + + case 0xeebc0bc0: // S[regd] = D[regm] (MOVDW.U) + runtime·f64tointc(getd(regm), &sval, &ok); + if(!ok || (uint32)sval != sval) + sval = 0; + m->freglo[regd] = sval; + + if(trace) + runtime·printf("*** fix unsigned S[%d]=D[%d] %x\n", + regd, regm, m->freglo[regd]); + break; + + case 0xeeb80ac0: // D[regd] = S[regm] (MOVWF) + cmp = m->freglo[regm]; + if(cmp < 0) { + runtime·fintto64c(-cmp, &uval); + putf(regd, d2f(uval)); + m->freglo[regd] ^= 0x80000000; + } else { + runtime·fintto64c(cmp, &uval); + putf(regd, d2f(uval)); + } + + if(trace) + runtime·printf("*** float D[%d]=S[%d] %x-%x\n", + regd, regm, m->freghi[regd], m->freglo[regd]); + break; + + case 0xeeb80a40: // D[regd] = S[regm] (MOVWF.U) + runtime·fintto64c(m->freglo[regm], &uval); + putf(regd, d2f(uval)); + + if(trace) + runtime·printf("*** float unsigned D[%d]=S[%d] %x-%x\n", + regd, regm, m->freghi[regd], m->freglo[regd]); + break; + + case 0xeeb80bc0: // D[regd] = S[regm] (MOVWD) + cmp = m->freglo[regm]; + if(cmp < 0) { + runtime·fintto64c(-cmp, &uval); + putd(regd, uval); + m->freghi[regd] ^= 0x80000000; + } else { + runtime·fintto64c(cmp, &uval); + putd(regd, uval); + } + + if(trace) + runtime·printf("*** float D[%d]=S[%d] %x-%x\n", + regd, regm, m->freghi[regd], m->freglo[regd]); + break; + + case 0xeeb80b40: // D[regd] = S[regm] (MOVWD.U) + runtime·fintto64c(m->freglo[regm], &uval); + putd(regd, uval); + + if(trace) + runtime·printf("*** float unsigned D[%d]=S[%d] %x-%x\n", + regd, regm, m->freghi[regd], m->freglo[regd]); + break; + } + return 1; + +done: + if((i&0xff000000) == 0xee000000 || + (i&0xff000000) == 0xed000000) { + runtime·printf("stepflt %p %x\n", pc, i); + fabort(); + } + return 0; +} + +typedef struct Sfregs Sfregs; + +// NOTE: These are all recorded as pointers because they are possibly live registers, +// and we don't know what they contain. Recording them as pointers should be +// safer than not. 
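+// Sfregs mirrors the integer register state (r0-r12, sp as r13, and the
+// saved CPSR) handed to runtime·_sfloat2 by the softfloat trampoline;
+// sfloat2 below walks it as a flat uint32 array via (uint32*)&regs->r0.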
+struct Sfregs +{ + uint32 *r0; + uint32 *r1; + uint32 *r2; + uint32 *r3; + uint32 *r4; + uint32 *r5; + uint32 *r6; + uint32 *r7; + uint32 *r8; + uint32 *r9; + uint32 *r10; + uint32 *r11; + uint32 *r12; + uint32 *r13; + uint32 cspr; +}; + +static void sfloat2(void); +void _sfloatpanic(void); + +#pragma textflag NOSPLIT +uint32* +runtime·_sfloat2(uint32 *pc, Sfregs regs) +{ + void (*fn)(void); + + g->m->ptrarg[0] = pc; + g->m->ptrarg[1] = ®s; + fn = sfloat2; + runtime·onM(&fn); + pc = g->m->ptrarg[0]; + g->m->ptrarg[0] = nil; + return pc; +} + +static void +sfloat2(void) +{ + uint32 *pc; + G *curg; + Sfregs *regs; + int32 skip; + bool first; + + pc = g->m->ptrarg[0]; + regs = g->m->ptrarg[1]; + g->m->ptrarg[0] = nil; + g->m->ptrarg[1] = nil; + + first = true; + while(skip = stepflt(pc, (uint32*)®s->r0)) { + first = false; + if(skip == FAULT) { + // Encountered bad address in store/load. + // Record signal information and return to assembly + // trampoline that fakes the call. + enum { SIGSEGV = 11 }; + curg = g->m->curg; + curg->sig = SIGSEGV; + curg->sigcode0 = 0; + curg->sigcode1 = 0; + curg->sigpc = (uint32)pc; + pc = (uint32*)_sfloatpanic; + break; + } + pc += skip; + } + if(first) { + runtime·printf("sfloat2 %p %x\n", pc, *pc); + fabort(); // not ok to fail first instruction + } + + g->m->ptrarg[0] = pc; +} diff --git a/src/runtime/sqrt.go b/src/runtime/sqrt.go new file mode 100644 index 000000000..34a8c3806 --- /dev/null +++ b/src/runtime/sqrt.go @@ -0,0 +1,150 @@ +// Copyright 2009 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// Copy of math/sqrt.go, here for use by ARM softfloat. + +package runtime + +import "unsafe" + +// The original C code and the long comment below are +// from FreeBSD's /usr/src/lib/msun/src/e_sqrt.c and +// came with this notice. The go code is a simplified +// version of the original C. +// +// ==================================================== +// Copyright (C) 1993 by Sun Microsystems, Inc. All rights reserved. +// +// Developed at SunPro, a Sun Microsystems, Inc. business. +// Permission to use, copy, modify, and distribute this +// software is freely granted, provided that this notice +// is preserved. +// ==================================================== +// +// __ieee754_sqrt(x) +// Return correctly rounded sqrt. +// ----------------------------------------- +// | Use the hardware sqrt if you have one | +// ----------------------------------------- +// Method: +// Bit by bit method using integer arithmetic. (Slow, but portable) +// 1. Normalization +// Scale x to y in [1,4) with even powers of 2: +// find an integer k such that 1 <= (y=x*2**(2k)) < 4, then +// sqrt(x) = 2**k * sqrt(y) +// 2. Bit by bit computation +// Let q = sqrt(y) truncated to i bit after binary point (q = 1), +// i 0 +// i+1 2 +// s = 2*q , and y = 2 * ( y - q ). (1) +// i i i i +// +// To compute q from q , one checks whether +// i+1 i +// +// -(i+1) 2 +// (q + 2 ) <= y. (2) +// i +// -(i+1) +// If (2) is false, then q = q ; otherwise q = q + 2 . 
+// i+1 i i+1 i +// +// With some algebraic manipulation, it is not difficult to see +// that (2) is equivalent to +// -(i+1) +// s + 2 <= y (3) +// i i +// +// The advantage of (3) is that s and y can be computed by +// i i +// the following recurrence formula: +// if (3) is false +// +// s = s , y = y ; (4) +// i+1 i i+1 i +// +// otherwise, +// -i -(i+1) +// s = s + 2 , y = y - s - 2 (5) +// i+1 i i+1 i i +// +// One may easily use induction to prove (4) and (5). +// Note. Since the left hand side of (3) contain only i+2 bits, +// it does not necessary to do a full (53-bit) comparison +// in (3). +// 3. Final rounding +// After generating the 53 bits result, we compute one more bit. +// Together with the remainder, we can decide whether the +// result is exact, bigger than 1/2ulp, or less than 1/2ulp +// (it will never equal to 1/2ulp). +// The rounding mode can be detected by checking whether +// huge + tiny is equal to huge, and whether huge - tiny is +// equal to huge for some floating point number "huge" and "tiny". +// +// +// Notes: Rounding mode detection omitted. + +const ( + uvnan = 0x7FF8000000000001 + uvinf = 0x7FF0000000000000 + uvneginf = 0xFFF0000000000000 + mask = 0x7FF + shift = 64 - 11 - 1 + bias = 1023 + maxFloat64 = 1.797693134862315708145274237317043567981e+308 // 2**1023 * (2**53 - 1) / 2**52 +) + +func float64bits(f float64) uint64 { return *(*uint64)(unsafe.Pointer(&f)) } +func float64frombits(b uint64) float64 { return *(*float64)(unsafe.Pointer(&b)) } + +func sqrt(x float64) float64 { + // special cases + switch { + case x == 0 || x != x || x > maxFloat64: + return x + case x < 0: + return nan + } + ix := float64bits(x) + // normalize x + exp := int((ix >> shift) & mask) + if exp == 0 { // subnormal x + for ix&1<<shift == 0 { + ix <<= 1 + exp-- + } + exp++ + } + exp -= bias // unbias exponent + ix &^= mask << shift + ix |= 1 << shift + if exp&1 == 1 { // odd exp, double x to make it even + ix <<= 1 + } + exp >>= 1 // exp = exp/2, exponent of square root + // generate sqrt(x) bit by bit + ix <<= 1 + var q, s uint64 // q = sqrt(x) + r := uint64(1 << (shift + 1)) // r = moving bit from MSB to LSB + for r != 0 { + t := s + r + if t <= ix { + s = t + r + ix -= t + q += r + } + ix <<= 1 + r >>= 1 + } + // final rounding + if ix != 0 { // remainder, result not exact + q += q & 1 // round according to extra bit + } + ix = q>>1 + uint64(exp-1+bias)<<shift // significand + biased exponent + return float64frombits(ix) +} + +func sqrtC(f float64, r *float64) { + *r = sqrt(f) +} diff --git a/src/runtime/stack.c b/src/runtime/stack.c new file mode 100644 index 000000000..cb9557243 --- /dev/null +++ b/src/runtime/stack.c @@ -0,0 +1,892 @@ +// Copyright 2013 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. 
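+
+// This file implements contiguous goroutine stacks: fixed-order stack
+// allocation from per-M caches backed by a global pool (stackalloc,
+// stackfree), growth by copying to a larger stack (copystack, newstack),
+// and shrinking of underused stacks at garbage collection time
+// (shrinkstack).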
+ +#include "runtime.h" +#include "arch_GOARCH.h" +#include "malloc.h" +#include "stack.h" +#include "funcdata.h" +#include "typekind.h" +#include "type.h" +#include "race.h" +#include "mgc0.h" +#include "textflag.h" + +enum +{ + // StackDebug == 0: no logging + // == 1: logging of per-stack operations + // == 2: logging of per-frame operations + // == 3: logging of per-word updates + // == 4: logging of per-word reads + StackDebug = 0, + StackFromSystem = 0, // allocate stacks from system memory instead of the heap + StackFaultOnFree = 0, // old stacks are mapped noaccess to detect use after free + StackPoisonCopy = 0, // fill stack that should not be accessed with garbage, to detect bad dereferences during copy + + StackCache = 1, +}; + +// Global pool of spans that have free stacks. +// Stacks are assigned an order according to size. +// order = log_2(size/FixedStack) +// There is a free list for each order. +MSpan runtime·stackpool[NumStackOrders]; +Mutex runtime·stackpoolmu; +// TODO: one lock per order? + +static Stack stackfreequeue; + +void +runtime·stackinit(void) +{ + int32 i; + + if((StackCacheSize & PageMask) != 0) + runtime·throw("cache size must be a multiple of page size"); + + for(i = 0; i < NumStackOrders; i++) + runtime·MSpanList_Init(&runtime·stackpool[i]); +} + +// Allocates a stack from the free pool. Must be called with +// stackpoolmu held. +static MLink* +poolalloc(uint8 order) +{ + MSpan *list; + MSpan *s; + MLink *x; + uintptr i; + + list = &runtime·stackpool[order]; + s = list->next; + if(s == list) { + // no free stacks. Allocate another span worth. + s = runtime·MHeap_AllocStack(&runtime·mheap, StackCacheSize >> PageShift); + if(s == nil) + runtime·throw("out of memory"); + if(s->ref != 0) + runtime·throw("bad ref"); + if(s->freelist != nil) + runtime·throw("bad freelist"); + for(i = 0; i < StackCacheSize; i += FixedStack << order) { + x = (MLink*)((s->start << PageShift) + i); + x->next = s->freelist; + s->freelist = x; + } + runtime·MSpanList_Insert(list, s); + } + x = s->freelist; + if(x == nil) + runtime·throw("span has no free stacks"); + s->freelist = x->next; + s->ref++; + if(s->freelist == nil) { + // all stacks in s are allocated. + runtime·MSpanList_Remove(s); + } + return x; +} + +// Adds stack x to the free pool. Must be called with stackpoolmu held. +static void +poolfree(MLink *x, uint8 order) +{ + MSpan *s; + + s = runtime·MHeap_Lookup(&runtime·mheap, x); + if(s->state != MSpanStack) + runtime·throw("freeing stack not in a stack span"); + if(s->freelist == nil) { + // s will now have a free stack + runtime·MSpanList_Insert(&runtime·stackpool[order], s); + } + x->next = s->freelist; + s->freelist = x; + s->ref--; + if(s->ref == 0) { + // span is completely free - return to heap + runtime·MSpanList_Remove(s); + s->freelist = nil; + runtime·MHeap_FreeStack(&runtime·mheap, s); + } +} + +// stackcacherefill/stackcacherelease implement a global pool of stack segments. +// The pool is required to prevent unlimited growth of per-thread caches. +static void +stackcacherefill(MCache *c, uint8 order) +{ + MLink *x, *list; + uintptr size; + + if(StackDebug >= 1) + runtime·printf("stackcacherefill order=%d\n", order); + + // Grab some stacks from the global cache. + // Grab half of the allowed capacity (to prevent thrashing). 
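+ // Worked example (taking StackSystem == 0, so FixedStack == 2048, and
+ // assuming StackCacheSize == 32K purely for illustration): a refill at
+ // order 2 pulls 8K stacks from the pool one at a time and stops once
+ // roughly 16K is cached locally.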
+ list = nil; + size = 0; + runtime·lock(&runtime·stackpoolmu); + while(size < StackCacheSize/2) { + x = poolalloc(order); + x->next = list; + list = x; + size += FixedStack << order; + } + runtime·unlock(&runtime·stackpoolmu); + + c->stackcache[order].list = list; + c->stackcache[order].size = size; +} + +static void +stackcacherelease(MCache *c, uint8 order) +{ + MLink *x, *y; + uintptr size; + + if(StackDebug >= 1) + runtime·printf("stackcacherelease order=%d\n", order); + x = c->stackcache[order].list; + size = c->stackcache[order].size; + runtime·lock(&runtime·stackpoolmu); + while(size > StackCacheSize/2) { + y = x->next; + poolfree(x, order); + x = y; + size -= FixedStack << order; + } + runtime·unlock(&runtime·stackpoolmu); + c->stackcache[order].list = x; + c->stackcache[order].size = size; +} + +void +runtime·stackcache_clear(MCache *c) +{ + uint8 order; + MLink *x, *y; + + if(StackDebug >= 1) + runtime·printf("stackcache clear\n"); + runtime·lock(&runtime·stackpoolmu); + for(order = 0; order < NumStackOrders; order++) { + x = c->stackcache[order].list; + while(x != nil) { + y = x->next; + poolfree(x, order); + x = y; + } + c->stackcache[order].list = nil; + c->stackcache[order].size = 0; + } + runtime·unlock(&runtime·stackpoolmu); +} + +Stack +runtime·stackalloc(uint32 n) +{ + uint8 order; + uint32 n2; + void *v; + MLink *x; + MSpan *s; + MCache *c; + + // Stackalloc must be called on scheduler stack, so that we + // never try to grow the stack during the code that stackalloc runs. + // Doing so would cause a deadlock (issue 1547). + if(g != g->m->g0) + runtime·throw("stackalloc not on scheduler stack"); + if((n & (n-1)) != 0) + runtime·throw("stack size not a power of 2"); + if(StackDebug >= 1) + runtime·printf("stackalloc %d\n", n); + + if(runtime·debug.efence || StackFromSystem) { + v = runtime·sysAlloc(ROUND(n, PageSize), &mstats.stacks_sys); + if(v == nil) + runtime·throw("out of memory (stackalloc)"); + return (Stack){(uintptr)v, (uintptr)v+n}; + } + + // Small stacks are allocated with a fixed-size free-list allocator. + // If we need a stack of a bigger size, we fall back on allocating + // a dedicated span. + if(StackCache && n < FixedStack << NumStackOrders && n < StackCacheSize) { + order = 0; + n2 = n; + while(n2 > FixedStack) { + order++; + n2 >>= 1; + } + c = g->m->mcache; + if(c == nil || g->m->gcing || g->m->helpgc) { + // c == nil can happen in the guts of exitsyscall or + // procresize. Just get a stack from the global pool. + // Also don't touch stackcache during gc + // as it's flushed concurrently. 
+ runtime·lock(&runtime·stackpoolmu); + x = poolalloc(order); + runtime·unlock(&runtime·stackpoolmu); + } else { + x = c->stackcache[order].list; + if(x == nil) { + stackcacherefill(c, order); + x = c->stackcache[order].list; + } + c->stackcache[order].list = x->next; + c->stackcache[order].size -= n; + } + v = (byte*)x; + } else { + s = runtime·MHeap_AllocStack(&runtime·mheap, ROUND(n, PageSize) >> PageShift); + if(s == nil) + runtime·throw("out of memory"); + v = (byte*)(s->start<<PageShift); + } + + if(raceenabled) + runtime·racemalloc(v, n); + if(StackDebug >= 1) + runtime·printf(" allocated %p\n", v); + return (Stack){(uintptr)v, (uintptr)v+n}; +} + +void +runtime·stackfree(Stack stk) +{ + uint8 order; + uintptr n, n2; + MSpan *s; + MLink *x; + MCache *c; + void *v; + + n = stk.hi - stk.lo; + v = (void*)stk.lo; + if(n & (n-1)) + runtime·throw("stack not a power of 2"); + if(StackDebug >= 1) { + runtime·printf("stackfree %p %d\n", v, (int32)n); + runtime·memclr(v, n); // for testing, clobber stack data + } + if(runtime·debug.efence || StackFromSystem) { + if(runtime·debug.efence || StackFaultOnFree) + runtime·SysFault(v, n); + else + runtime·SysFree(v, n, &mstats.stacks_sys); + return; + } + if(StackCache && n < FixedStack << NumStackOrders && n < StackCacheSize) { + order = 0; + n2 = n; + while(n2 > FixedStack) { + order++; + n2 >>= 1; + } + x = (MLink*)v; + c = g->m->mcache; + if(c == nil || g->m->gcing || g->m->helpgc) { + runtime·lock(&runtime·stackpoolmu); + poolfree(x, order); + runtime·unlock(&runtime·stackpoolmu); + } else { + if(c->stackcache[order].size >= StackCacheSize) + stackcacherelease(c, order); + x->next = c->stackcache[order].list; + c->stackcache[order].list = x; + c->stackcache[order].size += n; + } + } else { + s = runtime·MHeap_Lookup(&runtime·mheap, v); + if(s->state != MSpanStack) { + runtime·printf("%p %p\n", s->start<<PageShift, v); + runtime·throw("bad span state"); + } + runtime·MHeap_FreeStack(&runtime·mheap, s); + } +} + +uintptr runtime·maxstacksize = 1<<20; // enough until runtime.main sets it for real + +static uint8* +mapnames[] = { + (uint8*)"---", + (uint8*)"scalar", + (uint8*)"ptr", + (uint8*)"multi", +}; + +// Stack frame layout +// +// (x86) +// +------------------+ +// | args from caller | +// +------------------+ <- frame->argp +// | return address | +// +------------------+ <- frame->varp +// | locals | +// +------------------+ +// | args to callee | +// +------------------+ <- frame->sp +// +// (arm) +// +------------------+ +// | args from caller | +// +------------------+ <- frame->argp +// | caller's retaddr | +// +------------------+ <- frame->varp +// | locals | +// +------------------+ +// | args to callee | +// +------------------+ +// | return address | +// +------------------+ <- frame->sp + +void runtime·main(void); +void runtime·switchtoM(void(*)(void)); + +typedef struct AdjustInfo AdjustInfo; +struct AdjustInfo { + Stack old; + uintptr delta; // ptr distance from old to new stack (newbase - oldbase) +}; + +// Adjustpointer checks whether *vpp is in the old stack described by adjinfo. +// If so, it rewrites *vpp to point into the new stack. 
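+// The adjustment is a pure relocation: adjinfo->delta is new.hi - old.hi
+// (set in copystack below), so an adjusted pointer keeps the same distance
+// from the top of the stack as before the copy.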
+static void +adjustpointer(AdjustInfo *adjinfo, void *vpp) +{ + byte **pp, *p; + + pp = vpp; + p = *pp; + if(StackDebug >= 4) + runtime·printf(" %p:%p\n", pp, p); + if(adjinfo->old.lo <= (uintptr)p && (uintptr)p < adjinfo->old.hi) { + *pp = p + adjinfo->delta; + if(StackDebug >= 3) + runtime·printf(" adjust ptr %p: %p -> %p\n", pp, p, *pp); + } +} + +// bv describes the memory starting at address scanp. +// Adjust any pointers contained therein. +static void +adjustpointers(byte **scanp, BitVector *bv, AdjustInfo *adjinfo, Func *f) +{ + uintptr delta; + int32 num, i; + byte *p, *minp, *maxp; + Type *t; + Itab *tab; + + minp = (byte*)adjinfo->old.lo; + maxp = (byte*)adjinfo->old.hi; + delta = adjinfo->delta; + num = bv->n / BitsPerPointer; + for(i = 0; i < num; i++) { + if(StackDebug >= 4) + runtime·printf(" %p:%s:%p\n", &scanp[i], mapnames[bv->bytedata[i / (8 / BitsPerPointer)] >> (i * BitsPerPointer & 7) & 3], scanp[i]); + switch(bv->bytedata[i / (8 / BitsPerPointer)] >> (i * BitsPerPointer & 7) & 3) { + case BitsDead: + if(runtime·debug.gcdead) + scanp[i] = (byte*)PoisonStack; + break; + case BitsScalar: + break; + case BitsPointer: + p = scanp[i]; + if(f != nil && (byte*)0 < p && (p < (byte*)PageSize && runtime·invalidptr || (uintptr)p == PoisonGC || (uintptr)p == PoisonStack)) { + // Looks like a junk value in a pointer slot. + // Live analysis wrong? + g->m->traceback = 2; + runtime·printf("runtime: bad pointer in frame %s at %p: %p\n", runtime·funcname(f), &scanp[i], p); + runtime·throw("invalid stack pointer"); + } + if(minp <= p && p < maxp) { + if(StackDebug >= 3) + runtime·printf("adjust ptr %p %s\n", p, runtime·funcname(f)); + scanp[i] = p + delta; + } + break; + case BitsMultiWord: + switch(bv->bytedata[(i+1) / (8 / BitsPerPointer)] >> ((i+1) * BitsPerPointer & 7) & 3) { + default: + runtime·throw("unexpected garbage collection bits"); + case BitsEface: + t = (Type*)scanp[i]; + if(t != nil && ((t->kind & KindDirectIface) == 0 || (t->kind & KindNoPointers) == 0)) { + p = scanp[i+1]; + if(minp <= p && p < maxp) { + if(StackDebug >= 3) + runtime·printf("adjust eface %p\n", p); + if(t->size > PtrSize) // currently we always allocate such objects on the heap + runtime·throw("large interface value found on stack"); + scanp[i+1] = p + delta; + } + } + i++; + break; + case BitsIface: + tab = (Itab*)scanp[i]; + if(tab != nil) { + t = tab->type; + //runtime·printf(" type=%p\n", t); + if((t->kind & KindDirectIface) == 0 || (t->kind & KindNoPointers) == 0) { + p = scanp[i+1]; + if(minp <= p && p < maxp) { + if(StackDebug >= 3) + runtime·printf("adjust iface %p\n", p); + if(t->size > PtrSize) // currently we always allocate such objects on the heap + runtime·throw("large interface value found on stack"); + scanp[i+1] = p + delta; + } + } + } + i++; + break; + } + break; + } + } +} + +// Note: the argument/return area is adjusted by the callee. +static bool +adjustframe(Stkframe *frame, void *arg) +{ + AdjustInfo *adjinfo; + Func *f; + StackMap *stackmap; + int32 pcdata; + BitVector bv; + uintptr targetpc, size, minsize; + + adjinfo = arg; + targetpc = frame->continpc; + if(targetpc == 0) { + // Frame is dead. + return true; + } + f = frame->fn; + if(StackDebug >= 2) + runtime·printf(" adjusting %s frame=[%p,%p] pc=%p continpc=%p\n", runtime·funcname(f), frame->sp, frame->fp, frame->pc, frame->continpc); + if(f->entry == (uintptr)runtime·switchtoM) { + // A special routine at the bottom of stack of a goroutine that does an onM call. 
+ // We will allow it to be copied even though we don't + // have full GC info for it (because it is written in asm). + return true; + } + if(targetpc != f->entry) + targetpc--; + pcdata = runtime·pcdatavalue(f, PCDATA_StackMapIndex, targetpc); + if(pcdata == -1) + pcdata = 0; // in prologue + + // Adjust local variables if stack frame has been allocated. + size = frame->varp - frame->sp; + if(thechar != '6' && thechar != '8') + minsize = sizeof(uintptr); + else + minsize = 0; + if(size > minsize) { + stackmap = runtime·funcdata(f, FUNCDATA_LocalsPointerMaps); + if(stackmap == nil || stackmap->n <= 0) { + runtime·printf("runtime: frame %s untyped locals %p+%p\n", runtime·funcname(f), (byte*)(frame->varp-size), size); + runtime·throw("missing stackmap"); + } + // Locals bitmap information, scan just the pointers in locals. + if(pcdata < 0 || pcdata >= stackmap->n) { + // don't know where we are + runtime·printf("runtime: pcdata is %d and %d locals stack map entries for %s (targetpc=%p)\n", + pcdata, stackmap->n, runtime·funcname(f), targetpc); + runtime·throw("bad symbol table"); + } + bv = runtime·stackmapdata(stackmap, pcdata); + size = (bv.n * PtrSize) / BitsPerPointer; + if(StackDebug >= 3) + runtime·printf(" locals\n"); + adjustpointers((byte**)(frame->varp - size), &bv, adjinfo, f); + } + + // Adjust arguments. + if(frame->arglen > 0) { + if(frame->argmap != nil) { + bv = *frame->argmap; + } else { + stackmap = runtime·funcdata(f, FUNCDATA_ArgsPointerMaps); + if(stackmap == nil || stackmap->n <= 0) { + runtime·printf("runtime: frame %s untyped args %p+%p\n", runtime·funcname(f), frame->argp, (uintptr)frame->arglen); + runtime·throw("missing stackmap"); + } + if(pcdata < 0 || pcdata >= stackmap->n) { + // don't know where we are + runtime·printf("runtime: pcdata is %d and %d args stack map entries for %s (targetpc=%p)\n", + pcdata, stackmap->n, runtime·funcname(f), targetpc); + runtime·throw("bad symbol table"); + } + bv = runtime·stackmapdata(stackmap, pcdata); + } + if(StackDebug >= 3) + runtime·printf(" args\n"); + adjustpointers((byte**)frame->argp, &bv, adjinfo, nil); + } + + return true; +} + +static void +adjustctxt(G *gp, AdjustInfo *adjinfo) +{ + adjustpointer(adjinfo, &gp->sched.ctxt); +} + +static void +adjustdefers(G *gp, AdjustInfo *adjinfo) +{ + Defer *d; + bool (*cb)(Stkframe*, void*); + + // Adjust defer argument blocks the same way we adjust active stack frames. + cb = adjustframe; + runtime·tracebackdefers(gp, &cb, adjinfo); + + // Adjust pointers in the Defer structs. + // Defer structs themselves are never on the stack. + for(d = gp->defer; d != nil; d = d->link) { + adjustpointer(adjinfo, &d->fn); + adjustpointer(adjinfo, &d->argp); + adjustpointer(adjinfo, &d->panic); + } +} + +static void +adjustpanics(G *gp, AdjustInfo *adjinfo) +{ + // Panics are on stack and already adjusted. + // Update pointer to head of list in G. + adjustpointer(adjinfo, &gp->panic); +} + +static void +adjustsudogs(G *gp, AdjustInfo *adjinfo) +{ + SudoG *s; + + // the data elements pointed to by a SudoG structure + // might be in the stack. + for(s = gp->waiting; s != nil; s = s->waitlink) { + adjustpointer(adjinfo, &s->elem); + adjustpointer(adjinfo, &s->selectdone); + } +} + +// Copies gp's stack to a new stack of a different size. 
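+// The steps are: allocate the new stack, walk the old frames with
+// gentraceback adjusting every pointer into the old stack (frames, defers,
+// panics, sudogs, sched.ctxt), memmove the used portion across, switch
+// gp->stack and gp->sched.sp to the new stack, and free the old one
+// (immediately when growing, via stackfreequeue when shrinking).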
+static void +copystack(G *gp, uintptr newsize) +{ + Stack old, new; + uintptr used; + AdjustInfo adjinfo; + uint32 oldstatus; + bool (*cb)(Stkframe*, void*); + byte *p, *ep; + + if(gp->syscallsp != 0) + runtime·throw("stack growth not allowed in system call"); + old = gp->stack; + if(old.lo == 0) + runtime·throw("nil stackbase"); + used = old.hi - gp->sched.sp; + + // allocate new stack + new = runtime·stackalloc(newsize); + if(StackPoisonCopy) { + p = (byte*)new.lo; + ep = (byte*)new.hi; + while(p < ep) + *p++ = 0xfd; + } + + if(StackDebug >= 1) + runtime·printf("copystack gp=%p [%p %p %p]/%d -> [%p %p %p]/%d\n", gp, old.lo, old.hi-used, old.hi, (int32)(old.hi-old.lo), new.lo, new.hi-used, new.hi, (int32)newsize); + + // adjust pointers in the to-be-copied frames + adjinfo.old = old; + adjinfo.delta = new.hi - old.hi; + cb = adjustframe; + runtime·gentraceback(~(uintptr)0, ~(uintptr)0, 0, gp, 0, nil, 0x7fffffff, &cb, &adjinfo, 0); + + // adjust other miscellaneous things that have pointers into stacks. + adjustctxt(gp, &adjinfo); + adjustdefers(gp, &adjinfo); + adjustpanics(gp, &adjinfo); + adjustsudogs(gp, &adjinfo); + + // copy the stack to the new location + if(StackPoisonCopy) { + p = (byte*)new.lo; + ep = (byte*)new.hi; + while(p < ep) + *p++ = 0xfb; + } + runtime·memmove((byte*)new.hi - used, (byte*)old.hi - used, used); + + oldstatus = runtime·casgcopystack(gp); // cas from Gwaiting or Grunnable to Gcopystack, return old status + + // Swap out old stack for new one + gp->stack = new; + gp->stackguard0 = new.lo + StackGuard; // NOTE: might clobber a preempt request + gp->sched.sp = new.hi - used; + + runtime·casgstatus(gp, Gcopystack, oldstatus); // oldstatus is Gwaiting or Grunnable + + // free old stack + if(StackPoisonCopy) { + p = (byte*)old.lo; + ep = (byte*)old.hi; + while(p < ep) + *p++ = 0xfc; + } + if(newsize > old.hi-old.lo) { + // growing, free stack immediately + runtime·stackfree(old); + } else { + // shrinking, queue up free operation. We can't actually free the stack + // just yet because we might run into the following situation: + // 1) GC starts, scans a SudoG but does not yet mark the SudoG.elem pointer + // 2) The stack that pointer points to is shrunk + // 3) The old stack is freed + // 4) The containing span is marked free + // 5) GC attempts to mark the SudoG.elem pointer. The marking fails because + // the pointer looks like a pointer into a free span. + // By not freeing, we prevent step #4 until GC is done. + runtime·lock(&runtime·stackpoolmu); + *(Stack*)old.lo = stackfreequeue; + stackfreequeue = old; + runtime·unlock(&runtime·stackpoolmu); + } +} + +// round x up to a power of 2. +int32 +runtime·round2(int32 x) +{ + int32 s; + + s = 0; + while((1 << s) < x) + s++; + return 1 << s; +} + +// Called from runtime·morestack when more stack is needed. +// Allocate larger stack and relocate to new stack. +// Stack growth is multiplicative, for constant amortized cost. +// +// g->atomicstatus will be Grunning or Gscanrunning upon entry. +// If the GC is trying to stop this g then it will set preemptscan to true. 
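+// The new stack is simply twice the old size, so a goroutine that ends up
+// needing N bytes pays at most O(N) bytes of copying in total. Growth is
+// refused with "stack overflow" once the doubled size would exceed
+// runtime·maxstacksize (initialized to 1MB above, until runtime.main sets
+// the real limit).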
+void +runtime·newstack(void) +{ + int32 oldsize, newsize; + uintptr sp; + G *gp; + Gobuf morebuf; + + if(g->m->morebuf.g->stackguard0 == (uintptr)StackFork) + runtime·throw("stack growth after fork"); + if(g->m->morebuf.g != g->m->curg) { + runtime·printf("runtime: newstack called from g=%p\n" + "\tm=%p m->curg=%p m->g0=%p m->gsignal=%p\n", + g->m->morebuf.g, g->m, g->m->curg, g->m->g0, g->m->gsignal); + morebuf = g->m->morebuf; + runtime·traceback(morebuf.pc, morebuf.sp, morebuf.lr, morebuf.g); + runtime·throw("runtime: wrong goroutine in newstack"); + } + if(g->m->curg->throwsplit) + runtime·throw("runtime: stack split at bad time"); + + // The goroutine must be executing in order to call newstack, + // so it must be Grunning or Gscanrunning. + + gp = g->m->curg; + morebuf = g->m->morebuf; + g->m->morebuf.pc = (uintptr)nil; + g->m->morebuf.lr = (uintptr)nil; + g->m->morebuf.sp = (uintptr)nil; + g->m->morebuf.g = (G*)nil; + + runtime·casgstatus(gp, Grunning, Gwaiting); + gp->waitreason = runtime·gostringnocopy((byte*)"stack growth"); + + runtime·rewindmorestack(&gp->sched); + + if(gp->stack.lo == 0) + runtime·throw("missing stack in newstack"); + sp = gp->sched.sp; + if(thechar == '6' || thechar == '8') { + // The call to morestack cost a word. + sp -= sizeof(uintreg); + } + if(StackDebug >= 1 || sp < gp->stack.lo) { + runtime·printf("runtime: newstack sp=%p stack=[%p, %p]\n" + "\tmorebuf={pc:%p sp:%p lr:%p}\n" + "\tsched={pc:%p sp:%p lr:%p ctxt:%p}\n", + sp, gp->stack.lo, gp->stack.hi, + g->m->morebuf.pc, g->m->morebuf.sp, g->m->morebuf.lr, + gp->sched.pc, gp->sched.sp, gp->sched.lr, gp->sched.ctxt); + } + if(sp < gp->stack.lo) { + runtime·printf("runtime: gp=%p, gp->status=%d\n ", (void*)gp, runtime·readgstatus(gp)); + runtime·printf("runtime: split stack overflow: %p < %p\n", sp, gp->stack.lo); + runtime·throw("runtime: split stack overflow"); + } + + if(gp->stackguard0 == (uintptr)StackPreempt) { + if(gp == g->m->g0) + runtime·throw("runtime: preempt g0"); + if(g->m->p == nil && g->m->locks == 0) + runtime·throw("runtime: g is running but p is not"); + if(gp->preemptscan) { + runtime·gcphasework(gp); + runtime·casgstatus(gp, Gwaiting, Grunning); + gp->stackguard0 = gp->stack.lo + StackGuard; + gp->preempt = false; + gp->preemptscan = false; // Tells the GC premption was successful. + runtime·gogo(&gp->sched); // never return + } + + // Be conservative about where we preempt. + // We are interested in preempting user Go code, not runtime code. + if(g->m->locks || g->m->mallocing || g->m->gcing || g->m->p->status != Prunning) { + // Let the goroutine keep running for now. + // gp->preempt is set, so it will be preempted next time. + gp->stackguard0 = gp->stack.lo + StackGuard; + runtime·casgstatus(gp, Gwaiting, Grunning); + runtime·gogo(&gp->sched); // never return + } + // Act like goroutine called runtime.Gosched. + runtime·casgstatus(gp, Gwaiting, Grunning); + runtime·gosched_m(gp); // never return + } + + // Allocate a bigger segment and move the stack. + oldsize = gp->stack.hi - gp->stack.lo; + newsize = oldsize * 2; + if(newsize > runtime·maxstacksize) { + runtime·printf("runtime: goroutine stack exceeds %D-byte limit\n", (uint64)runtime·maxstacksize); + runtime·throw("stack overflow"); + } + + // Note that the concurrent GC might be scanning the stack as we try to replace it. + // copystack takes care of the appropriate coordination with the stack scanner. 
+ copystack(gp, newsize); + if(StackDebug >= 1) + runtime·printf("stack grow done\n"); + runtime·casgstatus(gp, Gwaiting, Grunning); + runtime·gogo(&gp->sched); +} + +#pragma textflag NOSPLIT +void +runtime·nilfunc(void) +{ + *(byte*)0 = 0; +} + +// adjust Gobuf as if it executed a call to fn +// and then did an immediate gosave. +void +runtime·gostartcallfn(Gobuf *gobuf, FuncVal *fv) +{ + void *fn; + + if(fv != nil) + fn = fv->fn; + else + fn = runtime·nilfunc; + runtime·gostartcall(gobuf, fn, fv); +} + +// Maybe shrink the stack being used by gp. +// Called at garbage collection time. +void +runtime·shrinkstack(G *gp) +{ + uintptr used, oldsize, newsize; + + if(runtime·readgstatus(gp) == Gdead) { + if(gp->stack.lo != 0) { + // Free whole stack - it will get reallocated + // if G is used again. + runtime·stackfree(gp->stack); + gp->stack.lo = 0; + gp->stack.hi = 0; + } + return; + } + if(gp->stack.lo == 0) + runtime·throw("missing stack in shrinkstack"); + + oldsize = gp->stack.hi - gp->stack.lo; + newsize = oldsize / 2; + if(newsize < FixedStack) + return; // don't shrink below the minimum-sized stack + used = gp->stack.hi - gp->sched.sp; + if(used >= oldsize / 4) + return; // still using at least 1/4 of the segment. + + // We can't copy the stack if we're in a syscall. + // The syscall might have pointers into the stack. + if(gp->syscallsp != 0) + return; + +#ifdef GOOS_windows + if(gp->m != nil && gp->m->libcallsp != 0) + return; +#endif + if(StackDebug > 0) + runtime·printf("shrinking stack %D->%D\n", (uint64)oldsize, (uint64)newsize); + copystack(gp, newsize); +} + +// Do any delayed stack freeing that was queued up during GC. +void +runtime·shrinkfinish(void) +{ + Stack s, t; + + runtime·lock(&runtime·stackpoolmu); + s = stackfreequeue; + stackfreequeue = (Stack){0,0}; + runtime·unlock(&runtime·stackpoolmu); + while(s.lo != 0) { + t = *(Stack*)s.lo; + runtime·stackfree(s); + s = t; + } +} + +static void badc(void); + +#pragma textflag NOSPLIT +void +runtime·morestackc(void) +{ + void (*fn)(void); + + fn = badc; + runtime·onM(&fn); +} + +static void +badc(void) +{ + runtime·throw("attempt to execute C code on Go stack"); +} diff --git a/src/runtime/stack.go b/src/runtime/stack.go new file mode 100644 index 000000000..f1b7d32d2 --- /dev/null +++ b/src/runtime/stack.go @@ -0,0 +1,13 @@ +// Copyright 2011 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package runtime + +const ( + // Goroutine preemption request. + // Stored into g->stackguard0 to cause split stack check failure. + // Must be greater than any real sp. + // 0xfffffade in hex. + stackPreempt = ^uintptr(1313) +) diff --git a/src/runtime/stack.h b/src/runtime/stack.h new file mode 100644 index 000000000..f97dc4ed8 --- /dev/null +++ b/src/runtime/stack.h @@ -0,0 +1,118 @@ +// Copyright 2011 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +/* +Stack layout parameters. +Included both by runtime (compiled via 6c) and linkers (compiled via gcc). + +The per-goroutine g->stackguard is set to point StackGuard bytes +above the bottom of the stack. Each function compares its stack +pointer against g->stackguard to check for overflow. To cut one +instruction from the check sequence for functions with tiny frames, +the stack is allowed to protrude StackSmall bytes below the stack +guard. 
Functions with large frames don't bother with the check and +always call morestack. The sequences are (for amd64, others are +similar): + + guard = g->stackguard + frame = function's stack frame size + argsize = size of function arguments (call + return) + + stack frame size <= StackSmall: + CMPQ guard, SP + JHI 3(PC) + MOVQ m->morearg, $(argsize << 32) + CALL morestack(SB) + + stack frame size > StackSmall but < StackBig + LEAQ (frame-StackSmall)(SP), R0 + CMPQ guard, R0 + JHI 3(PC) + MOVQ m->morearg, $(argsize << 32) + CALL morestack(SB) + + stack frame size >= StackBig: + MOVQ m->morearg, $((argsize << 32) | frame) + CALL morestack(SB) + +The bottom StackGuard - StackSmall bytes are important: there has +to be enough room to execute functions that refuse to check for +stack overflow, either because they need to be adjacent to the +actual caller's frame (deferproc) or because they handle the imminent +stack overflow (morestack). + +For example, deferproc might call malloc, which does one of the +above checks (without allocating a full frame), which might trigger +a call to morestack. This sequence needs to fit in the bottom +section of the stack. On amd64, morestack's frame is 40 bytes, and +deferproc's frame is 56 bytes. That fits well within the +StackGuard - StackSmall bytes at the bottom. +The linkers explore all possible call traces involving non-splitting +functions to make sure that this limit cannot be violated. + */ + +enum { + // StackSystem is a number of additional bytes to add + // to each stack below the usual guard area for OS-specific + // purposes like signal handling. Used on Windows and on + // Plan 9 because they do not use a separate stack. +#ifdef GOOS_windows + StackSystem = 512 * sizeof(uintptr), +#else +#ifdef GOOS_plan9 + // The size of the note handler frame varies among architectures, + // but 512 bytes should be enough for every implementation. + StackSystem = 512, +#else + StackSystem = 0, +#endif // Plan 9 +#endif // Windows + + // The minimum size of stack used by Go code + StackMin = 2048, + + // The minimum stack size to allocate. + // The hackery here rounds FixedStack0 up to a power of 2. + FixedStack0 = StackMin + StackSystem, + FixedStack1 = FixedStack0 - 1, + FixedStack2 = FixedStack1 | (FixedStack1 >> 1), + FixedStack3 = FixedStack2 | (FixedStack2 >> 2), + FixedStack4 = FixedStack3 | (FixedStack3 >> 4), + FixedStack5 = FixedStack4 | (FixedStack4 >> 8), + FixedStack6 = FixedStack5 | (FixedStack5 >> 16), + FixedStack = FixedStack6 + 1, + + // Functions that need frames bigger than this use an extra + // instruction to do the stack split check, to avoid overflow + // in case SP - framesize wraps below zero. + // This value can be no bigger than the size of the unmapped + // space at zero. + StackBig = 4096, + + // The stack guard is a pointer this many bytes above the + // bottom of the stack. + StackGuard = 512 + StackSystem, + + // After a stack split check the SP is allowed to be this + // many bytes below the stack guard. This saves an instruction + // in the checking sequence for tiny frames. + StackSmall = 128, + + // The maximum number of bytes that a chain of NOSPLIT + // functions can use. + StackLimit = StackGuard - StackSystem - StackSmall, +}; + +// Goroutine preemption request. +// Stored into g->stackguard0 to cause split stack check failure. +// Must be greater than any real sp. +// 0xfffffade in hex. 
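+// This matches stackPreempt = ^uintptr(1313) in stack.go: both encode the
+// same bit pattern, 0xfffffade when truncated to 32 bits.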
+#define StackPreempt ((uint64)-1314) +/*c2go +enum +{ + StackPreempt = -1314, +}; +*/ +#define StackFork ((uint64)-1234) diff --git a/src/runtime/stack_test.go b/src/runtime/stack_test.go new file mode 100644 index 000000000..652c72eee --- /dev/null +++ b/src/runtime/stack_test.go @@ -0,0 +1,397 @@ +// Copyright 2012 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package runtime_test + +import ( + . "runtime" + "strings" + "sync" + "testing" + "time" +) + +// TestStackMem measures per-thread stack segment cache behavior. +// The test consumed up to 500MB in the past. +func TestStackMem(t *testing.T) { + const ( + BatchSize = 32 + BatchCount = 256 + ArraySize = 1024 + RecursionDepth = 128 + ) + if testing.Short() { + return + } + defer GOMAXPROCS(GOMAXPROCS(BatchSize)) + s0 := new(MemStats) + ReadMemStats(s0) + for b := 0; b < BatchCount; b++ { + c := make(chan bool, BatchSize) + for i := 0; i < BatchSize; i++ { + go func() { + var f func(k int, a [ArraySize]byte) + f = func(k int, a [ArraySize]byte) { + if k == 0 { + time.Sleep(time.Millisecond) + return + } + f(k-1, a) + } + f(RecursionDepth, [ArraySize]byte{}) + c <- true + }() + } + for i := 0; i < BatchSize; i++ { + <-c + } + + // The goroutines have signaled via c that they are ready to exit. + // Give them a chance to exit by sleeping. If we don't wait, we + // might not reuse them on the next batch. + time.Sleep(10 * time.Millisecond) + } + s1 := new(MemStats) + ReadMemStats(s1) + consumed := int64(s1.StackSys - s0.StackSys) + t.Logf("Consumed %vMB for stack mem", consumed>>20) + estimate := int64(8 * BatchSize * ArraySize * RecursionDepth) // 8 is to reduce flakiness. + if consumed > estimate { + t.Fatalf("Stack mem: want %v, got %v", estimate, consumed) + } + // Due to broken stack memory accounting (http://golang.org/issue/7468), + // StackInuse can decrease during function execution, so we cast the values to int64. + inuse := int64(s1.StackInuse) - int64(s0.StackInuse) + t.Logf("Inuse %vMB for stack mem", inuse>>20) + if inuse > 4<<20 { + t.Fatalf("Stack inuse: want %v, got %v", 4<<20, inuse) + } +} + +// Test stack growing in different contexts. +func TestStackGrowth(t *testing.T) { + t.Parallel() + var wg sync.WaitGroup + + // in a normal goroutine + wg.Add(1) + go func() { + defer wg.Done() + growStack() + }() + wg.Wait() + + // in locked goroutine + wg.Add(1) + go func() { + defer wg.Done() + LockOSThread() + growStack() + UnlockOSThread() + }() + wg.Wait() + + // in finalizer + wg.Add(1) + go func() { + defer wg.Done() + done := make(chan bool) + go func() { + s := new(string) + SetFinalizer(s, func(ss *string) { + growStack() + done <- true + }) + s = nil + done <- true + }() + <-done + GC() + select { + case <-done: + case <-time.After(20 * time.Second): + t.Fatal("finalizer did not run") + } + }() + wg.Wait() +} + +// ... and in init +//func init() { +// growStack() +//} + +func growStack() { + n := 1 << 10 + if testing.Short() { + n = 1 << 8 + } + for i := 0; i < n; i++ { + x := 0 + growStackIter(&x, i) + if x != i+1 { + panic("stack is corrupted") + } + } + GC() +} + +// This function is not an anonymous func, so that the compiler can do escape +// analysis and place x on stack (and subsequently stack growth update the pointer). 
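+// Each frame passes &x down one level and the callee writes n+1 through it
+// (the base case also forces a GC), so the check after the recursive call
+// fails if a stack copy left that pointer aimed at the old stack.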
+func growStackIter(p *int, n int) { + if n == 0 { + *p = n + 1 + GC() + return + } + *p = n + 1 + x := 0 + growStackIter(&x, n-1) + if x != n { + panic("stack is corrupted") + } +} + +func TestStackGrowthCallback(t *testing.T) { + t.Parallel() + var wg sync.WaitGroup + + // test stack growth at chan op + wg.Add(1) + go func() { + defer wg.Done() + c := make(chan int, 1) + growStackWithCallback(func() { + c <- 1 + <-c + }) + }() + + // test stack growth at map op + wg.Add(1) + go func() { + defer wg.Done() + m := make(map[int]int) + growStackWithCallback(func() { + _, _ = m[1] + m[1] = 1 + }) + }() + + // test stack growth at goroutine creation + wg.Add(1) + go func() { + defer wg.Done() + growStackWithCallback(func() { + done := make(chan bool) + go func() { + done <- true + }() + <-done + }) + }() + + wg.Wait() +} + +func growStackWithCallback(cb func()) { + var f func(n int) + f = func(n int) { + if n == 0 { + cb() + return + } + f(n - 1) + } + for i := 0; i < 1<<10; i++ { + f(i) + } +} + +// TestDeferPtrs tests the adjustment of Defer's argument pointers (p aka &y) +// during a stack copy. +func set(p *int, x int) { + *p = x +} +func TestDeferPtrs(t *testing.T) { + var y int + + defer func() { + if y != 42 { + t.Errorf("defer's stack references were not adjusted appropriately") + } + }() + defer set(&y, 42) + growStack() +} + +type bigBuf [4 * 1024]byte + +// TestDeferPtrsGoexit is like TestDeferPtrs but exercises the possibility that the +// stack grows as part of starting the deferred function. It calls Goexit at various +// stack depths, forcing the deferred function (with >4kB of args) to be run at +// the bottom of the stack. The goal is to find a stack depth less than 4kB from +// the end of the stack. Each trial runs in a different goroutine so that an earlier +// stack growth does not invalidate a later attempt. +func TestDeferPtrsGoexit(t *testing.T) { + for i := 0; i < 100; i++ { + c := make(chan int, 1) + go testDeferPtrsGoexit(c, i) + if n := <-c; n != 42 { + t.Fatalf("defer's stack references were not adjusted appropriately (i=%d n=%d)", i, n) + } + } +} + +func testDeferPtrsGoexit(c chan int, i int) { + var y int + defer func() { + c <- y + }() + defer setBig(&y, 42, bigBuf{}) + useStackAndCall(i, Goexit) +} + +func setBig(p *int, x int, b bigBuf) { + *p = x +} + +// TestDeferPtrsPanic is like TestDeferPtrsGoexit, but it's using panic instead +// of Goexit to run the Defers. Those two are different execution paths +// in the runtime. +func TestDeferPtrsPanic(t *testing.T) { + for i := 0; i < 100; i++ { + c := make(chan int, 1) + go testDeferPtrsGoexit(c, i) + if n := <-c; n != 42 { + t.Fatalf("defer's stack references were not adjusted appropriately (i=%d n=%d)", i, n) + } + } +} + +func testDeferPtrsPanic(c chan int, i int) { + var y int + defer func() { + if recover() == nil { + c <- -1 + return + } + c <- y + }() + defer setBig(&y, 42, bigBuf{}) + useStackAndCall(i, func() { panic(1) }) +} + +// TestPanicUseStack checks that a chain of Panic structs on the stack are +// updated correctly if the stack grows during the deferred execution that +// happens as a result of the panic. 
+func TestPanicUseStack(t *testing.T) { + pc := make([]uintptr, 10000) + defer func() { + recover() + Callers(0, pc) // force stack walk + useStackAndCall(100, func() { + defer func() { + recover() + Callers(0, pc) // force stack walk + useStackAndCall(200, func() { + defer func() { + recover() + Callers(0, pc) // force stack walk + }() + panic(3) + }) + }() + panic(2) + }) + }() + panic(1) +} + +// use about n KB of stack and call f +func useStackAndCall(n int, f func()) { + if n == 0 { + f() + return + } + var b [1024]byte // makes frame about 1KB + useStackAndCall(n-1+int(b[99]), f) +} + +func useStack(n int) { + useStackAndCall(n, func() {}) +} + +func growing(c chan int, done chan struct{}) { + for n := range c { + useStack(n) + done <- struct{}{} + } + done <- struct{}{} +} + +func TestStackCache(t *testing.T) { + // Allocate a bunch of goroutines and grow their stacks. + // Repeat a few times to test the stack cache. + const ( + R = 4 + G = 200 + S = 5 + ) + for i := 0; i < R; i++ { + var reqchans [G]chan int + done := make(chan struct{}) + for j := 0; j < G; j++ { + reqchans[j] = make(chan int) + go growing(reqchans[j], done) + } + for s := 0; s < S; s++ { + for j := 0; j < G; j++ { + reqchans[j] <- 1 << uint(s) + } + for j := 0; j < G; j++ { + <-done + } + } + for j := 0; j < G; j++ { + close(reqchans[j]) + } + for j := 0; j < G; j++ { + <-done + } + } +} + +func TestStackOutput(t *testing.T) { + b := make([]byte, 1024) + stk := string(b[:Stack(b, false)]) + if !strings.HasPrefix(stk, "goroutine ") { + t.Errorf("Stack (len %d):\n%s", len(stk), stk) + t.Errorf("Stack output should begin with \"goroutine \"") + } +} + +func TestStackAllOutput(t *testing.T) { + b := make([]byte, 1024) + stk := string(b[:Stack(b, true)]) + if !strings.HasPrefix(stk, "goroutine ") { + t.Errorf("Stack (len %d):\n%s", len(stk), stk) + t.Errorf("Stack output should begin with \"goroutine \"") + } +} + +func TestStackPanic(t *testing.T) { + // Test that stack copying copies panics correctly. This is difficult + // to test because it is very unlikely that the stack will be copied + // in the middle of gopanic. But it can happen. + // To make this test effective, edit panic.go:gopanic and uncomment + // the GC() call just before freedefer(d). + defer func() { + if x := recover(); x == nil { + t.Errorf("recover failed") + } + }() + useStack(32) + panic("test panic") +} diff --git a/src/runtime/string.c b/src/runtime/string.c new file mode 100644 index 000000000..ed5debc33 --- /dev/null +++ b/src/runtime/string.c @@ -0,0 +1,226 @@ +// Copyright 2009 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. 
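+
+// C-side string helpers used by the runtime itself: NUL-terminated length
+// (findnull, findnullw), zero-copy C string to Go string conversion
+// (gostringnocopy), a small UTF-8 encoder (runetochar) used by gostringw,
+// and byte-wise strcmp, strncmp, and strstr.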
+ +#include "runtime.h" +#include "arch_GOARCH.h" +#include "malloc.h" +#include "race.h" +#include "textflag.h" + +String runtime·emptystring; + +#pragma textflag NOSPLIT +intgo +runtime·findnull(byte *s) +{ + intgo l; + + if(s == nil) + return 0; + for(l=0; s[l]!=0; l++) + ; + return l; +} + +intgo +runtime·findnullw(uint16 *s) +{ + intgo l; + + if(s == nil) + return 0; + for(l=0; s[l]!=0; l++) + ; + return l; +} + +uintptr runtime·maxstring = 256; // a hint for print + +#pragma textflag NOSPLIT +String +runtime·gostringnocopy(byte *str) +{ + String s; + uintptr ms; + + s.str = str; + s.len = runtime·findnull(str); + while(true) { + ms = runtime·maxstring; + if(s.len <= ms || runtime·casp((void**)&runtime·maxstring, (void*)ms, (void*)s.len)) + return s; + } +} + +// TODO: move this elsewhere +enum +{ + Bit1 = 7, + Bitx = 6, + Bit2 = 5, + Bit3 = 4, + Bit4 = 3, + Bit5 = 2, + + Tx = ((1<<(Bitx+1))-1) ^ 0xFF, /* 1000 0000 */ + T2 = ((1<<(Bit2+1))-1) ^ 0xFF, /* 1100 0000 */ + T3 = ((1<<(Bit3+1))-1) ^ 0xFF, /* 1110 0000 */ + T4 = ((1<<(Bit4+1))-1) ^ 0xFF, /* 1111 0000 */ + + Rune1 = (1<<(Bit1+0*Bitx))-1, /* 0000 0000 0111 1111 */ + Rune2 = (1<<(Bit2+1*Bitx))-1, /* 0000 0111 1111 1111 */ + Rune3 = (1<<(Bit3+2*Bitx))-1, /* 1111 1111 1111 1111 */ + + Maskx = (1<<Bitx)-1, /* 0011 1111 */ + + Runeerror = 0xFFFD, + + SurrogateMin = 0xD800, + SurrogateMax = 0xDFFF, + + Runemax = 0x10FFFF, /* maximum rune value */ +}; + +static int32 +runetochar(byte *str, int32 rune) /* note: in original, arg2 was pointer */ +{ + /* Runes are signed, so convert to unsigned for range check. */ + uint32 c; + + /* + * one character sequence + * 00000-0007F => 00-7F + */ + c = rune; + if(c <= Rune1) { + str[0] = c; + return 1; + } + + /* + * two character sequence + * 0080-07FF => T2 Tx + */ + if(c <= Rune2) { + str[0] = T2 | (c >> 1*Bitx); + str[1] = Tx | (c & Maskx); + return 2; + } + + /* + * If the Rune is out of range or a surrogate half, convert it to the error rune. + * Do this test here because the error rune encodes to three bytes. + * Doing it earlier would duplicate work, since an out of range + * Rune wouldn't have fit in one or two bytes. 
+ */ + if (c > Runemax) + c = Runeerror; + if (SurrogateMin <= c && c <= SurrogateMax) + c = Runeerror; + + /* + * three character sequence + * 0800-FFFF => T3 Tx Tx + */ + if (c <= Rune3) { + str[0] = T3 | (c >> 2*Bitx); + str[1] = Tx | ((c >> 1*Bitx) & Maskx); + str[2] = Tx | (c & Maskx); + return 3; + } + + /* + * four character sequence (21-bit value) + * 10000-1FFFFF => T4 Tx Tx Tx + */ + str[0] = T4 | (c >> 3*Bitx); + str[1] = Tx | ((c >> 2*Bitx) & Maskx); + str[2] = Tx | ((c >> 1*Bitx) & Maskx); + str[3] = Tx | (c & Maskx); + return 4; +} + +String runtime·gostringsize(intgo); + +String +runtime·gostringw(uint16 *str) +{ + intgo n1, n2, i; + byte buf[8]; + String s; + + n1 = 0; + for(i=0; str[i]; i++) + n1 += runetochar(buf, str[i]); + s = runtime·gostringsize(n1+4); + n2 = 0; + for(i=0; str[i]; i++) { + // check for race + if(n2 >= n1) + break; + n2 += runetochar(s.str+n2, str[i]); + } + s.len = n2; + s.str[s.len] = 0; + return s; +} + +int32 +runtime·strcmp(byte *s1, byte *s2) +{ + uintptr i; + byte c1, c2; + + for(i=0;; i++) { + c1 = s1[i]; + c2 = s2[i]; + if(c1 < c2) + return -1; + if(c1 > c2) + return +1; + if(c1 == 0) + return 0; + } +} + +int32 +runtime·strncmp(byte *s1, byte *s2, uintptr n) +{ + uintptr i; + byte c1, c2; + + for(i=0; i<n; i++) { + c1 = s1[i]; + c2 = s2[i]; + if(c1 < c2) + return -1; + if(c1 > c2) + return +1; + if(c1 == 0) + break; + } + return 0; +} + +byte* +runtime·strstr(byte *s1, byte *s2) +{ + byte *sp1, *sp2; + + if(*s2 == 0) + return s1; + for(; *s1; s1++) { + if(*s1 != *s2) + continue; + sp1 = s1; + sp2 = s2; + for(;;) { + if(*sp2 == 0) + return s1; + if(*sp1++ != *sp2++) + break; + } + } + return nil; +} diff --git a/src/runtime/string.go b/src/runtime/string.go new file mode 100644 index 000000000..0809f89bc --- /dev/null +++ b/src/runtime/string.go @@ -0,0 +1,298 @@ +// Copyright 2014 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package runtime + +import ( + "unsafe" +) + +func concatstrings(a []string) string { + idx := 0 + l := 0 + count := 0 + for i, x := range a { + n := len(x) + if n == 0 { + continue + } + if l+n < l { + gothrow("string concatenation too long") + } + l += n + count++ + idx = i + } + if count == 0 { + return "" + } + if count == 1 { + return a[idx] + } + s, b := rawstring(l) + l = 0 + for _, x := range a { + copy(b[l:], x) + l += len(x) + } + return s +} + +//go:nosplit +func concatstring2(a [2]string) string { + return concatstrings(a[:]) +} + +//go:nosplit +func concatstring3(a [3]string) string { + return concatstrings(a[:]) +} + +//go:nosplit +func concatstring4(a [4]string) string { + return concatstrings(a[:]) +} + +//go:nosplit +func concatstring5(a [5]string) string { + return concatstrings(a[:]) +} + +func slicebytetostring(b []byte) string { + if raceenabled && len(b) > 0 { + racereadrangepc(unsafe.Pointer(&b[0]), + uintptr(len(b)), + getcallerpc(unsafe.Pointer(&b)), + funcPC(slicebytetostring)) + } + s, c := rawstring(len(b)) + copy(c, b) + return s +} + +func slicebytetostringtmp(b []byte) string { + // Return a "string" referring to the actual []byte bytes. + // This is only for use by internal compiler optimizations + // that know that the string form will be discarded before + // the calling goroutine could possibly modify the original + // slice or synchronize with another goroutine. + // Today, the only such case is a m[string(k)] lookup where + // m is a string-keyed map and k is a []byte. 
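+ // For example, in
+ //	m := map[string]int{}
+ //	k := []byte("key")
+ //	v := m[string(k)]
+ // the compiler may rewrite the lookup key to use this function, so no
+ // separate string copy of k is allocated for the lookup.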
+ + if raceenabled && len(b) > 0 { + racereadrangepc(unsafe.Pointer(&b[0]), + uintptr(len(b)), + getcallerpc(unsafe.Pointer(&b)), + funcPC(slicebytetostringtmp)) + } + return *(*string)(unsafe.Pointer(&b)) +} + +func stringtoslicebyte(s string) []byte { + b := rawbyteslice(len(s)) + copy(b, s) + return b +} + +func stringtoslicerune(s string) []rune { + // two passes. + // unlike slicerunetostring, no race because strings are immutable. + n := 0 + t := s + for len(s) > 0 { + _, k := charntorune(s) + s = s[k:] + n++ + } + a := rawruneslice(n) + n = 0 + for len(t) > 0 { + r, k := charntorune(t) + t = t[k:] + a[n] = r + n++ + } + return a +} + +func slicerunetostring(a []rune) string { + if raceenabled && len(a) > 0 { + racereadrangepc(unsafe.Pointer(&a[0]), + uintptr(len(a))*unsafe.Sizeof(a[0]), + getcallerpc(unsafe.Pointer(&a)), + funcPC(slicerunetostring)) + } + var dum [4]byte + size1 := 0 + for _, r := range a { + size1 += runetochar(dum[:], r) + } + s, b := rawstring(size1 + 3) + size2 := 0 + for _, r := range a { + // check for race + if size2 >= size1 { + break + } + size2 += runetochar(b[size2:], r) + } + return s[:size2] +} + +type stringStruct struct { + str unsafe.Pointer + len int +} + +func intstring(v int64) string { + s, b := rawstring(4) + n := runetochar(b, rune(v)) + return s[:n] +} + +// stringiter returns the index of the next +// rune after the rune that starts at s[k]. +func stringiter(s string, k int) int { + if k >= len(s) { + // 0 is end of iteration + return 0 + } + + c := s[k] + if c < runeself { + return k + 1 + } + + // multi-char rune + _, n := charntorune(s[k:]) + return k + n +} + +// stringiter2 returns the rune that starts at s[k] +// and the index where the next rune starts. +func stringiter2(s string, k int) (int, rune) { + if k >= len(s) { + // 0 is end of iteration + return 0, 0 + } + + c := s[k] + if c < runeself { + return k + 1, rune(c) + } + + // multi-char rune + r, n := charntorune(s[k:]) + return k + n, r +} + +// rawstring allocates storage for a new string. The returned +// string and byte slice both refer to the same storage. +// The storage is not zeroed. Callers should use +// b to set the string contents and then drop b. +func rawstring(size int) (s string, b []byte) { + p := mallocgc(uintptr(size), nil, flagNoScan|flagNoZero) + + (*stringStruct)(unsafe.Pointer(&s)).str = p + (*stringStruct)(unsafe.Pointer(&s)).len = size + + (*slice)(unsafe.Pointer(&b)).array = (*uint8)(p) + (*slice)(unsafe.Pointer(&b)).len = uint(size) + (*slice)(unsafe.Pointer(&b)).cap = uint(size) + + for { + ms := maxstring + if uintptr(size) <= uintptr(ms) || casuintptr((*uintptr)(unsafe.Pointer(&maxstring)), uintptr(ms), uintptr(size)) { + return + } + } +} + +// rawbyteslice allocates a new byte slice. The byte slice is not zeroed. +func rawbyteslice(size int) (b []byte) { + cap := goroundupsize(uintptr(size)) + p := mallocgc(cap, nil, flagNoScan|flagNoZero) + if cap != uintptr(size) { + memclr(add(p, uintptr(size)), cap-uintptr(size)) + } + + (*slice)(unsafe.Pointer(&b)).array = (*uint8)(p) + (*slice)(unsafe.Pointer(&b)).len = uint(size) + (*slice)(unsafe.Pointer(&b)).cap = uint(cap) + return +} + +// rawruneslice allocates a new rune slice. The rune slice is not zeroed. 
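+// As with rawbyteslice, the capacity may exceed size because the
+// allocation is rounded up to a size class by goroundupsize.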
+func rawruneslice(size int) (b []rune) { + if uintptr(size) > maxmem/4 { + gothrow("out of memory") + } + mem := goroundupsize(uintptr(size) * 4) + p := mallocgc(mem, nil, flagNoScan|flagNoZero) + if mem != uintptr(size)*4 { + memclr(add(p, uintptr(size)*4), mem-uintptr(size)*4) + } + + (*slice)(unsafe.Pointer(&b)).array = (*uint8)(p) + (*slice)(unsafe.Pointer(&b)).len = uint(size) + (*slice)(unsafe.Pointer(&b)).cap = uint(mem / 4) + return +} + +// used by cmd/cgo +func gobytes(p *byte, n int) []byte { + if n == 0 { + return make([]byte, 0) + } + x := make([]byte, n) + memmove(unsafe.Pointer(&x[0]), unsafe.Pointer(p), uintptr(n)) + return x +} + +func gostringsize(n int) string { + s, _ := rawstring(n) + return s +} + +//go:noescape +func findnull(*byte) int + +func gostring(p *byte) string { + l := findnull(p) + if l == 0 { + return "" + } + s, b := rawstring(l) + memmove(unsafe.Pointer(&b[0]), unsafe.Pointer(p), uintptr(l)) + return s +} + +func gostringn(p *byte, l int) string { + if l == 0 { + return "" + } + s, b := rawstring(l) + memmove(unsafe.Pointer(&b[0]), unsafe.Pointer(p), uintptr(l)) + return s +} + +func index(s, t string) int { + if len(t) == 0 { + return 0 + } + for i := 0; i < len(s); i++ { + if s[i] == t[0] && hasprefix(s[i:], t) { + return i + } + } + return -1 +} + +func contains(s, t string) bool { + return index(s, t) >= 0 +} + +func hasprefix(s, t string) bool { + return len(s) >= len(t) && s[:len(t)] == t +} diff --git a/src/runtime/string_test.go b/src/runtime/string_test.go new file mode 100644 index 000000000..1551ecc82 --- /dev/null +++ b/src/runtime/string_test.go @@ -0,0 +1,160 @@ +// Copyright 2012 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package runtime_test + +import ( + "runtime" + "strings" + "testing" +) + +func BenchmarkCompareStringEqual(b *testing.B) { + bytes := []byte("Hello Gophers!") + s1, s2 := string(bytes), string(bytes) + for i := 0; i < b.N; i++ { + if s1 != s2 { + b.Fatal("s1 != s2") + } + } +} + +func BenchmarkCompareStringIdentical(b *testing.B) { + s1 := "Hello Gophers!" + s2 := s1 + for i := 0; i < b.N; i++ { + if s1 != s2 { + b.Fatal("s1 != s2") + } + } +} + +func BenchmarkCompareStringSameLength(b *testing.B) { + s1 := "Hello Gophers!" + s2 := "Hello, Gophers" + for i := 0; i < b.N; i++ { + if s1 == s2 { + b.Fatal("s1 == s2") + } + } +} + +func BenchmarkCompareStringDifferentLength(b *testing.B) { + s1 := "Hello Gophers!" + s2 := "Hello, Gophers!" + for i := 0; i < b.N; i++ { + if s1 == s2 { + b.Fatal("s1 == s2") + } + } +} + +func BenchmarkCompareStringBigUnaligned(b *testing.B) { + bytes := make([]byte, 0, 1<<20) + for len(bytes) < 1<<20 { + bytes = append(bytes, "Hello Gophers!"...) + } + s1, s2 := string(bytes), "hello"+string(bytes) + for i := 0; i < b.N; i++ { + if s1 != s2[len("hello"):] { + b.Fatal("s1 != s2") + } + } + b.SetBytes(int64(len(s1))) +} + +func BenchmarkCompareStringBig(b *testing.B) { + bytes := make([]byte, 0, 1<<20) + for len(bytes) < 1<<20 { + bytes = append(bytes, "Hello Gophers!"...) 
+ } + s1, s2 := string(bytes), string(bytes) + for i := 0; i < b.N; i++ { + if s1 != s2 { + b.Fatal("s1 != s2") + } + } + b.SetBytes(int64(len(s1))) +} + +func BenchmarkRuneIterate(b *testing.B) { + bytes := make([]byte, 100) + for i := range bytes { + bytes[i] = byte('A') + } + s := string(bytes) + for i := 0; i < b.N; i++ { + for range s { + } + } +} + +func BenchmarkRuneIterate2(b *testing.B) { + bytes := make([]byte, 100) + for i := range bytes { + bytes[i] = byte('A') + } + s := string(bytes) + for i := 0; i < b.N; i++ { + for range s { + } + } +} + +func TestStringW(t *testing.T) { + strings := []string{ + "hello", + "a\u5566\u7788b", + } + + for _, s := range strings { + var b []uint16 + for _, c := range s { + b = append(b, uint16(c)) + if c != rune(uint16(c)) { + t.Errorf("bad test: stringW can't handle >16 bit runes") + } + } + b = append(b, 0) + r := runtime.GostringW(b) + if r != s { + t.Errorf("gostringW(%v) = %s, want %s", b, r, s) + } + } +} + +func TestLargeStringConcat(t *testing.T) { + output := executeTest(t, largeStringConcatSource, nil) + want := "panic: " + strings.Repeat("0", 1<<10) + strings.Repeat("1", 1<<10) + + strings.Repeat("2", 1<<10) + strings.Repeat("3", 1<<10) + if !strings.HasPrefix(output, want) { + t.Fatalf("output does not start with %q:\n%s", want, output) + } +} + +var largeStringConcatSource = ` +package main +import "strings" +func main() { + s0 := strings.Repeat("0", 1<<10) + s1 := strings.Repeat("1", 1<<10) + s2 := strings.Repeat("2", 1<<10) + s3 := strings.Repeat("3", 1<<10) + s := s0 + s1 + s2 + s3 + panic(s) +} +` + +func TestGostringnocopy(t *testing.T) { + max := *runtime.Maxstring + b := make([]byte, max+10) + for i := uintptr(0); i < max+9; i++ { + b[i] = 'a' + } + _ = runtime.Gostringnocopy(&b[0]) + newmax := *runtime.Maxstring + if newmax != max+9 { + t.Errorf("want %d, got %d", max+9, newmax) + } +} diff --git a/src/runtime/stubs.go b/src/runtime/stubs.go new file mode 100644 index 000000000..fe8f9c922 --- /dev/null +++ b/src/runtime/stubs.go @@ -0,0 +1,316 @@ +// Copyright 2014 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package runtime + +import "unsafe" + +// Declarations for runtime services implemented in C or assembly. + +const ptrSize = 4 << (^uintptr(0) >> 63) // unsafe.Sizeof(uintptr(0)) but an ideal const +const regSize = 4 << (^uintreg(0) >> 63) // unsafe.Sizeof(uintreg(0)) but an ideal const + +// Should be a built-in for unsafe.Pointer? +//go:nosplit +func add(p unsafe.Pointer, x uintptr) unsafe.Pointer { + return unsafe.Pointer(uintptr(p) + x) +} + +// n must be a power of 2 +func roundup(p unsafe.Pointer, n uintptr) unsafe.Pointer { + delta := -uintptr(p) & (n - 1) + return unsafe.Pointer(uintptr(p) + delta) +} + +// in runtime.c +func getg() *g +func acquirem() *m +func releasem(mp *m) +func gomcache() *mcache +func readgstatus(*g) uint32 // proc.c + +// mcall switches from the g to the g0 stack and invokes fn(g), +// where g is the goroutine that made the call. +// mcall saves g's current PC/SP in g->sched so that it can be restored later. +// It is up to fn to arrange for that later execution, typically by recording +// g in a data structure, causing something to call ready(g) later. +// mcall returns to the original goroutine g later, when g has been rescheduled. +// fn must not return at all; typically it ends by calling schedule, to let the m +// run other goroutines. 
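+// For example, gopark suspends the current goroutine by calling mcall(park_m);
+// park_m marks the g as waiting and then calls schedule.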
+// +// mcall can only be called from g stacks (not g0, not gsignal). +//go:noescape +func mcall(fn func(*g)) + +// onM switches from the g to the g0 stack and invokes fn(). +// When fn returns, onM switches back to the g and returns, +// continuing execution on the g stack. +// If arguments must be passed to fn, they can be written to +// g->m->ptrarg (pointers) and g->m->scalararg (non-pointers) +// before the call and then consulted during fn. +// Similarly, fn can pass return values back in those locations. +// If fn is written in Go, it can be a closure, which avoids the need for +// ptrarg and scalararg entirely. +// After reading values out of ptrarg and scalararg it is conventional +// to zero them to avoid (memory or information) leaks. +// +// If onM is called from a g0 stack, it invokes fn and returns, +// without any stack switches. +// +// If onM is called from a gsignal stack, it crashes the program. +// The implication is that functions used in signal handlers must +// not use onM. +// +// NOTE(rsc): We could introduce a separate onMsignal that is +// like onM but if called from a gsignal stack would just run fn on +// that stack. The caller of onMsignal would be required to save the +// old values of ptrarg/scalararg and restore them when the call +// was finished, in case the signal interrupted an onM sequence +// in progress on the g or g0 stacks. Until there is a clear need for this, +// we just reject onM in signal handling contexts entirely. +// +//go:noescape +func onM(fn func()) + +// onMsignal is like onM but is allowed to be used in code that +// might run on the gsignal stack. Code running on a signal stack +// may be interrupting an onM sequence on the main stack, so +// if the onMsignal calling sequence writes to ptrarg/scalararg, +// it must first save the old values and then restore them when +// finished. As an exception to the rule, it is fine not to save and +// restore the values if the program is trying to crash rather than +// return from the signal handler. +// Once all the runtime is written in Go, there will be no ptrarg/scalararg +// and the distinction between onM and onMsignal (and perhaps mcall) +// can go away. +// +// If onMsignal is called from a gsignal stack, it invokes fn directly, +// without a stack switch. Otherwise onMsignal behaves like onM. +// +//go:noescape +func onM_signalok(fn func()) + +func badonm() { + gothrow("onM called from signal goroutine") +} + +// C functions that run on the M stack. +// Call using mcall. +func gosched_m(*g) +func park_m(*g) +func recovery_m(*g) + +// More C functions that run on the M stack. +// Call using onM. +func mcacheRefill_m() +func largeAlloc_m() +func gc_m() +func scavenge_m() +func setFinalizer_m() +func removeFinalizer_m() +func markallocated_m() +func unrollgcprog_m() +func unrollgcproginplace_m() +func setgcpercent_m() +func setmaxthreads_m() +func ready_m() +func deferproc_m() +func goexit_m() +func startpanic_m() +func dopanic_m() +func readmemstats_m() +func writeheapdump_m() + +// memclr clears n bytes starting at ptr. +// in memclr_*.s +//go:noescape +func memclr(ptr unsafe.Pointer, n uintptr) + +// memmove copies n bytes from "from" to "to". 
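+// The regions may overlap, as with C's memmove; the compiler's copy
+// builtin is implemented in terms of memmove.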
+// in memmove_*.s +//go:noescape +func memmove(to unsafe.Pointer, from unsafe.Pointer, n uintptr) + +func starttheworld() +func stoptheworld() +func newextram() +func lockOSThread() +func unlockOSThread() + +// exported value for testing +var hashLoad = loadFactor + +// in asm_*.s +func fastrand1() uint32 + +// in asm_*.s +//go:noescape +func memeq(a, b unsafe.Pointer, size uintptr) bool + +// noescape hides a pointer from escape analysis. noescape is +// the identity function but escape analysis doesn't think the +// output depends on the input. noescape is inlined and currently +// compiles down to a single xor instruction. +// USE CAREFULLY! +//go:nosplit +func noescape(p unsafe.Pointer) unsafe.Pointer { + x := uintptr(p) + return unsafe.Pointer(x ^ 0) +} + +func entersyscall() +func reentersyscall(pc uintptr, sp unsafe.Pointer) +func entersyscallblock() +func exitsyscall() + +func cgocallback(fn, frame unsafe.Pointer, framesize uintptr) +func gogo(buf *gobuf) +func gosave(buf *gobuf) +func read(fd int32, p unsafe.Pointer, n int32) int32 +func close(fd int32) int32 +func mincore(addr unsafe.Pointer, n uintptr, dst *byte) int32 + +//go:noescape +func jmpdefer(fv *funcval, argp uintptr) +func exit1(code int32) +func asminit() +func setg(gg *g) +func exit(code int32) +func breakpoint() +func nanotime() int64 +func usleep(usec uint32) + +// careful: cputicks is not guaranteed to be monotonic! In particular, we have +// noticed drift between cpus on certain os/arch combinations. See issue 8976. +func cputicks() int64 + +func mmap(addr unsafe.Pointer, n uintptr, prot, flags, fd int32, off uint32) unsafe.Pointer +func munmap(addr unsafe.Pointer, n uintptr) +func madvise(addr unsafe.Pointer, n uintptr, flags int32) +func reflectcall(fn, arg unsafe.Pointer, n uint32, retoffset uint32) +func osyield() +func procyield(cycles uint32) +func cgocallback_gofunc(fv *funcval, frame unsafe.Pointer, framesize uintptr) +func readgogc() int32 +func purgecachedstats(c *mcache) +func gostringnocopy(b *byte) string +func goexit() + +//go:noescape +func write(fd uintptr, p unsafe.Pointer, n int32) int32 + +//go:noescape +func cas(ptr *uint32, old, new uint32) bool + +//go:noescape +func casp(ptr *unsafe.Pointer, old, new unsafe.Pointer) bool + +//go:noescape +func casuintptr(ptr *uintptr, old, new uintptr) bool + +//go:noescape +func atomicstoreuintptr(ptr *uintptr, new uintptr) + +//go:noescape +func atomicloaduintptr(ptr *uintptr) uintptr + +//go:noescape +func atomicloaduint(ptr *uint) uint + +//go:noescape +func setcallerpc(argp unsafe.Pointer, pc uintptr) + +// getcallerpc returns the program counter (PC) of its caller's caller. +// getcallersp returns the stack pointer (SP) of its caller's caller. +// For both, the argp must be a pointer to the caller's first function argument. +// The implementation may or may not use argp, depending on +// the architecture. +// +// For example: +// +// func f(arg1, arg2, arg3 int) { +// pc := getcallerpc(unsafe.Pointer(&arg1)) +// sp := getcallerpc(unsafe.Pointer(&arg2)) +// } +// +// These two lines find the PC and SP immediately following +// the call to f (where f will return). +// +// The call to getcallerpc and getcallersp must be done in the +// frame being asked about. It would not be correct for f to pass &arg1 +// to another function g and let g call getcallerpc/getcallersp. +// The call inside g might return information about g's caller or +// information about f's caller or complete garbage. 
+// +// The result of getcallersp is correct at the time of the return, +// but it may be invalidated by any subsequent call to a function +// that might relocate the stack in order to grow or shrink it. +// A general rule is that the result of getcallersp should be used +// immediately and can only be passed to nosplit functions. + +//go:noescape +func getcallerpc(argp unsafe.Pointer) uintptr + +//go:noescape +func getcallersp(argp unsafe.Pointer) uintptr + +//go:noescape +func asmcgocall(fn, arg unsafe.Pointer) + +//go:noescape +func asmcgocall_errno(fn, arg unsafe.Pointer) int32 + +//go:noescape +func open(name *byte, mode, perm int32) int32 + +//go:noescape +func gotraceback(*bool) int32 + +const _NoArgs = ^uintptr(0) + +func newstack() +func newproc() +func morestack() +func mstart() +func rt0_go() + +// return0 is a stub used to return 0 from deferproc. +// It is called at the very end of deferproc to signal +// the calling Go function that it should not jump +// to deferreturn. +// in asm_*.s +func return0() + +// thunk to call time.now. +func timenow() (sec int64, nsec int32) + +// in asm_*.s +// not called directly; definitions here supply type information for traceback. +func call16(fn, arg unsafe.Pointer, n, retoffset uint32) +func call32(fn, arg unsafe.Pointer, n, retoffset uint32) +func call64(fn, arg unsafe.Pointer, n, retoffset uint32) +func call128(fn, arg unsafe.Pointer, n, retoffset uint32) +func call256(fn, arg unsafe.Pointer, n, retoffset uint32) +func call512(fn, arg unsafe.Pointer, n, retoffset uint32) +func call1024(fn, arg unsafe.Pointer, n, retoffset uint32) +func call2048(fn, arg unsafe.Pointer, n, retoffset uint32) +func call4096(fn, arg unsafe.Pointer, n, retoffset uint32) +func call8192(fn, arg unsafe.Pointer, n, retoffset uint32) +func call16384(fn, arg unsafe.Pointer, n, retoffset uint32) +func call32768(fn, arg unsafe.Pointer, n, retoffset uint32) +func call65536(fn, arg unsafe.Pointer, n, retoffset uint32) +func call131072(fn, arg unsafe.Pointer, n, retoffset uint32) +func call262144(fn, arg unsafe.Pointer, n, retoffset uint32) +func call524288(fn, arg unsafe.Pointer, n, retoffset uint32) +func call1048576(fn, arg unsafe.Pointer, n, retoffset uint32) +func call2097152(fn, arg unsafe.Pointer, n, retoffset uint32) +func call4194304(fn, arg unsafe.Pointer, n, retoffset uint32) +func call8388608(fn, arg unsafe.Pointer, n, retoffset uint32) +func call16777216(fn, arg unsafe.Pointer, n, retoffset uint32) +func call33554432(fn, arg unsafe.Pointer, n, retoffset uint32) +func call67108864(fn, arg unsafe.Pointer, n, retoffset uint32) +func call134217728(fn, arg unsafe.Pointer, n, retoffset uint32) +func call268435456(fn, arg unsafe.Pointer, n, retoffset uint32) +func call536870912(fn, arg unsafe.Pointer, n, retoffset uint32) +func call1073741824(fn, arg unsafe.Pointer, n, retoffset uint32) diff --git a/src/runtime/symtab.go b/src/runtime/symtab.go new file mode 100644 index 000000000..45d107b77 --- /dev/null +++ b/src/runtime/symtab.go @@ -0,0 +1,288 @@ +// Copyright 2014 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package runtime + +import "unsafe" + +// NOTE: Func does not expose the actual unexported fields, because we return *Func +// values to users, and we want to keep them from being able to overwrite the data +// with (say) *f = Func{}. +// All code operating on a *Func must call raw to get the *_func instead. 
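+//
+// User code sees only the exported methods; a typical lookup is:
+//
+//	pc, _, _, ok := runtime.Caller(0)
+//	if ok {
+//		f := runtime.FuncForPC(pc)
+//		file, line := f.FileLine(pc)
+//		println(f.Name(), file, line)
+//	}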
+ +// A Func represents a Go function in the running binary. +type Func struct { + opaque struct{} // unexported field to disallow conversions +} + +func (f *Func) raw() *_func { + return (*_func)(unsafe.Pointer(f)) +} + +// funcdata.h +const ( + _PCDATA_ArgSize = 0 + _PCDATA_StackMapIndex = 1 + _FUNCDATA_ArgsPointerMaps = 0 + _FUNCDATA_LocalsPointerMaps = 1 + _FUNCDATA_DeadValueMaps = 2 + _ArgsSizeUnknown = -0x80000000 +) + +var ( + pclntable []byte + ftab []functab + filetab []uint32 + + pclntab, epclntab struct{} // linker symbols +) + +type functab struct { + entry uintptr + funcoff uintptr +} + +func symtabinit() { + // See golang.org/s/go12symtab for header: 0xfffffffb, + // two zero bytes, a byte giving the PC quantum, + // and a byte giving the pointer width in bytes. + pcln := (*[8]byte)(unsafe.Pointer(&pclntab)) + pcln32 := (*[2]uint32)(unsafe.Pointer(&pclntab)) + if pcln32[0] != 0xfffffffb || pcln[4] != 0 || pcln[5] != 0 || pcln[6] != _PCQuantum || pcln[7] != ptrSize { + println("runtime: function symbol table header:", hex(pcln32[0]), hex(pcln[4]), hex(pcln[5]), hex(pcln[6]), hex(pcln[7])) + gothrow("invalid function symbol table\n") + } + + // pclntable is all bytes of pclntab symbol. + sp := (*sliceStruct)(unsafe.Pointer(&pclntable)) + sp.array = unsafe.Pointer(&pclntab) + sp.len = int(uintptr(unsafe.Pointer(&epclntab)) - uintptr(unsafe.Pointer(&pclntab))) + sp.cap = sp.len + + // ftab is lookup table for function by program counter. + nftab := int(*(*uintptr)(add(unsafe.Pointer(pcln), 8))) + p := add(unsafe.Pointer(pcln), 8+ptrSize) + sp = (*sliceStruct)(unsafe.Pointer(&ftab)) + sp.array = p + sp.len = nftab + 1 + sp.cap = sp.len + for i := 0; i < nftab; i++ { + // NOTE: ftab[nftab].entry is legal; it is the address beyond the final function. + if ftab[i].entry > ftab[i+1].entry { + f1 := (*_func)(unsafe.Pointer(&pclntable[ftab[i].funcoff])) + f2 := (*_func)(unsafe.Pointer(&pclntable[ftab[i+1].funcoff])) + f2name := "end" + if i+1 < nftab { + f2name = gofuncname(f2) + } + println("function symbol table not sorted by program counter:", hex(ftab[i].entry), gofuncname(f1), ">", hex(ftab[i+1].entry), f2name) + for j := 0; j <= i; j++ { + print("\t", hex(ftab[j].entry), " ", gofuncname((*_func)(unsafe.Pointer(&pclntable[ftab[j].funcoff])))) + } + gothrow("invalid runtime symbol table") + } + } + + // The ftab ends with a half functab consisting only of + // 'entry', followed by a uint32 giving the pcln-relative + // offset of the file table. + sp = (*sliceStruct)(unsafe.Pointer(&filetab)) + end := unsafe.Pointer(&ftab[nftab].funcoff) // just beyond ftab + fileoffset := *(*uint32)(end) + sp.array = unsafe.Pointer(&pclntable[fileoffset]) + // length is in first element of array. + // set len to 1 so we can get first element. + sp.len = 1 + sp.cap = 1 + sp.len = int(filetab[0]) + sp.cap = sp.len +} + +// FuncForPC returns a *Func describing the function that contains the +// given program counter address, or else nil. +func FuncForPC(pc uintptr) *Func { + return (*Func)(unsafe.Pointer(findfunc(pc))) +} + +// Name returns the name of the function. +func (f *Func) Name() string { + return gofuncname(f.raw()) +} + +// Entry returns the entry address of the function. +func (f *Func) Entry() uintptr { + return f.raw().entry +} + +// FileLine returns the file name and line number of the +// source code corresponding to the program counter pc. +// The result will not be accurate if pc is not a program +// counter within f. 
+func (f *Func) FileLine(pc uintptr) (file string, line int) { + // Pass strict=false here, because anyone can call this function, + // and they might just be wrong about targetpc belonging to f. + line = int(funcline1(f.raw(), pc, &file, false)) + return file, line +} + +func findfunc(pc uintptr) *_func { + if len(ftab) == 0 { + return nil + } + + if pc < ftab[0].entry || pc >= ftab[len(ftab)-1].entry { + return nil + } + + // binary search to find func with entry <= pc. + lo := 0 + nf := len(ftab) - 1 // last entry is sentinel + for nf > 0 { + n := nf / 2 + f := &ftab[lo+n] + if f.entry <= pc && pc < ftab[lo+n+1].entry { + return (*_func)(unsafe.Pointer(&pclntable[f.funcoff])) + } else if pc < f.entry { + nf = n + } else { + lo += n + 1 + nf -= n + 1 + } + } + + gothrow("findfunc: binary search failed") + return nil +} + +func pcvalue(f *_func, off int32, targetpc uintptr, strict bool) int32 { + if off == 0 { + return -1 + } + p := pclntable[off:] + pc := f.entry + val := int32(-1) + for { + var ok bool + p, ok = step(p, &pc, &val, pc == f.entry) + if !ok { + break + } + if targetpc < pc { + return val + } + } + + // If there was a table, it should have covered all program counters. + // If not, something is wrong. + if panicking != 0 || !strict { + return -1 + } + + print("runtime: invalid pc-encoded table f=", gofuncname(f), " pc=", hex(pc), " targetpc=", hex(targetpc), " tab=", p, "\n") + + p = pclntable[off:] + pc = f.entry + val = -1 + for { + var ok bool + p, ok = step(p, &pc, &val, pc == f.entry) + if !ok { + break + } + print("\tvalue=", val, " until pc=", hex(pc), "\n") + } + + gothrow("invalid runtime symbol table") + return -1 +} + +func funcname(f *_func) *byte { + if f == nil || f.nameoff == 0 { + return nil + } + return (*byte)(unsafe.Pointer(&pclntable[f.nameoff])) +} + +func gofuncname(f *_func) string { + return gostringnocopy(funcname(f)) +} + +func funcline1(f *_func, targetpc uintptr, file *string, strict bool) int32 { + *file = "?" + fileno := int(pcvalue(f, f.pcfile, targetpc, strict)) + line := pcvalue(f, f.pcln, targetpc, strict) + if fileno == -1 || line == -1 || fileno >= len(filetab) { + // print("looking for ", hex(targetpc), " in ", gofuncname(f), " got file=", fileno, " line=", lineno, "\n") + return 0 + } + *file = gostringnocopy(&pclntable[filetab[fileno]]) + return line +} + +func funcline(f *_func, targetpc uintptr, file *string) int32 { + return funcline1(f, targetpc, file, true) +} + +func funcspdelta(f *_func, targetpc uintptr) int32 { + x := pcvalue(f, f.pcsp, targetpc, true) + if x&(ptrSize-1) != 0 { + print("invalid spdelta ", hex(f.entry), " ", hex(targetpc), " ", hex(f.pcsp), " ", x, "\n") + } + return x +} + +func pcdatavalue(f *_func, table int32, targetpc uintptr) int32 { + if table < 0 || table >= f.npcdata { + return -1 + } + off := *(*int32)(add(unsafe.Pointer(&f.nfuncdata), unsafe.Sizeof(f.nfuncdata)+uintptr(table)*4)) + return pcvalue(f, off, targetpc, true) +} + +func funcdata(f *_func, i int32) unsafe.Pointer { + if i < 0 || i >= f.nfuncdata { + return nil + } + p := add(unsafe.Pointer(&f.nfuncdata), unsafe.Sizeof(f.nfuncdata)+uintptr(f.npcdata)*4) + if ptrSize == 8 && uintptr(p)&4 != 0 { + if uintptr(unsafe.Pointer(f))&4 != 0 { + println("runtime: misaligned func", f) + } + p = add(p, 4) + } + return *(*unsafe.Pointer)(add(p, uintptr(i)*ptrSize)) +} + +// step advances to the next pc, value pair in the encoded table. 
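+// Each pair is encoded as a zig-zag varint value delta (bit 0 holds
+// the sign) followed by an unsigned varint pc delta, which is scaled
+// by _PCQuantum. A zero value delta that is not the first pair marks
+// the end of the table.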
+func step(p []byte, pc *uintptr, val *int32, first bool) (newp []byte, ok bool) { + p, uvdelta := readvarint(p) + if uvdelta == 0 && !first { + return nil, false + } + if uvdelta&1 != 0 { + uvdelta = ^(uvdelta >> 1) + } else { + uvdelta >>= 1 + } + vdelta := int32(uvdelta) + p, pcdelta := readvarint(p) + *pc += uintptr(pcdelta * _PCQuantum) + *val += vdelta + return p, true +} + +// readvarint reads a varint from p. +func readvarint(p []byte) (newp []byte, val uint32) { + var v, shift uint32 + for { + b := p[0] + p = p[1:] + v |= (uint32(b) & 0x7F) << shift + if b&0x80 == 0 { + break + } + shift += 7 + } + return p, v +} diff --git a/src/runtime/symtab_test.go b/src/runtime/symtab_test.go new file mode 100644 index 000000000..bd9fe18c4 --- /dev/null +++ b/src/runtime/symtab_test.go @@ -0,0 +1,47 @@ +// Copyright 2009 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package runtime_test + +import ( + "runtime" + "strings" + "testing" +) + +func TestCaller(t *testing.T) { + procs := runtime.GOMAXPROCS(-1) + c := make(chan bool, procs) + for p := 0; p < procs; p++ { + go func() { + for i := 0; i < 1000; i++ { + testCallerFoo(t) + } + c <- true + }() + defer func() { + <-c + }() + } +} + +func testCallerFoo(t *testing.T) { + testCallerBar(t) +} + +func testCallerBar(t *testing.T) { + for i := 0; i < 2; i++ { + pc, file, line, ok := runtime.Caller(i) + f := runtime.FuncForPC(pc) + if !ok || + !strings.HasSuffix(file, "symtab_test.go") || + (i == 0 && !strings.HasSuffix(f.Name(), "testCallerBar")) || + (i == 1 && !strings.HasSuffix(f.Name(), "testCallerFoo")) || + line < 5 || line > 1000 || + f.Entry() >= pc { + t.Errorf("incorrect symbol info %d: %t %d %d %s %s %d", + i, ok, f.Entry(), pc, f.Name(), file, line) + } + } +} diff --git a/src/runtime/sys_arm.c b/src/runtime/sys_arm.c new file mode 100644 index 000000000..a65560e5b --- /dev/null +++ b/src/runtime/sys_arm.c @@ -0,0 +1,35 @@ +// Copyright 2013 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#include "runtime.h" + +// adjust Gobuf as if it executed a call to fn with context ctxt +// and then did an immediate Gosave. +void +runtime·gostartcall(Gobuf *gobuf, void (*fn)(void), void *ctxt) +{ + if(gobuf->lr != 0) + runtime·throw("invalid use of gostartcall"); + gobuf->lr = gobuf->pc; + gobuf->pc = (uintptr)fn; + gobuf->ctxt = ctxt; +} + +// Called to rewind context saved during morestack back to beginning of function. +// To help us, the linker emits a jmp back to the beginning right after the +// call to morestack. We just have to decode and apply that jump. +void +runtime·rewindmorestack(Gobuf *gobuf) +{ + uint32 inst; + + inst = *(uint32*)gobuf->pc; + if((gobuf->pc&3) == 0 && (inst>>24) == 0x9a) { + //runtime·printf("runtime: rewind pc=%p to pc=%p\n", gobuf->pc, gobuf->pc + ((int32)(inst<<8)>>6) + 8); + gobuf->pc += ((int32)(inst<<8)>>6) + 8; + return; + } + runtime·printf("runtime: pc=%p %x\n", gobuf->pc, inst); + runtime·throw("runtime: misuse of rewindmorestack"); +} diff --git a/src/runtime/sys_darwin_386.s b/src/runtime/sys_darwin_386.s new file mode 100644 index 000000000..a961c71a8 --- /dev/null +++ b/src/runtime/sys_darwin_386.s @@ -0,0 +1,541 @@ +// Copyright 2009 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. 
+ +// System calls and other sys.stuff for 386, Darwin +// See http://fxr.watson.org/fxr/source/bsd/kern/syscalls.c?v=xnu-1228 +// or /usr/include/sys/syscall.h (on a Mac) for system call numbers. + +#include "zasm_GOOS_GOARCH.h" +#include "textflag.h" + +// Exit the entire program (like C exit) +TEXT runtime·exit(SB),NOSPLIT,$0 + MOVL $1, AX + INT $0x80 + MOVL $0xf1, 0xf1 // crash + RET + +// Exit this OS thread (like pthread_exit, which eventually +// calls __bsdthread_terminate). +TEXT runtime·exit1(SB),NOSPLIT,$0 + MOVL $361, AX + INT $0x80 + JAE 2(PC) + MOVL $0xf1, 0xf1 // crash + RET + +TEXT runtime·open(SB),NOSPLIT,$0 + MOVL $5, AX + INT $0x80 + MOVL AX, ret+12(FP) + RET + +TEXT runtime·close(SB),NOSPLIT,$0 + MOVL $6, AX + INT $0x80 + MOVL AX, ret+4(FP) + RET + +TEXT runtime·read(SB),NOSPLIT,$0 + MOVL $3, AX + INT $0x80 + MOVL AX, ret+12(FP) + RET + +TEXT runtime·write(SB),NOSPLIT,$0 + MOVL $4, AX + INT $0x80 + MOVL AX, ret+12(FP) + RET + +TEXT runtime·raise(SB),NOSPLIT,$16 + MOVL $20, AX // getpid + INT $0x80 + MOVL AX, 4(SP) // pid + MOVL sig+0(FP), AX + MOVL AX, 8(SP) // signal + MOVL $1, 12(SP) // posix + MOVL $37, AX // kill + INT $0x80 + RET + +TEXT runtime·mmap(SB),NOSPLIT,$0 + MOVL $197, AX + INT $0x80 + MOVL AX, ret+24(FP) + RET + +TEXT runtime·madvise(SB),NOSPLIT,$0 + MOVL $75, AX + INT $0x80 + // ignore failure - maybe pages are locked + RET + +TEXT runtime·munmap(SB),NOSPLIT,$0 + MOVL $73, AX + INT $0x80 + JAE 2(PC) + MOVL $0xf1, 0xf1 // crash + RET + +TEXT runtime·setitimer(SB),NOSPLIT,$0 + MOVL $83, AX + INT $0x80 + RET + +// OS X comm page time offsets +// http://www.opensource.apple.com/source/xnu/xnu-1699.26.8/osfmk/i386/cpu_capabilities.h +#define cpu_capabilities 0x20 +#define nt_tsc_base 0x50 +#define nt_scale 0x58 +#define nt_shift 0x5c +#define nt_ns_base 0x60 +#define nt_generation 0x68 +#define gtod_generation 0x6c +#define gtod_ns_base 0x70 +#define gtod_sec_base 0x78 + +// called from assembly +// 64-bit unix nanoseconds returned in DX:AX. +// I'd much rather write this in C but we need +// assembly for the 96-bit multiply and RDTSC. +TEXT runtime·now(SB),NOSPLIT,$40 + MOVL $0xffff0000, BP /* comm page base */ + + // Test for slow CPU. If so, the math is completely + // different, and unimplemented here, so use the + // system call. + MOVL cpu_capabilities(BP), AX + TESTL $0x4000, AX + JNZ systime + + // Loop trying to take a consistent snapshot + // of the time parameters. +timeloop: + MOVL gtod_generation(BP), BX + TESTL BX, BX + JZ systime + MOVL nt_generation(BP), CX + TESTL CX, CX + JZ timeloop + RDTSC + MOVL nt_tsc_base(BP), SI + MOVL (nt_tsc_base+4)(BP), DI + MOVL SI, 0(SP) + MOVL DI, 4(SP) + MOVL nt_scale(BP), SI + MOVL SI, 8(SP) + MOVL nt_ns_base(BP), SI + MOVL (nt_ns_base+4)(BP), DI + MOVL SI, 12(SP) + MOVL DI, 16(SP) + CMPL nt_generation(BP), CX + JNE timeloop + MOVL gtod_ns_base(BP), SI + MOVL (gtod_ns_base+4)(BP), DI + MOVL SI, 20(SP) + MOVL DI, 24(SP) + MOVL gtod_sec_base(BP), SI + MOVL (gtod_sec_base+4)(BP), DI + MOVL SI, 28(SP) + MOVL DI, 32(SP) + CMPL gtod_generation(BP), BX + JNE timeloop + + // Gathered all the data we need. Compute time. + // ((tsc - nt_tsc_base) * nt_scale) >> 32 + nt_ns_base - gtod_ns_base + gtod_sec_base*1e9 + // The multiply and shift extracts the top 64 bits of the 96-bit product. + SUBL 0(SP), AX // DX:AX = (tsc - nt_tsc_base) + SBBL 4(SP), DX + + // We have x = tsc - nt_tsc_base - DX:AX to be + // multiplied by y = nt_scale = 8(SP), keeping the top 64 bits of the 96-bit product. 
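+	// Split x into its 32-bit halves and combine the partial products: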
+ // x*y = (x&0xffffffff)*y + (x&0xffffffff00000000)*y + // (x*y)>>32 = ((x&0xffffffff)*y)>>32 + (x>>32)*y + MOVL DX, CX // SI = (x&0xffffffff)*y >> 32 + MOVL $0, DX + MULL 8(SP) + MOVL DX, SI + + MOVL CX, AX // DX:AX = (x>>32)*y + MOVL $0, DX + MULL 8(SP) + + ADDL SI, AX // DX:AX += (x&0xffffffff)*y >> 32 + ADCL $0, DX + + // DX:AX is now ((tsc - nt_tsc_base) * nt_scale) >> 32. + ADDL 12(SP), AX // DX:AX += nt_ns_base + ADCL 16(SP), DX + SUBL 20(SP), AX // DX:AX -= gtod_ns_base + SBBL 24(SP), DX + MOVL AX, SI // DI:SI = DX:AX + MOVL DX, DI + MOVL 28(SP), AX // DX:AX = gtod_sec_base*1e9 + MOVL 32(SP), DX + MOVL $1000000000, CX + MULL CX + ADDL SI, AX // DX:AX += DI:SI + ADCL DI, DX + RET + +systime: + // Fall back to system call (usually first call in this thread) + LEAL 12(SP), AX // must be non-nil, unused + MOVL AX, 4(SP) + MOVL $0, 8(SP) // time zone pointer + MOVL $116, AX + INT $0x80 + // sec is in AX, usec in DX + // convert to DX:AX nsec + MOVL DX, BX + MOVL $1000000000, CX + MULL CX + IMULL $1000, BX + ADDL BX, AX + ADCL $0, DX + RET + +// func now() (sec int64, nsec int32) +TEXT time·now(SB),NOSPLIT,$0 + CALL runtime·now(SB) + MOVL $1000000000, CX + DIVL CX + MOVL AX, sec+0(FP) + MOVL $0, sec+4(FP) + MOVL DX, nsec+8(FP) + RET + +// int64 nanotime(void) so really +// void nanotime(int64 *nsec) +TEXT runtime·nanotime(SB),NOSPLIT,$0 + CALL runtime·now(SB) + MOVL AX, ret_lo+0(FP) + MOVL DX, ret_hi+4(FP) + RET + +TEXT runtime·sigprocmask(SB),NOSPLIT,$0 + MOVL $329, AX // pthread_sigmask (on OS X, sigprocmask==entire process) + INT $0x80 + JAE 2(PC) + MOVL $0xf1, 0xf1 // crash + RET + +TEXT runtime·sigaction(SB),NOSPLIT,$0 + MOVL $46, AX + INT $0x80 + JAE 2(PC) + MOVL $0xf1, 0xf1 // crash + RET + +// Sigtramp's job is to call the actual signal handler. +// It is called with the following arguments on the stack: +// 0(FP) "return address" - ignored +// 4(FP) actual handler +// 8(FP) signal number +// 12(FP) siginfo style +// 16(FP) siginfo +// 20(FP) context +TEXT runtime·sigtramp(SB),NOSPLIT,$40 + get_tls(CX) + + // check that g exists + MOVL g(CX), DI + CMPL DI, $0 + JNE 6(PC) + MOVL sig+8(FP), BX + MOVL BX, 0(SP) + MOVL $runtime·badsignal(SB), AX + CALL AX + JMP sigtramp_ret + + // save g + MOVL DI, 20(SP) + + // g = m->gsignal + MOVL g_m(DI), BP + MOVL m_gsignal(BP), BP + MOVL BP, g(CX) + + // copy arguments to sighandler + MOVL sig+8(FP), BX + MOVL BX, 0(SP) + MOVL info+12(FP), BX + MOVL BX, 4(SP) + MOVL context+16(FP), BX + MOVL BX, 8(SP) + MOVL DI, 12(SP) + + MOVL handler+0(FP), BX + CALL BX + + // restore g + get_tls(CX) + MOVL 20(SP), DI + MOVL DI, g(CX) + +sigtramp_ret: + // call sigreturn + MOVL context+16(FP), CX + MOVL style+4(FP), BX + MOVL $0, 0(SP) // "caller PC" - ignored + MOVL CX, 4(SP) + MOVL BX, 8(SP) + MOVL $184, AX // sigreturn(ucontext, infostyle) + INT $0x80 + MOVL $0xf1, 0xf1 // crash + RET + +TEXT runtime·sigaltstack(SB),NOSPLIT,$0 + MOVL $53, AX + INT $0x80 + JAE 2(PC) + MOVL $0xf1, 0xf1 // crash + RET + +TEXT runtime·usleep(SB),NOSPLIT,$32 + MOVL $0, DX + MOVL usec+0(FP), AX + MOVL $1000000, CX + DIVL CX + MOVL AX, 24(SP) // sec + MOVL DX, 28(SP) // usec + + // select(0, 0, 0, 0, &tv) + MOVL $0, 0(SP) // "return PC" - ignored + MOVL $0, 4(SP) + MOVL $0, 8(SP) + MOVL $0, 12(SP) + MOVL $0, 16(SP) + LEAL 24(SP), AX + MOVL AX, 20(SP) + MOVL $93, AX + INT $0x80 + RET + +// void bsdthread_create(void *stk, M *mp, G *gp, void (*fn)(void)) +// System call args are: func arg stack pthread flags. 
+TEXT runtime·bsdthread_create(SB),NOSPLIT,$32 + MOVL $360, AX + // 0(SP) is where the caller PC would be; kernel skips it + MOVL fn+12(FP), BX + MOVL BX, 4(SP) // func + MOVL mm+4(FP), BX + MOVL BX, 8(SP) // arg + MOVL stk+0(FP), BX + MOVL BX, 12(SP) // stack + MOVL gg+8(FP), BX + MOVL BX, 16(SP) // pthread + MOVL $0x1000000, 20(SP) // flags = PTHREAD_START_CUSTOM + INT $0x80 + JAE 4(PC) + NEGL AX + MOVL AX, ret+16(FP) + RET + MOVL $0, AX + MOVL AX, ret+16(FP) + RET + +// The thread that bsdthread_create creates starts executing here, +// because we registered this function using bsdthread_register +// at startup. +// AX = "pthread" (= g) +// BX = mach thread port +// CX = "func" (= fn) +// DX = "arg" (= m) +// DI = stack top +// SI = flags (= 0x1000000) +// SP = stack - C_32_STK_ALIGN +TEXT runtime·bsdthread_start(SB),NOSPLIT,$0 + // set up ldt 7+id to point at m->tls. + // m->tls is at m+40. newosproc left + // the m->id in tls[0]. + LEAL m_tls(DX), BP + MOVL 0(BP), DI + ADDL $7, DI // m0 is LDT#7. count up. + // setldt(tls#, &tls, sizeof tls) + PUSHAL // save registers + PUSHL $32 // sizeof tls + PUSHL BP // &tls + PUSHL DI // tls # + CALL runtime·setldt(SB) + POPL AX + POPL AX + POPL AX + POPAL + + // Now segment is established. Initialize m, g. + get_tls(BP) + MOVL AX, g(BP) + MOVL DX, g_m(AX) + MOVL BX, m_procid(DX) // m->procid = thread port (for debuggers) + CALL runtime·stackcheck(SB) // smashes AX + CALL CX // fn() + CALL runtime·exit1(SB) + RET + +// void bsdthread_register(void) +// registers callbacks for threadstart (see bsdthread_create above +// and wqthread and pthsize (not used). returns 0 on success. +TEXT runtime·bsdthread_register(SB),NOSPLIT,$40 + MOVL $366, AX + // 0(SP) is where kernel expects caller PC; ignored + MOVL $runtime·bsdthread_start(SB), 4(SP) // threadstart + MOVL $0, 8(SP) // wqthread, not used by us + MOVL $0, 12(SP) // pthsize, not used by us + MOVL $0, 16(SP) // dummy_value [sic] + MOVL $0, 20(SP) // targetconc_ptr + MOVL $0, 24(SP) // dispatchqueue_offset + INT $0x80 + JAE 4(PC) + NEGL AX + MOVL AX, ret+0(FP) + RET + MOVL $0, AX + MOVL AX, ret+0(FP) + RET + +// Invoke Mach system call. +// Assumes system call number in AX, +// caller PC on stack, caller's caller PC next, +// and then the system call arguments. +// +// Can be used for BSD too, but we don't, +// because if you use this interface the BSD +// system call numbers need an extra field +// in the high 16 bits that seems to be the +// argument count in bytes but is not always. +// INT $0x80 works fine for those. +TEXT runtime·sysenter(SB),NOSPLIT,$0 + POPL DX + MOVL SP, CX + BYTE $0x0F; BYTE $0x34; // SYSENTER + // returns to DX with SP set to CX + +TEXT runtime·mach_msg_trap(SB),NOSPLIT,$0 + MOVL $-31, AX + CALL runtime·sysenter(SB) + MOVL AX, ret+28(FP) + RET + +TEXT runtime·mach_reply_port(SB),NOSPLIT,$0 + MOVL $-26, AX + CALL runtime·sysenter(SB) + MOVL AX, ret+0(FP) + RET + +TEXT runtime·mach_task_self(SB),NOSPLIT,$0 + MOVL $-28, AX + CALL runtime·sysenter(SB) + MOVL AX, ret+0(FP) + RET + +// Mach provides trap versions of the semaphore ops, +// instead of requiring the use of RPC. 
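+// The negative numbers loaded into AX below select Mach traps,
+// dispatched through runtime·sysenter rather than INT $0x80.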
+ +// uint32 mach_semaphore_wait(uint32) +TEXT runtime·mach_semaphore_wait(SB),NOSPLIT,$0 + MOVL $-36, AX + CALL runtime·sysenter(SB) + MOVL AX, ret+4(FP) + RET + +// uint32 mach_semaphore_timedwait(uint32, uint32, uint32) +TEXT runtime·mach_semaphore_timedwait(SB),NOSPLIT,$0 + MOVL $-38, AX + CALL runtime·sysenter(SB) + MOVL AX, ret+12(FP) + RET + +// uint32 mach_semaphore_signal(uint32) +TEXT runtime·mach_semaphore_signal(SB),NOSPLIT,$0 + MOVL $-33, AX + CALL runtime·sysenter(SB) + MOVL AX, ret+4(FP) + RET + +// uint32 mach_semaphore_signal_all(uint32) +TEXT runtime·mach_semaphore_signal_all(SB),NOSPLIT,$0 + MOVL $-34, AX + CALL runtime·sysenter(SB) + MOVL AX, ret+4(FP) + RET + +// setldt(int entry, int address, int limit) +// entry and limit are ignored. +TEXT runtime·setldt(SB),NOSPLIT,$32 + MOVL address+4(FP), BX // aka base + + /* + * When linking against the system libraries, + * we use its pthread_create and let it set up %gs + * for us. When we do that, the private storage + * we get is not at 0(GS) but at 0x468(GS). + * 8l rewrites 0(TLS) into 0x468(GS) for us. + * To accommodate that rewrite, we translate the + * address and limit here so that 0x468(GS) maps to 0(address). + * + * See cgo/gcc_darwin_386.c:/468 for the derivation + * of the constant. + */ + SUBL $0x468, BX + + /* + * Must set up as USER_CTHREAD segment because + * Darwin forces that value into %gs for signal handlers, + * and if we don't set one up, we'll get a recursive + * fault trying to get into the signal handler. + * Since we have to set one up anyway, it might as + * well be the value we want. So don't bother with + * i386_set_ldt. + */ + MOVL BX, 4(SP) + MOVL $3, AX // thread_fast_set_cthread_self - machdep call #3 + INT $0x82 // sic: 0x82, not 0x80, for machdep call + + XORL AX, AX + MOVW GS, AX + RET + +TEXT runtime·sysctl(SB),NOSPLIT,$0 + MOVL $202, AX + INT $0x80 + JAE 4(PC) + NEGL AX + MOVL AX, ret+24(FP) + RET + MOVL $0, AX + MOVL AX, ret+24(FP) + RET + +// int32 runtime·kqueue(void); +TEXT runtime·kqueue(SB),NOSPLIT,$0 + MOVL $362, AX + INT $0x80 + JAE 2(PC) + NEGL AX + MOVL AX, ret+0(FP) + RET + +// int32 runtime·kevent(int kq, Kevent *changelist, int nchanges, Kevent *eventlist, int nevents, Timespec *timeout); +TEXT runtime·kevent(SB),NOSPLIT,$0 + MOVL $363, AX + INT $0x80 + JAE 2(PC) + NEGL AX + MOVL AX, ret+24(FP) + RET + +// int32 runtime·closeonexec(int32 fd); +TEXT runtime·closeonexec(SB),NOSPLIT,$32 + MOVL $92, AX // fcntl + // 0(SP) is where the caller PC would be; kernel skips it + MOVL fd+0(FP), BX + MOVL BX, 4(SP) // fd + MOVL $2, 8(SP) // F_SETFD + MOVL $1, 12(SP) // FD_CLOEXEC + INT $0x80 + JAE 2(PC) + NEGL AX + RET diff --git a/src/runtime/sys_darwin_amd64.s b/src/runtime/sys_darwin_amd64.s new file mode 100644 index 000000000..bd397d72a --- /dev/null +++ b/src/runtime/sys_darwin_amd64.s @@ -0,0 +1,505 @@ +// Copyright 2009 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// +// System calls and other sys.stuff for AMD64, Darwin +// See http://fxr.watson.org/fxr/source/bsd/kern/syscalls.c?v=xnu-1228 +// or /usr/include/sys/syscall.h (on a Mac) for system call numbers. +// +// The low 24 bits are the system call number. +// The high 8 bits specify the kind of system call: 1=Mach, 2=BSD, 3=Machine-Dependent. 
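+// For example, $(0x2000000+4) below is the BSD write system call and
+// $(0x1000000+31) is the Mach mach_msg_trap.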
+// + +#include "zasm_GOOS_GOARCH.h" +#include "textflag.h" + +// Exit the entire program (like C exit) +TEXT runtime·exit(SB),NOSPLIT,$0 + MOVL code+0(FP), DI // arg 1 exit status + MOVL $(0x2000000+1), AX // syscall entry + SYSCALL + MOVL $0xf1, 0xf1 // crash + RET + +// Exit this OS thread (like pthread_exit, which eventually +// calls __bsdthread_terminate). +TEXT runtime·exit1(SB),NOSPLIT,$0 + MOVL code+0(FP), DI // arg 1 exit status + MOVL $(0x2000000+361), AX // syscall entry + SYSCALL + MOVL $0xf1, 0xf1 // crash + RET + +TEXT runtime·open(SB),NOSPLIT,$0 + MOVQ name+0(FP), DI // arg 1 pathname + MOVL mode+8(FP), SI // arg 2 flags + MOVL perm+12(FP), DX // arg 3 mode + MOVL $(0x2000000+5), AX // syscall entry + SYSCALL + MOVL AX, ret+16(FP) + RET + +TEXT runtime·close(SB),NOSPLIT,$0 + MOVL fd+0(FP), DI // arg 1 fd + MOVL $(0x2000000+6), AX // syscall entry + SYSCALL + MOVL AX, ret+8(FP) + RET + +TEXT runtime·read(SB),NOSPLIT,$0 + MOVL fd+0(FP), DI // arg 1 fd + MOVQ p+8(FP), SI // arg 2 buf + MOVL n+16(FP), DX // arg 3 count + MOVL $(0x2000000+3), AX // syscall entry + SYSCALL + MOVL AX, ret+24(FP) + RET + +TEXT runtime·write(SB),NOSPLIT,$0 + MOVQ fd+0(FP), DI // arg 1 fd + MOVQ p+8(FP), SI // arg 2 buf + MOVL n+16(FP), DX // arg 3 count + MOVL $(0x2000000+4), AX // syscall entry + SYSCALL + MOVL AX, ret+24(FP) + RET + +TEXT runtime·raise(SB),NOSPLIT,$24 + MOVL $(0x2000000+20), AX // getpid + SYSCALL + MOVQ AX, DI // arg 1 - pid + MOVL sig+0(FP), SI // arg 2 - signal + MOVL $1, DX // arg 3 - posix + MOVL $(0x2000000+37), AX // kill + SYSCALL + RET + +TEXT runtime·setitimer(SB), NOSPLIT, $0 + MOVL mode+0(FP), DI + MOVQ new+8(FP), SI + MOVQ old+16(FP), DX + MOVL $(0x2000000+83), AX // syscall entry + SYSCALL + RET + +TEXT runtime·madvise(SB), NOSPLIT, $0 + MOVQ addr+0(FP), DI // arg 1 addr + MOVQ n+8(FP), SI // arg 2 len + MOVL flags+16(FP), DX // arg 3 advice + MOVL $(0x2000000+75), AX // syscall entry madvise + SYSCALL + // ignore failure - maybe pages are locked + RET + +// OS X comm page time offsets +// http://www.opensource.apple.com/source/xnu/xnu-1699.26.8/osfmk/i386/cpu_capabilities.h +#define nt_tsc_base 0x50 +#define nt_scale 0x58 +#define nt_shift 0x5c +#define nt_ns_base 0x60 +#define nt_generation 0x68 +#define gtod_generation 0x6c +#define gtod_ns_base 0x70 +#define gtod_sec_base 0x78 + +TEXT nanotime<>(SB), NOSPLIT, $32 + MOVQ $0x7fffffe00000, BP /* comm page base */ + // Loop trying to take a consistent snapshot + // of the time parameters. +timeloop: + MOVL gtod_generation(BP), R8 + TESTL R8, R8 + JZ systime + MOVL nt_generation(BP), R9 + TESTL R9, R9 + JZ timeloop + RDTSC + MOVQ nt_tsc_base(BP), R10 + MOVL nt_scale(BP), R11 + MOVQ nt_ns_base(BP), R12 + CMPL nt_generation(BP), R9 + JNE timeloop + MOVQ gtod_ns_base(BP), R13 + MOVQ gtod_sec_base(BP), R14 + CMPL gtod_generation(BP), R8 + JNE timeloop + + // Gathered all the data we need. Compute time. + // ((tsc - nt_tsc_base) * nt_scale) >> 32 + nt_ns_base - gtod_ns_base + gtod_sec_base*1e9 + // The multiply and shift extracts the top 64 bits of the 96-bit product. + SHLQ $32, DX + ADDQ DX, AX + SUBQ R10, AX + MULQ R11 + SHRQ $32, AX:DX + ADDQ R12, AX + SUBQ R13, AX + IMULQ $1000000000, R14 + ADDQ R14, AX + RET + +systime: + // Fall back to system call (usually first call in this thread). 
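+	// $(0x2000000+116) is the BSD gettimeofday system call.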
+ MOVQ SP, DI // must be non-nil, unused + MOVQ $0, SI + MOVL $(0x2000000+116), AX + SYSCALL + // sec is in AX, usec in DX + // return nsec in AX + IMULQ $1000000000, AX + IMULQ $1000, DX + ADDQ DX, AX + RET + +TEXT runtime·nanotime(SB),NOSPLIT,$0-8 + CALL nanotime<>(SB) + MOVQ AX, ret+0(FP) + RET + +// func now() (sec int64, nsec int32) +TEXT time·now(SB),NOSPLIT,$0-12 + CALL nanotime<>(SB) + + // generated code for + // func f(x uint64) (uint64, uint64) { return x/1000000000, x%100000000 } + // adapted to reduce duplication + MOVQ AX, CX + MOVQ $1360296554856532783, AX + MULQ CX + ADDQ CX, DX + RCRQ $1, DX + SHRQ $29, DX + MOVQ DX, sec+0(FP) + IMULQ $1000000000, DX + SUBQ DX, CX + MOVL CX, nsec+8(FP) + RET + +TEXT runtime·sigprocmask(SB),NOSPLIT,$0 + MOVL sig+0(FP), DI + MOVQ new+8(FP), SI + MOVQ old+16(FP), DX + MOVL $(0x2000000+329), AX // pthread_sigmask (on OS X, sigprocmask==entire process) + SYSCALL + JCC 2(PC) + MOVL $0xf1, 0xf1 // crash + RET + +TEXT runtime·sigaction(SB),NOSPLIT,$0 + MOVL mode+0(FP), DI // arg 1 sig + MOVQ new+8(FP), SI // arg 2 act + MOVQ old+16(FP), DX // arg 3 oact + MOVQ old+16(FP), CX // arg 3 oact + MOVQ old+16(FP), R10 // arg 3 oact + MOVL $(0x2000000+46), AX // syscall entry + SYSCALL + JCC 2(PC) + MOVL $0xf1, 0xf1 // crash + RET + +TEXT runtime·sigtramp(SB),NOSPLIT,$64 + get_tls(BX) + + MOVQ R8, 32(SP) // save ucontext + MOVQ SI, 40(SP) // save infostyle + + // check that g exists + MOVQ g(BX), R10 + CMPQ R10, $0 + JNE 5(PC) + MOVL DX, 0(SP) + MOVQ $runtime·badsignal(SB), AX + CALL AX + JMP sigtramp_ret + + // save g + MOVQ R10, 48(SP) + + // g = m->gsignal + MOVQ g_m(R10), BP + MOVQ m_gsignal(BP), BP + MOVQ BP, g(BX) + + MOVL DX, 0(SP) + MOVQ CX, 8(SP) + MOVQ R8, 16(SP) + MOVQ R10, 24(SP) + + CALL DI + + // restore g + get_tls(BX) + MOVQ 48(SP), R10 + MOVQ R10, g(BX) + +sigtramp_ret: + // call sigreturn + MOVL $(0x2000000+184), AX // sigreturn(ucontext, infostyle) + MOVQ 32(SP), DI // saved ucontext + MOVQ 40(SP), SI // saved infostyle + SYSCALL + INT $3 // not reached + +TEXT runtime·mmap(SB),NOSPLIT,$0 + MOVQ addr+0(FP), DI // arg 1 addr + MOVQ n+8(FP), SI // arg 2 len + MOVL prot+16(FP), DX // arg 3 prot + MOVL flags+20(FP), R10 // arg 4 flags + MOVL fd+24(FP), R8 // arg 5 fid + MOVL off+28(FP), R9 // arg 6 offset + MOVL $(0x2000000+197), AX // syscall entry + SYSCALL + MOVQ AX, ret+32(FP) + RET + +TEXT runtime·munmap(SB),NOSPLIT,$0 + MOVQ addr+0(FP), DI // arg 1 addr + MOVQ n+8(FP), SI // arg 2 len + MOVL $(0x2000000+73), AX // syscall entry + SYSCALL + JCC 2(PC) + MOVL $0xf1, 0xf1 // crash + RET + +TEXT runtime·sigaltstack(SB),NOSPLIT,$0 + MOVQ new+8(SP), DI + MOVQ old+16(SP), SI + MOVQ $(0x2000000+53), AX + SYSCALL + JCC 2(PC) + MOVL $0xf1, 0xf1 // crash + RET + +TEXT runtime·usleep(SB),NOSPLIT,$16 + MOVL $0, DX + MOVL usec+0(FP), AX + MOVL $1000000, CX + DIVL CX + MOVQ AX, 0(SP) // sec + MOVL DX, 8(SP) // usec + + // select(0, 0, 0, 0, &tv) + MOVL $0, DI + MOVL $0, SI + MOVL $0, DX + MOVL $0, R10 + MOVQ SP, R8 + MOVL $(0x2000000+93), AX + SYSCALL + RET + +// void bsdthread_create(void *stk, M *mp, G *gp, void (*fn)(void)) +TEXT runtime·bsdthread_create(SB),NOSPLIT,$0 + // Set up arguments to bsdthread_create system call. + // The ones in quotes pass through to the thread callback + // uninterpreted, so we can put whatever we want there. 
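+	// bsdthread_start below receives them as DX = "func", CX = "arg",
+	// and DI = "pthread".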
+ MOVQ fn+32(SP), DI // "func" + MOVQ mm+16(SP), SI // "arg" + MOVQ stk+8(SP), DX // stack + MOVQ gg+24(SP), R10 // "pthread" + MOVQ $0x01000000, R8 // flags = PTHREAD_START_CUSTOM + MOVQ $0, R9 // paranoia + MOVQ $(0x2000000+360), AX // bsdthread_create + SYSCALL + JCC 4(PC) + NEGQ AX + MOVL AX, ret+32(FP) + RET + MOVL $0, AX + MOVL AX, ret+32(FP) + RET + +// The thread that bsdthread_create creates starts executing here, +// because we registered this function using bsdthread_register +// at startup. +// DI = "pthread" +// SI = mach thread port +// DX = "func" (= fn) +// CX = "arg" (= m) +// R8 = stack +// R9 = flags (= 0) +// SP = stack - C_64_REDZONE_LEN (= stack - 128) +TEXT runtime·bsdthread_start(SB),NOSPLIT,$0 + MOVQ R8, SP // empirically, SP is very wrong but R8 is right + + PUSHQ DX + PUSHQ CX + PUSHQ SI + + // set up thread local storage pointing at m->tls. + LEAQ m_tls(CX), DI + CALL runtime·settls(SB) + + POPQ SI + POPQ CX + POPQ DX + + get_tls(BX) + MOVQ SI, m_procid(CX) // thread port is m->procid + MOVQ m_g0(CX), AX + MOVQ AX, g(BX) + MOVQ CX, g_m(AX) + CALL runtime·stackcheck(SB) // smashes AX, CX + CALL DX // fn + CALL runtime·exit1(SB) + RET + +// void bsdthread_register(void) +// registers callbacks for threadstart (see bsdthread_create above +// and wqthread and pthsize (not used). returns 0 on success. +TEXT runtime·bsdthread_register(SB),NOSPLIT,$0 + MOVQ $runtime·bsdthread_start(SB), DI // threadstart + MOVQ $0, SI // wqthread, not used by us + MOVQ $0, DX // pthsize, not used by us + MOVQ $0, R10 // dummy_value [sic] + MOVQ $0, R8 // targetconc_ptr + MOVQ $0, R9 // dispatchqueue_offset + MOVQ $(0x2000000+366), AX // bsdthread_register + SYSCALL + JCC 4(PC) + NEGQ AX + MOVL AX, ret+0(FP) + RET + MOVL $0, AX + MOVL AX, ret+0(FP) + RET + +// Mach system calls use 0x1000000 instead of the BSD's 0x2000000. + +// uint32 mach_msg_trap(void*, uint32, uint32, uint32, uint32, uint32, uint32) +TEXT runtime·mach_msg_trap(SB),NOSPLIT,$0 + MOVQ h+0(FP), DI + MOVL op+8(FP), SI + MOVL send_size+12(FP), DX + MOVL rcv_size+16(FP), R10 + MOVL rcv_name+20(FP), R8 + MOVL timeout+24(FP), R9 + MOVL notify+28(FP), R11 + PUSHQ R11 // seventh arg, on stack + MOVL $(0x1000000+31), AX // mach_msg_trap + SYSCALL + POPQ R11 + MOVL AX, ret+32(FP) + RET + +TEXT runtime·mach_task_self(SB),NOSPLIT,$0 + MOVL $(0x1000000+28), AX // task_self_trap + SYSCALL + MOVL AX, ret+0(FP) + RET + +TEXT runtime·mach_thread_self(SB),NOSPLIT,$0 + MOVL $(0x1000000+27), AX // thread_self_trap + SYSCALL + MOVL AX, ret+0(FP) + RET + +TEXT runtime·mach_reply_port(SB),NOSPLIT,$0 + MOVL $(0x1000000+26), AX // mach_reply_port + SYSCALL + MOVL AX, ret+0(FP) + RET + +// Mach provides trap versions of the semaphore ops, +// instead of requiring the use of RPC. 
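+// The runtime's Darwin lock implementation (semasleep/semawakeup)
+// is built on these traps.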
+ +// uint32 mach_semaphore_wait(uint32) +TEXT runtime·mach_semaphore_wait(SB),NOSPLIT,$0 + MOVL sema+0(FP), DI + MOVL $(0x1000000+36), AX // semaphore_wait_trap + SYSCALL + MOVL AX, ret+8(FP) + RET + +// uint32 mach_semaphore_timedwait(uint32, uint32, uint32) +TEXT runtime·mach_semaphore_timedwait(SB),NOSPLIT,$0 + MOVL sema+0(FP), DI + MOVL sec+4(FP), SI + MOVL nsec+8(FP), DX + MOVL $(0x1000000+38), AX // semaphore_timedwait_trap + SYSCALL + MOVL AX, ret+16(FP) + RET + +// uint32 mach_semaphore_signal(uint32) +TEXT runtime·mach_semaphore_signal(SB),NOSPLIT,$0 + MOVL sema+0(FP), DI + MOVL $(0x1000000+33), AX // semaphore_signal_trap + SYSCALL + MOVL AX, ret+8(FP) + RET + +// uint32 mach_semaphore_signal_all(uint32) +TEXT runtime·mach_semaphore_signal_all(SB),NOSPLIT,$0 + MOVL sema+0(FP), DI + MOVL $(0x1000000+34), AX // semaphore_signal_all_trap + SYSCALL + MOVL AX, ret+8(FP) + RET + +// set tls base to DI +TEXT runtime·settls(SB),NOSPLIT,$32 + /* + * Same as in sys_darwin_386.s:/ugliness, different constant. + * See cgo/gcc_darwin_amd64.c for the derivation + * of the constant. + */ + SUBQ $0x8a0, DI + + MOVL $(0x3000000+3), AX // thread_fast_set_cthread_self - machdep call #3 + SYSCALL + RET + +TEXT runtime·sysctl(SB),NOSPLIT,$0 + MOVQ mib+0(FP), DI + MOVL miblen+8(FP), SI + MOVQ out+16(FP), DX + MOVQ size+24(FP), R10 + MOVQ dst+32(FP), R8 + MOVQ ndst+40(FP), R9 + MOVL $(0x2000000+202), AX // syscall entry + SYSCALL + JCC 4(PC) + NEGQ AX + MOVL AX, ret+48(FP) + RET + MOVL $0, AX + MOVL AX, ret+48(FP) + RET + +// int32 runtime·kqueue(void); +TEXT runtime·kqueue(SB),NOSPLIT,$0 + MOVQ $0, DI + MOVQ $0, SI + MOVQ $0, DX + MOVL $(0x2000000+362), AX + SYSCALL + JCC 2(PC) + NEGQ AX + MOVL AX, ret+0(FP) + RET + +// int32 runtime·kevent(int kq, Kevent *changelist, int nchanges, Kevent *eventlist, int nevents, Timespec *timeout); +TEXT runtime·kevent(SB),NOSPLIT,$0 + MOVL fd+0(FP), DI + MOVQ ev1+8(FP), SI + MOVL nev1+16(FP), DX + MOVQ ev2+24(FP), R10 + MOVL nev2+32(FP), R8 + MOVQ ts+40(FP), R9 + MOVL $(0x2000000+363), AX + SYSCALL + JCC 2(PC) + NEGQ AX + MOVL AX, ret+48(FP) + RET + +// void runtime·closeonexec(int32 fd); +TEXT runtime·closeonexec(SB),NOSPLIT,$0 + MOVL fd+0(FP), DI // fd + MOVQ $2, SI // F_SETFD + MOVQ $1, DX // FD_CLOEXEC + MOVL $(0x2000000+92), AX // fcntl + SYSCALL + RET diff --git a/src/runtime/sys_dragonfly_386.s b/src/runtime/sys_dragonfly_386.s new file mode 100644 index 000000000..161eaec19 --- /dev/null +++ b/src/runtime/sys_dragonfly_386.s @@ -0,0 +1,381 @@ +// Copyright 2009 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. +// +// System calls and other sys.stuff for 386, FreeBSD +// /usr/src/sys/kern/syscalls.master for syscall numbers. +// + +#include "zasm_GOOS_GOARCH.h" +#include "textflag.h" + +TEXT runtime·sys_umtx_sleep(SB),NOSPLIT,$-4 + MOVL $469, AX // umtx_sleep + INT $0x80 + JAE 2(PC) + NEGL AX + MOVL AX, ret+12(FP) + RET + +TEXT runtime·sys_umtx_wakeup(SB),NOSPLIT,$-4 + MOVL $470, AX // umtx_wakeup + INT $0x80 + JAE 2(PC) + NEGL AX + MOVL AX, ret+8(FP) + RET + +TEXT runtime·lwp_create(SB),NOSPLIT,$-4 + MOVL $495, AX // lwp_create + INT $0x80 + MOVL AX, ret+4(FP) + RET + +TEXT runtime·lwp_start(SB),NOSPLIT,$0 + + // Set GS to point at m->tls. + MOVL mm+0(FP), BX + MOVL m_g0(BX), DX + LEAL m_tls(BX), BP + PUSHAL + PUSHL BP + CALL runtime·settls(SB) + POPL AX + POPAL + + // Now segment is established. Initialize m, g. 
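+	// BX still holds the new m and DX its g0; PUSHAL/POPAL preserved
+	// them across the settls call above.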
+ get_tls(CX) + MOVL BX, g_m(DX) + MOVL DX, g(CX) + + CALL runtime·stackcheck(SB) // smashes AX, CX + MOVL 0(DX), DX // paranoia; check they are not nil + MOVL 0(BX), BX + + // More paranoia; check that stack splitting code works. + PUSHAL + CALL runtime·emptyfunc(SB) + POPAL + + CALL runtime·mstart(SB) + + CALL runtime·exit1(SB) + MOVL $0x1234, 0x1005 + RET + +// Exit the entire program (like C exit) +TEXT runtime·exit(SB),NOSPLIT,$-4 + MOVL $1, AX + INT $0x80 + MOVL $0xf1, 0xf1 // crash + RET + +TEXT runtime·exit1(SB),NOSPLIT,$16 + MOVL $0, 0(SP) // syscall gap + MOVL $0x10000, 4(SP) // arg 1 - how (EXTEXIT_LWP) + MOVL $0, 8(SP) // arg 2 - status + MOVL $0, 12(SP) // arg 3 - addr + MOVL $494, AX + INT $0x80 + JAE 2(PC) + MOVL $0xf1, 0xf1 // crash + RET + +TEXT runtime·open(SB),NOSPLIT,$-4 + MOVL $5, AX + INT $0x80 + MOVL AX, ret+12(FP) + RET + +TEXT runtime·close(SB),NOSPLIT,$-4 + MOVL $6, AX + INT $0x80 + MOVL AX, ret+4(FP) + RET + +TEXT runtime·read(SB),NOSPLIT,$-4 + MOVL $3, AX + INT $0x80 + MOVL AX, ret+12(FP) + RET + +TEXT runtime·write(SB),NOSPLIT,$-4 + MOVL $4, AX + INT $0x80 + MOVL AX, ret+12(FP) + RET + +TEXT runtime·getrlimit(SB),NOSPLIT,$-4 + MOVL $194, AX + INT $0x80 + MOVL AX, ret+8(FP) + RET + +TEXT runtime·raise(SB),NOSPLIT,$16 + MOVL $496, AX // lwp_gettid + INT $0x80 + MOVL $0, 0(SP) + MOVL $-1, 4(SP) // arg 1 - pid + MOVL AX, 8(SP) // arg 2 - tid + MOVL sig+0(FP), AX + MOVL AX, 8(SP) // arg 3 - signum + MOVL $497, AX // lwp_kill + INT $0x80 + RET + +TEXT runtime·mmap(SB),NOSPLIT,$36 + LEAL addr+0(FP), SI + LEAL 4(SP), DI + CLD + MOVSL // arg 1 - addr + MOVSL // arg 2 - len + MOVSL // arg 3 - prot + MOVSL // arg 4 - flags + MOVSL // arg 5 - fd + MOVL $0, AX + STOSL // arg 6 - pad + MOVSL // arg 7 - offset + MOVL $0, AX // top 32 bits of file offset + STOSL + MOVL $197, AX // sys_mmap + INT $0x80 + MOVL AX, ret+24(FP) + RET + +TEXT runtime·munmap(SB),NOSPLIT,$-4 + MOVL $73, AX + INT $0x80 + JAE 2(PC) + MOVL $0xf1, 0xf1 // crash + RET + +TEXT runtime·madvise(SB),NOSPLIT,$-4 + MOVL $75, AX // madvise + INT $0x80 + // ignore failure - maybe pages are locked + RET + +TEXT runtime·setitimer(SB), NOSPLIT, $-4 + MOVL $83, AX + INT $0x80 + RET + +// func now() (sec int64, nsec int32) +TEXT time·now(SB), NOSPLIT, $32 + MOVL $232, AX + LEAL 12(SP), BX + MOVL $0, 4(SP) // CLOCK_REALTIME + MOVL BX, 8(SP) + INT $0x80 + MOVL 12(SP), AX // sec + MOVL 16(SP), BX // nsec + + // sec is in AX, nsec in BX + MOVL AX, sec+0(FP) + MOVL $0, sec+4(FP) + MOVL BX, nsec+8(FP) + RET + +// int64 nanotime(void) so really +// void nanotime(int64 *nsec) +TEXT runtime·nanotime(SB), NOSPLIT, $32 + MOVL $232, AX + LEAL 12(SP), BX + MOVL $4, 4(SP) // CLOCK_MONOTONIC + MOVL BX, 8(SP) + INT $0x80 + MOVL 12(SP), AX // sec + MOVL 16(SP), BX // nsec + + // sec is in AX, nsec in BX + // convert to DX:AX nsec + MOVL $1000000000, CX + MULL CX + ADDL BX, AX + ADCL $0, DX + + MOVL AX, ret_lo+0(FP) + MOVL DX, ret_hi+4(FP) + RET + + +TEXT runtime·sigaction(SB),NOSPLIT,$-4 + MOVL $342, AX + INT $0x80 + JAE 2(PC) + MOVL $0xf1, 0xf1 // crash + RET + +TEXT runtime·sigtramp(SB),NOSPLIT,$44 + get_tls(CX) + + // check that g exists + MOVL g(CX), DI + CMPL DI, $0 + JNE 6(PC) + MOVL signo+0(FP), BX + MOVL BX, 0(SP) + MOVL $runtime·badsignal(SB), AX + CALL AX + JMP sigtramp_ret + + // save g + MOVL DI, 20(SP) + + // g = m->gsignal + MOVL g_m(DI), BX + MOVL m_gsignal(BX), BX + MOVL BX, g(CX) + + // copy arguments for call to sighandler + MOVL signo+0(FP), BX + MOVL BX, 0(SP) + MOVL info+4(FP), BX + MOVL BX, 4(SP) + MOVL 
context+8(FP), BX + MOVL BX, 8(SP) + MOVL DI, 12(SP) + + CALL runtime·sighandler(SB) + + // restore g + get_tls(CX) + MOVL 20(SP), BX + MOVL BX, g(CX) + +sigtramp_ret: + // call sigreturn + MOVL context+8(FP), AX + MOVL $0, 0(SP) // syscall gap + MOVL AX, 4(SP) + MOVL $344, AX // sigreturn(ucontext) + INT $0x80 + MOVL $0xf1, 0xf1 // crash + RET + +TEXT runtime·sigaltstack(SB),NOSPLIT,$0 + MOVL $53, AX + INT $0x80 + JAE 2(PC) + MOVL $0xf1, 0xf1 // crash + RET + +TEXT runtime·usleep(SB),NOSPLIT,$20 + MOVL $0, DX + MOVL usec+0(FP), AX + MOVL $1000000, CX + DIVL CX + MOVL AX, 12(SP) // tv_sec + MOVL $1000, AX + MULL DX + MOVL AX, 16(SP) // tv_nsec + + MOVL $0, 0(SP) + LEAL 12(SP), AX + MOVL AX, 4(SP) // arg 1 - rqtp + MOVL $0, 8(SP) // arg 2 - rmtp + MOVL $240, AX // sys_nanosleep + INT $0x80 + RET + +TEXT runtime·setldt(SB),NOSPLIT,$4 + // Under DragonFly we set the GS base instead of messing with the LDT. + MOVL tls0+4(FP), AX + MOVL AX, 0(SP) + CALL runtime·settls(SB) + RET + +TEXT runtime·settls(SB),NOSPLIT,$24 + // adjust for ELF: wants to use -8(GS) and -4(GS) for g and m + MOVL tlsbase+0(FP), CX + ADDL $8, CX + + // Set up a struct tls_info - a size of -1 maps the whole address + // space and is required for direct-tls access of variable data + // via negative offsets. + LEAL 16(SP), BX + MOVL CX, 16(SP) // base + MOVL $-1, 20(SP) // size + + // set_tls_area returns the descriptor that needs to be loaded into GS. + MOVL $0, 0(SP) // syscall gap + MOVL $0, 4(SP) // arg 1 - which + MOVL BX, 8(SP) // arg 2 - tls_info + MOVL $8, 12(SP) // arg 3 - infosize + MOVL $472, AX // set_tls_area + INT $0x80 + JCC 2(PC) + MOVL $0xf1, 0xf1 // crash + MOVW AX, GS + RET + +TEXT runtime·sysctl(SB),NOSPLIT,$28 + LEAL mib+0(FP), SI + LEAL 4(SP), DI + CLD + MOVSL // arg 1 - name + MOVSL // arg 2 - namelen + MOVSL // arg 3 - oldp + MOVSL // arg 4 - oldlenp + MOVSL // arg 5 - newp + MOVSL // arg 6 - newlen + MOVL $202, AX // sys___sysctl + INT $0x80 + JCC 4(PC) + NEGL AX + MOVL AX, ret+24(FP) + RET + MOVL $0, AX + MOVL AX, ret+24(FP) + RET + +TEXT runtime·osyield(SB),NOSPLIT,$-4 + MOVL $331, AX // sys_sched_yield + INT $0x80 + RET + +TEXT runtime·sigprocmask(SB),NOSPLIT,$16 + MOVL $0, 0(SP) // syscall gap + MOVL $3, 4(SP) // arg 1 - how (SIG_SETMASK) + MOVL new+0(FP), AX + MOVL AX, 8(SP) // arg 2 - set + MOVL old+4(FP), AX + MOVL AX, 12(SP) // arg 3 - oset + MOVL $340, AX // sys_sigprocmask + INT $0x80 + JAE 2(PC) + MOVL $0xf1, 0xf1 // crash + RET + +// int32 runtime·kqueue(void); +TEXT runtime·kqueue(SB),NOSPLIT,$0 + MOVL $362, AX + INT $0x80 + JAE 2(PC) + NEGL AX + MOVL AX, ret+0(FP) + RET + +// int32 runtime·kevent(int kq, Kevent *changelist, int nchanges, Kevent *eventlist, int nevents, Timespec *timeout); +TEXT runtime·kevent(SB),NOSPLIT,$0 + MOVL $363, AX + INT $0x80 + JAE 2(PC) + NEGL AX + MOVL AX, ret+24(FP) + RET + +// int32 runtime·closeonexec(int32 fd); +TEXT runtime·closeonexec(SB),NOSPLIT,$32 + MOVL $92, AX // fcntl + // 0(SP) is where the caller PC would be; kernel skips it + MOVL fd+0(FP), BX + MOVL BX, 4(SP) // fd + MOVL $2, 8(SP) // F_SETFD + MOVL $1, 12(SP) // FD_CLOEXEC + INT $0x80 + JAE 2(PC) + NEGL AX + RET + +GLOBL runtime·tlsoffset(SB),NOPTR,$4 diff --git a/src/runtime/sys_dragonfly_amd64.s b/src/runtime/sys_dragonfly_amd64.s new file mode 100644 index 000000000..2c756018c --- /dev/null +++ b/src/runtime/sys_dragonfly_amd64.s @@ -0,0 +1,344 @@ +// Copyright 2009 The Go Authors. All rights reserved. 
+// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. +// +// System calls and other sys.stuff for AMD64, FreeBSD +// /usr/src/sys/kern/syscalls.master for syscall numbers. +// + +#include "zasm_GOOS_GOARCH.h" +#include "textflag.h" + +TEXT runtime·sys_umtx_sleep(SB),NOSPLIT,$0 + MOVQ addr+0(FP), DI // arg 1 - ptr + MOVL val+8(FP), SI // arg 2 - value + MOVL timeout+12(FP), DX // arg 3 - timeout + MOVL $469, AX // umtx_sleep + SYSCALL + JCC 2(PC) + NEGQ AX + MOVL AX, ret+16(FP) + RET + +TEXT runtime·sys_umtx_wakeup(SB),NOSPLIT,$0 + MOVQ addr+0(FP), DI // arg 1 - ptr + MOVL val+8(FP), SI // arg 2 - count + MOVL $470, AX // umtx_wakeup + SYSCALL + JCC 2(PC) + NEGQ AX + MOVL AX, ret+16(FP) + RET + +TEXT runtime·lwp_create(SB),NOSPLIT,$0 + MOVQ param+0(FP), DI // arg 1 - params + MOVL $495, AX // lwp_create + SYSCALL + MOVL AX, ret+8(FP) + RET + +TEXT runtime·lwp_start(SB),NOSPLIT,$0 + MOVQ DI, R13 // m + + // set up FS to point at m->tls + LEAQ m_tls(R13), DI + CALL runtime·settls(SB) // smashes DI + + // set up m, g + get_tls(CX) + MOVQ m_g0(R13), DI + MOVQ R13, g_m(DI) + MOVQ DI, g(CX) + + CALL runtime·stackcheck(SB) + CALL runtime·mstart(SB) + + MOVQ 0, AX // crash (not reached) + +// Exit the entire program (like C exit) +TEXT runtime·exit(SB),NOSPLIT,$-8 + MOVL code+0(FP), DI // arg 1 exit status + MOVL $1, AX + SYSCALL + MOVL $0xf1, 0xf1 // crash + RET + +TEXT runtime·exit1(SB),NOSPLIT,$-8 + MOVL code+0(FP), DI // arg 1 exit status + MOVL $431, AX + SYSCALL + MOVL $0xf1, 0xf1 // crash + RET + +TEXT runtime·open(SB),NOSPLIT,$-8 + MOVQ name+0(FP), DI // arg 1 pathname + MOVL mode+8(FP), SI // arg 2 flags + MOVL perm+12(FP), DX // arg 3 mode + MOVL $5, AX + SYSCALL + MOVL AX, ret+16(FP) + RET + +TEXT runtime·close(SB),NOSPLIT,$-8 + MOVL fd+0(FP), DI // arg 1 fd + MOVL $6, AX + SYSCALL + MOVL AX, ret+8(FP) + RET + +TEXT runtime·read(SB),NOSPLIT,$-8 + MOVL fd+0(FP), DI // arg 1 fd + MOVQ p+8(FP), SI // arg 2 buf + MOVL n+16(FP), DX // arg 3 count + MOVL $3, AX + SYSCALL + MOVL AX, ret+24(FP) + RET + +TEXT runtime·write(SB),NOSPLIT,$-8 + MOVQ fd+0(FP), DI // arg 1 fd + MOVQ p+8(FP), SI // arg 2 buf + MOVL n+16(FP), DX // arg 3 count + MOVL $4, AX + SYSCALL + MOVL AX, ret+24(FP) + RET + +TEXT runtime·getrlimit(SB),NOSPLIT,$-8 + MOVL kind+0(FP), DI + MOVQ limit+8(FP), SI + MOVL $194, AX + SYSCALL + MOVL AX, ret+16(FP) + RET + +TEXT runtime·raise(SB),NOSPLIT,$16 + MOVL $496, AX // lwp_gettid + SYSCALL + MOVQ $-1, DI // arg 1 - pid + MOVQ 8(SP), DI // arg 2 - tid + MOVL sig+0(FP), SI // arg 3 - signum + MOVL $497, AX // lwp_kill + SYSCALL + RET + +TEXT runtime·setitimer(SB), NOSPLIT, $-8 + MOVL mode+0(FP), DI + MOVQ new+8(FP), SI + MOVQ old+16(FP), DX + MOVL $83, AX + SYSCALL + RET + +// func now() (sec int64, nsec int32) +TEXT time·now(SB), NOSPLIT, $32 + MOVL $232, AX + MOVQ $0, DI // CLOCK_REALTIME + LEAQ 8(SP), SI + SYSCALL + MOVQ 8(SP), AX // sec + MOVQ 16(SP), DX // nsec + + // sec is in AX, nsec in DX + MOVQ AX, sec+0(FP) + MOVL DX, nsec+8(FP) + RET + +TEXT runtime·nanotime(SB), NOSPLIT, $32 + MOVL $232, AX + MOVQ $4, DI // CLOCK_MONOTONIC + LEAQ 8(SP), SI + SYSCALL + MOVQ 8(SP), AX // sec + MOVQ 16(SP), DX // nsec + + // sec is in AX, nsec in DX + // return nsec in AX + IMULQ $1000000000, AX + ADDQ DX, AX + MOVQ AX, ret+0(FP) + RET + +TEXT runtime·sigaction(SB),NOSPLIT,$-8 + MOVL sig+0(FP), DI // arg 1 sig + MOVQ new+8(FP), SI // arg 2 act + MOVQ old+16(FP), DX // arg 3 oact + MOVL $342, AX + SYSCALL + JCC 2(PC) + MOVL $0xf1, 0xf1 // 
crash + RET + +TEXT runtime·sigtramp(SB),NOSPLIT,$64 + get_tls(BX) + + // check that g exists + MOVQ g(BX), R10 + CMPQ R10, $0 + JNE 5(PC) + MOVQ DI, 0(SP) + MOVQ $runtime·badsignal(SB), AX + CALL AX + RET + + // save g + MOVQ R10, 40(SP) + + // g = m->signal + MOVQ g_m(R10), BP + MOVQ m_gsignal(BP), BP + MOVQ BP, g(BX) + + MOVQ DI, 0(SP) + MOVQ SI, 8(SP) + MOVQ DX, 16(SP) + MOVQ R10, 24(SP) + + CALL runtime·sighandler(SB) + + // restore g + get_tls(BX) + MOVQ 40(SP), R10 + MOVQ R10, g(BX) + RET + +TEXT runtime·mmap(SB),NOSPLIT,$0 + MOVQ addr+0(FP), DI // arg 1 - addr + MOVQ n+8(FP), SI // arg 2 - len + MOVL prot+16(FP), DX // arg 3 - prot + MOVL flags+20(FP), R10 // arg 4 - flags + MOVL fd+24(FP), R8 // arg 5 - fd + MOVL off+28(FP), R9 + SUBQ $16, SP + MOVQ R9, 8(SP) // arg 7 - offset (passed on stack) + MOVQ $0, R9 // arg 6 - pad + MOVL $197, AX + SYSCALL + ADDQ $16, SP + MOVQ AX, ret+32(FP) + RET + +TEXT runtime·munmap(SB),NOSPLIT,$0 + MOVQ addr+0(FP), DI // arg 1 addr + MOVQ n+8(FP), SI // arg 2 len + MOVL $73, AX + SYSCALL + JCC 2(PC) + MOVL $0xf1, 0xf1 // crash + RET + +TEXT runtime·madvise(SB),NOSPLIT,$0 + MOVQ addr+0(FP), DI + MOVQ n+8(FP), SI + MOVL flags+16(FP), DX + MOVQ $75, AX // madvise + SYSCALL + // ignore failure - maybe pages are locked + RET + +TEXT runtime·sigaltstack(SB),NOSPLIT,$-8 + MOVQ new+8(SP), DI + MOVQ old+16(SP), SI + MOVQ $53, AX + SYSCALL + JCC 2(PC) + MOVL $0xf1, 0xf1 // crash + RET + +TEXT runtime·usleep(SB),NOSPLIT,$16 + MOVL $0, DX + MOVL usec+0(FP), AX + MOVL $1000000, CX + DIVL CX + MOVQ AX, 0(SP) // tv_sec + MOVL $1000, AX + MULL DX + MOVQ AX, 8(SP) // tv_nsec + + MOVQ SP, DI // arg 1 - rqtp + MOVQ $0, SI // arg 2 - rmtp + MOVL $240, AX // sys_nanosleep + SYSCALL + RET + +// set tls base to DI +TEXT runtime·settls(SB),NOSPLIT,$16 + ADDQ $16, DI // adjust for ELF: wants to use -16(FS) and -8(FS) for g and m + MOVQ DI, 0(SP) + MOVQ $16, 8(SP) + MOVQ $0, DI // arg 1 - which + MOVQ SP, SI // arg 2 - tls_info + MOVQ $16, DX // arg 3 - infosize + MOVQ $472, AX // set_tls_area + SYSCALL + JCC 2(PC) + MOVL $0xf1, 0xf1 // crash + RET + +TEXT runtime·sysctl(SB),NOSPLIT,$0 + MOVQ mib+0(FP), DI // arg 1 - name + MOVL miblen+8(FP), SI // arg 2 - namelen + MOVQ out+16(FP), DX // arg 3 - oldp + MOVQ size+24(FP), R10 // arg 4 - oldlenp + MOVQ dst+32(FP), R8 // arg 5 - newp + MOVQ ndst+40(FP), R9 // arg 6 - newlen + MOVQ $202, AX // sys___sysctl + SYSCALL + JCC 4(PC) + NEGQ AX + MOVL AX, ret+48(FP) + RET + MOVL $0, AX + MOVL AX, ret+48(FP) + RET + +TEXT runtime·osyield(SB),NOSPLIT,$-4 + MOVL $331, AX // sys_sched_yield + SYSCALL + RET + +TEXT runtime·sigprocmask(SB),NOSPLIT,$0 + MOVL $3, DI // arg 1 - how (SIG_SETMASK) + MOVQ new+0(FP), SI // arg 2 - set + MOVQ old+8(FP), DX // arg 3 - oset + MOVL $340, AX // sys_sigprocmask + SYSCALL + JAE 2(PC) + MOVL $0xf1, 0xf1 // crash + RET + +// int32 runtime·kqueue(void); +TEXT runtime·kqueue(SB),NOSPLIT,$0 + MOVQ $0, DI + MOVQ $0, SI + MOVQ $0, DX + MOVL $362, AX + SYSCALL + JCC 2(PC) + NEGQ AX + MOVL AX, ret+0(FP) + RET + +// int32 runtime·kevent(int kq, Kevent *changelist, int nchanges, Kevent *eventlist, int nevents, Timespec *timeout); +TEXT runtime·kevent(SB),NOSPLIT,$0 + MOVL fd+0(FP), DI + MOVQ ev1+8(FP), SI + MOVL nev1+16(FP), DX + MOVQ ev2+24(FP), R10 + MOVL nev2+32(FP), R8 + MOVQ ts+40(FP), R9 + MOVL $363, AX + SYSCALL + JCC 2(PC) + NEGQ AX + MOVL AX, ret+48(FP) + RET + +// void runtime·closeonexec(int32 fd); +TEXT runtime·closeonexec(SB),NOSPLIT,$0 + MOVL fd+0(FP), DI // fd + MOVQ $2, SI // F_SETFD + MOVQ $1, DX // 
FD_CLOEXEC + MOVL $92, AX // fcntl + SYSCALL + RET diff --git a/src/runtime/sys_freebsd_386.s b/src/runtime/sys_freebsd_386.s new file mode 100644 index 000000000..2c40fc433 --- /dev/null +++ b/src/runtime/sys_freebsd_386.s @@ -0,0 +1,391 @@ +// Copyright 2009 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. +// +// System calls and other sys.stuff for 386, FreeBSD +// /usr/src/sys/kern/syscalls.master for syscall numbers. +// + +#include "zasm_GOOS_GOARCH.h" +#include "textflag.h" + +TEXT runtime·sys_umtx_op(SB),NOSPLIT,$-4 + MOVL $454, AX + INT $0x80 + MOVL AX, ret+20(FP) + RET + +TEXT runtime·thr_new(SB),NOSPLIT,$-4 + MOVL $455, AX + INT $0x80 + RET + +TEXT runtime·thr_start(SB),NOSPLIT,$0 + MOVL mm+0(FP), AX + MOVL m_g0(AX), BX + LEAL m_tls(AX), BP + MOVL 0(BP), DI + ADDL $7, DI + PUSHAL + PUSHL $32 + PUSHL BP + PUSHL DI + CALL runtime·setldt(SB) + POPL AX + POPL AX + POPL AX + POPAL + get_tls(CX) + MOVL BX, g(CX) + + MOVL AX, g_m(BX) + CALL runtime·stackcheck(SB) // smashes AX + CALL runtime·mstart(SB) + + MOVL 0, AX // crash (not reached) + +// Exit the entire program (like C exit) +TEXT runtime·exit(SB),NOSPLIT,$-4 + MOVL $1, AX + INT $0x80 + MOVL $0xf1, 0xf1 // crash + RET + +TEXT runtime·exit1(SB),NOSPLIT,$-4 + MOVL $431, AX + INT $0x80 + JAE 2(PC) + MOVL $0xf1, 0xf1 // crash + RET + +TEXT runtime·open(SB),NOSPLIT,$-4 + MOVL $5, AX + INT $0x80 + MOVL AX, ret+12(FP) + RET + +TEXT runtime·close(SB),NOSPLIT,$-4 + MOVL $6, AX + INT $0x80 + MOVL AX, ret+4(FP) + RET + +TEXT runtime·read(SB),NOSPLIT,$-4 + MOVL $3, AX + INT $0x80 + MOVL AX, ret+12(FP) + RET + +TEXT runtime·write(SB),NOSPLIT,$-4 + MOVL $4, AX + INT $0x80 + MOVL AX, ret+12(FP) + RET + +TEXT runtime·getrlimit(SB),NOSPLIT,$-4 + MOVL $194, AX + INT $0x80 + MOVL AX, ret+8(FP) + RET + +TEXT runtime·raise(SB),NOSPLIT,$16 + // thr_self(&8(SP)) + LEAL 8(SP), AX + MOVL AX, 4(SP) + MOVL $432, AX + INT $0x80 + // thr_kill(self, SIGPIPE) + MOVL 8(SP), AX + MOVL AX, 4(SP) + MOVL sig+0(FP), AX + MOVL AX, 8(SP) + MOVL $433, AX + INT $0x80 + RET + +TEXT runtime·mmap(SB),NOSPLIT,$32 + LEAL addr+0(FP), SI + LEAL 4(SP), DI + CLD + MOVSL + MOVSL + MOVSL + MOVSL + MOVSL + MOVSL + MOVL $0, AX // top 32 bits of file offset + STOSL + MOVL $477, AX + INT $0x80 + MOVL AX, ret+24(FP) + RET + +TEXT runtime·munmap(SB),NOSPLIT,$-4 + MOVL $73, AX + INT $0x80 + JAE 2(PC) + MOVL $0xf1, 0xf1 // crash + RET + +TEXT runtime·madvise(SB),NOSPLIT,$-4 + MOVL $75, AX // madvise + INT $0x80 + // ignore failure - maybe pages are locked + RET + +TEXT runtime·setitimer(SB), NOSPLIT, $-4 + MOVL $83, AX + INT $0x80 + RET + +// func now() (sec int64, nsec int32) +TEXT time·now(SB), NOSPLIT, $32 + MOVL $232, AX + LEAL 12(SP), BX + MOVL $0, 4(SP) // CLOCK_REALTIME + MOVL BX, 8(SP) + INT $0x80 + MOVL 12(SP), AX // sec + MOVL 16(SP), BX // nsec + + // sec is in AX, nsec in BX + MOVL AX, sec+0(FP) + MOVL $0, sec+4(FP) + MOVL BX, nsec+8(FP) + RET + +// int64 nanotime(void) so really +// void nanotime(int64 *nsec) +TEXT runtime·nanotime(SB), NOSPLIT, $32 + MOVL $232, AX + LEAL 12(SP), BX + // We can use CLOCK_MONOTONIC_FAST here when we drop + // support for FreeBSD 8-STABLE. 
+ MOVL $4, 4(SP) // CLOCK_MONOTONIC + MOVL BX, 8(SP) + INT $0x80 + MOVL 12(SP), AX // sec + MOVL 16(SP), BX // nsec + + // sec is in AX, nsec in BX + // convert to DX:AX nsec + MOVL $1000000000, CX + MULL CX + ADDL BX, AX + ADCL $0, DX + + MOVL AX, ret_lo+0(FP) + MOVL DX, ret_hi+4(FP) + RET + + +TEXT runtime·sigaction(SB),NOSPLIT,$-4 + MOVL $416, AX + INT $0x80 + JAE 2(PC) + MOVL $0xf1, 0xf1 // crash + RET + +TEXT runtime·sigtramp(SB),NOSPLIT,$44 + get_tls(CX) + + // check that g exists + MOVL g(CX), DI + CMPL DI, $0 + JNE 6(PC) + MOVL signo+0(FP), BX + MOVL BX, 0(SP) + MOVL $runtime·badsignal(SB), AX + CALL AX + JMP sigtramp_ret + + // save g + MOVL DI, 20(SP) + + // g = m->gsignal + MOVL g_m(DI), BX + MOVL m_gsignal(BX), BX + MOVL BX, g(CX) + + // copy arguments for call to sighandler + MOVL signo+0(FP), BX + MOVL BX, 0(SP) + MOVL info+4(FP), BX + MOVL BX, 4(SP) + MOVL context+8(FP), BX + MOVL BX, 8(SP) + MOVL DI, 12(SP) + + CALL runtime·sighandler(SB) + + // restore g + get_tls(CX) + MOVL 20(SP), BX + MOVL BX, g(CX) + +sigtramp_ret: + // call sigreturn + MOVL context+8(FP), AX + MOVL $0, 0(SP) // syscall gap + MOVL AX, 4(SP) + MOVL $417, AX // sigreturn(ucontext) + INT $0x80 + MOVL $0xf1, 0xf1 // crash + RET + +TEXT runtime·sigaltstack(SB),NOSPLIT,$0 + MOVL $53, AX + INT $0x80 + JAE 2(PC) + MOVL $0xf1, 0xf1 // crash + RET + +TEXT runtime·usleep(SB),NOSPLIT,$20 + MOVL $0, DX + MOVL usec+0(FP), AX + MOVL $1000000, CX + DIVL CX + MOVL AX, 12(SP) // tv_sec + MOVL $1000, AX + MULL DX + MOVL AX, 16(SP) // tv_nsec + + MOVL $0, 0(SP) + LEAL 12(SP), AX + MOVL AX, 4(SP) // arg 1 - rqtp + MOVL $0, 8(SP) // arg 2 - rmtp + MOVL $240, AX // sys_nanosleep + INT $0x80 + RET + +/* +descriptor entry format for system call +is the native machine format, ugly as it is: + + 2-byte limit + 3-byte base + 1-byte: 0x80=present, 0x60=dpl<<5, 0x1F=type + 1-byte: 0x80=limit is *4k, 0x40=32-bit operand size, + 0x0F=4 more bits of limit + 1 byte: 8 more bits of base + +int i386_get_ldt(int, union ldt_entry *, int); +int i386_set_ldt(int, const union ldt_entry *, int); + +*/ + +// setldt(int entry, int address, int limit) +TEXT runtime·setldt(SB),NOSPLIT,$32 + MOVL address+4(FP), BX // aka base + // see comment in sys_linux_386.s; freebsd is similar + ADDL $0x8, BX + + // set up data_desc + LEAL 16(SP), AX // struct data_desc + MOVL $0, 0(AX) + MOVL $0, 4(AX) + + MOVW BX, 2(AX) + SHRL $16, BX + MOVB BX, 4(AX) + SHRL $8, BX + MOVB BX, 7(AX) + + MOVW $0xffff, 0(AX) + MOVB $0xCF, 6(AX) // 32-bit operand, 4k limit unit, 4 more bits of limit + + MOVB $0xF2, 5(AX) // r/w data descriptor, dpl=3, present + + // call i386_set_ldt(entry, desc, 1) + MOVL $0xffffffff, 0(SP) // auto-allocate entry and return in AX + MOVL AX, 4(SP) + MOVL $1, 8(SP) + CALL runtime·i386_set_ldt(SB) + + // compute segment selector - (entry*8+7) + SHLL $3, AX + ADDL $7, AX + MOVW AX, GS + RET + +TEXT runtime·i386_set_ldt(SB),NOSPLIT,$16 + LEAL args+0(FP), AX // 0(FP) == 4(SP) before SP got moved + MOVL $0, 0(SP) // syscall gap + MOVL $1, 4(SP) + MOVL AX, 8(SP) + MOVL $165, AX + INT $0x80 + JAE 2(PC) + INT $3 + RET + +TEXT runtime·sysctl(SB),NOSPLIT,$28 + LEAL mib+0(FP), SI + LEAL 4(SP), DI + CLD + MOVSL // arg 1 - name + MOVSL // arg 2 - namelen + MOVSL // arg 3 - oldp + MOVSL // arg 4 - oldlenp + MOVSL // arg 5 - newp + MOVSL // arg 6 - newlen + MOVL $202, AX // sys___sysctl + INT $0x80 + JAE 4(PC) + NEGL AX + MOVL AX, ret+24(FP) + RET + MOVL $0, AX + MOVL AX, ret+24(FP) + RET + +TEXT runtime·osyield(SB),NOSPLIT,$-4 + MOVL $331, AX // sys_sched_yield 
+ INT $0x80 + RET + +TEXT runtime·sigprocmask(SB),NOSPLIT,$16 + MOVL $0, 0(SP) // syscall gap + MOVL $3, 4(SP) // arg 1 - how (SIG_SETMASK) + MOVL new+0(FP), AX + MOVL AX, 8(SP) // arg 2 - set + MOVL old+4(FP), AX + MOVL AX, 12(SP) // arg 3 - oset + MOVL $340, AX // sys_sigprocmask + INT $0x80 + JAE 2(PC) + MOVL $0xf1, 0xf1 // crash + RET + +// int32 runtime·kqueue(void); +TEXT runtime·kqueue(SB),NOSPLIT,$0 + MOVL $362, AX + INT $0x80 + JAE 2(PC) + NEGL AX + MOVL AX, ret+0(FP) + RET + +// int32 runtime·kevent(int kq, Kevent *changelist, int nchanges, Kevent *eventlist, int nevents, Timespec *timeout); +TEXT runtime·kevent(SB),NOSPLIT,$0 + MOVL $363, AX + INT $0x80 + JAE 2(PC) + NEGL AX + MOVL AX, ret+24(FP) + RET + +// int32 runtime·closeonexec(int32 fd); +TEXT runtime·closeonexec(SB),NOSPLIT,$32 + MOVL $92, AX // fcntl + // 0(SP) is where the caller PC would be; kernel skips it + MOVL fd+0(FP), BX + MOVL BX, 4(SP) // fd + MOVL $2, 8(SP) // F_SETFD + MOVL $1, 12(SP) // FD_CLOEXEC + INT $0x80 + JAE 2(PC) + NEGL AX + RET + +GLOBL runtime·tlsoffset(SB),NOPTR,$4 diff --git a/src/runtime/sys_freebsd_amd64.s b/src/runtime/sys_freebsd_amd64.s new file mode 100644 index 000000000..65f8c1a6e --- /dev/null +++ b/src/runtime/sys_freebsd_amd64.s @@ -0,0 +1,357 @@ +// Copyright 2009 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. +// +// System calls and other sys.stuff for AMD64, FreeBSD +// /usr/src/sys/kern/syscalls.master for syscall numbers. +// + +#include "zasm_GOOS_GOARCH.h" +#include "textflag.h" + +// FreeBSD 8, FreeBSD 9, and older versions that I have checked +// do not restore R10 on exit from a "restarted" system call +// if you use the SYSCALL instruction. This means that, for example, +// if a signal arrives while the wait4 system call is executing, +// the wait4 internally returns ERESTART, which makes the kernel +// back up the PC to execute the SYSCALL instruction a second time. +// However, since the kernel does not restore R10, the fourth +// argument to the system call has been lost. (FreeBSD 9 also fails +// to restore the fifth and sixth arguments, R8 and R9, although +// some earlier versions did restore those correctly.) +// The broken code is in fast_syscall in FreeBSD's amd64/amd64/exception.S. +// It restores only DI, SI, DX, AX, and RFLAGS on system call return. +// http://fxr.watson.org/fxr/source/amd64/amd64/exception.S?v=FREEBSD91#L399 +// +// The INT $0x80 system call path (int0x80_syscall in FreeBSD's +// amd64/ia32/ia32_exception.S) does not have this problem, +// but it expects the third argument in R10. Instead of rewriting +// all the assembly in this file, #define SYSCALL to a safe simulation +// using INT $0x80. +// +// INT $0x80 is a little slower than SYSCALL, but correctness wins. +// +// See golang.org/issue/6372. 
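To make the effect of the workaround concrete, here is the first wrapper of this file with the SYSCALL macro (defined just below) expanded by hand; this is purely the textual expansion of that #define, not additional code in the commit:

	TEXT runtime·sys_umtx_op(SB),NOSPLIT,$0
		MOVQ addr+0(FP), DI
		MOVL mode+8(FP), SI
		MOVL val+12(FP), DX
		MOVQ ptr2+16(FP), R10
		MOVQ ts+24(FP), R8
		MOVL $454, AX
		MOVQ R10, CX        // macro body: copy the R10 argument into CX before the trap
		INT  $0x80          // macro body: enter the kernel via the interrupt gate, not SYSCALL
		MOVL AX, ret+32(FP)
		RET

Every other SYSCALL in this file expands the same way, which is why the register assignments below still follow the SYSCALL convention even though the actual kernel entry uses INT $0x80.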
+#define SYSCALL MOVQ R10, CX; INT $0x80 + +TEXT runtime·sys_umtx_op(SB),NOSPLIT,$0 + MOVQ addr+0(FP), DI + MOVL mode+8(FP), SI + MOVL val+12(FP), DX + MOVQ ptr2+16(FP), R10 + MOVQ ts+24(FP), R8 + MOVL $454, AX + SYSCALL + MOVL AX, ret+32(FP) + RET + +TEXT runtime·thr_new(SB),NOSPLIT,$0 + MOVQ param+0(FP), DI + MOVL size+8(FP), SI + MOVL $455, AX + SYSCALL + RET + +TEXT runtime·thr_start(SB),NOSPLIT,$0 + MOVQ DI, R13 // m + + // set up FS to point at m->tls + LEAQ m_tls(R13), DI + CALL runtime·settls(SB) // smashes DI + + // set up m, g + get_tls(CX) + MOVQ m_g0(R13), DI + MOVQ R13, g_m(DI) + MOVQ DI, g(CX) + + CALL runtime·stackcheck(SB) + CALL runtime·mstart(SB) + + MOVQ 0, AX // crash (not reached) + +// Exit the entire program (like C exit) +TEXT runtime·exit(SB),NOSPLIT,$-8 + MOVL code+0(FP), DI // arg 1 exit status + MOVL $1, AX + SYSCALL + MOVL $0xf1, 0xf1 // crash + RET + +TEXT runtime·exit1(SB),NOSPLIT,$-8 + MOVL code+0(FP), DI // arg 1 exit status + MOVL $431, AX + SYSCALL + MOVL $0xf1, 0xf1 // crash + RET + +TEXT runtime·open(SB),NOSPLIT,$-8 + MOVQ name+0(FP), DI // arg 1 pathname + MOVL mode+8(FP), SI // arg 2 flags + MOVL perm+12(FP), DX // arg 3 mode + MOVL $5, AX + SYSCALL + MOVL AX, ret+16(FP) + RET + +TEXT runtime·close(SB),NOSPLIT,$-8 + MOVL fd+0(FP), DI // arg 1 fd + MOVL $6, AX + SYSCALL + MOVL AX, ret+8(FP) + RET + +TEXT runtime·read(SB),NOSPLIT,$-8 + MOVL fd+0(FP), DI // arg 1 fd + MOVQ p+8(FP), SI // arg 2 buf + MOVL n+16(FP), DX // arg 3 count + MOVL $3, AX + SYSCALL + MOVL AX, ret+24(FP) + RET + +TEXT runtime·write(SB),NOSPLIT,$-8 + MOVQ fd+0(FP), DI // arg 1 fd + MOVQ p+8(FP), SI // arg 2 buf + MOVL n+16(FP), DX // arg 3 count + MOVL $4, AX + SYSCALL + MOVL AX, ret+24(FP) + RET + +TEXT runtime·getrlimit(SB),NOSPLIT,$-8 + MOVL kind+0(FP), DI + MOVQ limit+8(FP), SI + MOVL $194, AX + SYSCALL + MOVL AX, ret+16(FP) + RET + +TEXT runtime·raise(SB),NOSPLIT,$16 + // thr_self(&8(SP)) + LEAQ 8(SP), DI // arg 1 &8(SP) + MOVL $432, AX + SYSCALL + // thr_kill(self, SIGPIPE) + MOVQ 8(SP), DI // arg 1 id + MOVL sig+0(FP), SI // arg 2 + MOVL $433, AX + SYSCALL + RET + +TEXT runtime·setitimer(SB), NOSPLIT, $-8 + MOVL mode+0(FP), DI + MOVQ new+8(FP), SI + MOVQ old+16(FP), DX + MOVL $83, AX + SYSCALL + RET + +// func now() (sec int64, nsec int32) +TEXT time·now(SB), NOSPLIT, $32 + MOVL $232, AX + MOVQ $0, DI // CLOCK_REALTIME + LEAQ 8(SP), SI + SYSCALL + MOVQ 8(SP), AX // sec + MOVQ 16(SP), DX // nsec + + // sec is in AX, nsec in DX + MOVQ AX, sec+0(FP) + MOVL DX, nsec+8(FP) + RET + +TEXT runtime·nanotime(SB), NOSPLIT, $32 + MOVL $232, AX + // We can use CLOCK_MONOTONIC_FAST here when we drop + // support for FreeBSD 8-STABLE. 
+ MOVQ $4, DI // CLOCK_MONOTONIC + LEAQ 8(SP), SI + SYSCALL + MOVQ 8(SP), AX // sec + MOVQ 16(SP), DX // nsec + + // sec is in AX, nsec in DX + // return nsec in AX + IMULQ $1000000000, AX + ADDQ DX, AX + MOVQ AX, ret+0(FP) + RET + +TEXT runtime·sigaction(SB),NOSPLIT,$-8 + MOVL sig+0(FP), DI // arg 1 sig + MOVQ new+8(FP), SI // arg 2 act + MOVQ old+16(FP), DX // arg 3 oact + MOVL $416, AX + SYSCALL + JCC 2(PC) + MOVL $0xf1, 0xf1 // crash + RET + +TEXT runtime·sigtramp(SB),NOSPLIT,$64 + get_tls(BX) + + // check that g exists + MOVQ g(BX), R10 + CMPQ R10, $0 + JNE 5(PC) + MOVQ DI, 0(SP) + MOVQ $runtime·badsignal(SB), AX + CALL AX + RET + + // save g + MOVQ R10, 40(SP) + + // g = m->signal + MOVQ g_m(R10), BP + MOVQ m_gsignal(BP), BP + MOVQ BP, g(BX) + + MOVQ DI, 0(SP) + MOVQ SI, 8(SP) + MOVQ DX, 16(SP) + MOVQ R10, 24(SP) + + CALL runtime·sighandler(SB) + + // restore g + get_tls(BX) + MOVQ 40(SP), R10 + MOVQ R10, g(BX) + RET + +TEXT runtime·mmap(SB),NOSPLIT,$0 + MOVQ addr+0(FP), DI // arg 1 addr + MOVQ n+8(FP), SI // arg 2 len + MOVL prot+16(FP), DX // arg 3 prot + MOVL flags+20(FP), R10 // arg 4 flags + MOVL fd+24(FP), R8 // arg 5 fid + MOVL off+28(FP), R9 // arg 6 offset + MOVL $477, AX + SYSCALL + MOVQ AX, ret+32(FP) + RET + +TEXT runtime·munmap(SB),NOSPLIT,$0 + MOVQ addr+0(FP), DI // arg 1 addr + MOVQ n+8(FP), SI // arg 2 len + MOVL $73, AX + SYSCALL + JCC 2(PC) + MOVL $0xf1, 0xf1 // crash + RET + +TEXT runtime·madvise(SB),NOSPLIT,$0 + MOVQ addr+0(FP), DI + MOVQ n+8(FP), SI + MOVL flags+16(FP), DX + MOVQ $75, AX // madvise + SYSCALL + // ignore failure - maybe pages are locked + RET + +TEXT runtime·sigaltstack(SB),NOSPLIT,$-8 + MOVQ new+8(SP), DI + MOVQ old+16(SP), SI + MOVQ $53, AX + SYSCALL + JCC 2(PC) + MOVL $0xf1, 0xf1 // crash + RET + +TEXT runtime·usleep(SB),NOSPLIT,$16 + MOVL $0, DX + MOVL usec+0(FP), AX + MOVL $1000000, CX + DIVL CX + MOVQ AX, 0(SP) // tv_sec + MOVL $1000, AX + MULL DX + MOVQ AX, 8(SP) // tv_nsec + + MOVQ SP, DI // arg 1 - rqtp + MOVQ $0, SI // arg 2 - rmtp + MOVL $240, AX // sys_nanosleep + SYSCALL + RET + +// set tls base to DI +TEXT runtime·settls(SB),NOSPLIT,$8 + ADDQ $16, DI // adjust for ELF: wants to use -16(FS) and -8(FS) for g and m + MOVQ DI, 0(SP) + MOVQ SP, SI + MOVQ $129, DI // AMD64_SET_FSBASE + MOVQ $165, AX // sysarch + SYSCALL + JCC 2(PC) + MOVL $0xf1, 0xf1 // crash + RET + +TEXT runtime·sysctl(SB),NOSPLIT,$0 + MOVQ mib+0(FP), DI // arg 1 - name + MOVL miblen+8(FP), SI // arg 2 - namelen + MOVQ out+16(FP), DX // arg 3 - oldp + MOVQ size+24(FP), R10 // arg 4 - oldlenp + MOVQ dst+32(FP), R8 // arg 5 - newp + MOVQ ndst+40(FP), R9 // arg 6 - newlen + MOVQ $202, AX // sys___sysctl + SYSCALL + JCC 4(PC) + NEGQ AX + MOVL AX, ret+48(FP) + RET + MOVL $0, AX + MOVL AX, ret+48(FP) + RET + +TEXT runtime·osyield(SB),NOSPLIT,$-4 + MOVL $331, AX // sys_sched_yield + SYSCALL + RET + +TEXT runtime·sigprocmask(SB),NOSPLIT,$0 + MOVL $3, DI // arg 1 - how (SIG_SETMASK) + MOVQ new+0(FP), SI // arg 2 - set + MOVQ old+8(FP), DX // arg 3 - oset + MOVL $340, AX // sys_sigprocmask + SYSCALL + JAE 2(PC) + MOVL $0xf1, 0xf1 // crash + RET + +// int32 runtime·kqueue(void); +TEXT runtime·kqueue(SB),NOSPLIT,$0 + MOVQ $0, DI + MOVQ $0, SI + MOVQ $0, DX + MOVL $362, AX + SYSCALL + JCC 2(PC) + NEGQ AX + MOVL AX, ret+0(FP) + RET + +// int32 runtime·kevent(int kq, Kevent *changelist, int nchanges, Kevent *eventlist, int nevents, Timespec *timeout); +TEXT runtime·kevent(SB),NOSPLIT,$0 + MOVL fd+0(FP), DI + MOVQ ev1+8(FP), SI + MOVL nev1+16(FP), DX + MOVQ ev2+24(FP), R10 + MOVL 
nev2+32(FP), R8 + MOVQ ts+40(FP), R9 + MOVL $363, AX + SYSCALL + JCC 2(PC) + NEGQ AX + MOVL AX, ret+48(FP) + RET + +// void runtime·closeonexec(int32 fd); +TEXT runtime·closeonexec(SB),NOSPLIT,$0 + MOVL fd+0(FP), DI // fd + MOVQ $2, SI // F_SETFD + MOVQ $1, DX // FD_CLOEXEC + MOVL $92, AX // fcntl + SYSCALL + RET diff --git a/src/runtime/sys_freebsd_arm.s b/src/runtime/sys_freebsd_arm.s new file mode 100644 index 000000000..d875138b6 --- /dev/null +++ b/src/runtime/sys_freebsd_arm.s @@ -0,0 +1,382 @@ +// Copyright 2012 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. +// +// System calls and other sys.stuff for ARM, FreeBSD +// /usr/src/sys/kern/syscalls.master for syscall numbers. +// + +#include "zasm_GOOS_GOARCH.h" +#include "textflag.h" + +// for EABI, as we don't support OABI +#define SYS_BASE 0x0 + +#define SYS_exit (SYS_BASE + 1) +#define SYS_read (SYS_BASE + 3) +#define SYS_write (SYS_BASE + 4) +#define SYS_open (SYS_BASE + 5) +#define SYS_close (SYS_BASE + 6) +#define SYS_sigaltstack (SYS_BASE + 53) +#define SYS_munmap (SYS_BASE + 73) +#define SYS_madvise (SYS_BASE + 75) +#define SYS_setitimer (SYS_BASE + 83) +#define SYS_fcntl (SYS_BASE + 92) +#define SYS_getrlimit (SYS_BASE + 194) +#define SYS___sysctl (SYS_BASE + 202) +#define SYS_nanosleep (SYS_BASE + 240) +#define SYS_clock_gettime (SYS_BASE + 232) +#define SYS_sched_yield (SYS_BASE + 331) +#define SYS_sigprocmask (SYS_BASE + 340) +#define SYS_kqueue (SYS_BASE + 362) +#define SYS_kevent (SYS_BASE + 363) +#define SYS_sigaction (SYS_BASE + 416) +#define SYS_thr_exit (SYS_BASE + 431) +#define SYS_thr_self (SYS_BASE + 432) +#define SYS_thr_kill (SYS_BASE + 433) +#define SYS__umtx_op (SYS_BASE + 454) +#define SYS_thr_new (SYS_BASE + 455) +#define SYS_mmap (SYS_BASE + 477) + +TEXT runtime·sys_umtx_op(SB),NOSPLIT,$0 + MOVW 0(FP), R0 + MOVW 4(FP), R1 + MOVW 8(FP), R2 + MOVW 12(FP), R3 + ADD $20, R13 // arg 5 is passed on stack + MOVW $SYS__umtx_op, R7 + SWI $0 + SUB $20, R13 + // BCS error + MOVW R0, ret+20(FP) + RET + +TEXT runtime·thr_new(SB),NOSPLIT,$0 + MOVW 0(FP), R0 + MOVW 4(FP), R1 + MOVW $SYS_thr_new, R7 + SWI $0 + RET + +TEXT runtime·thr_start(SB),NOSPLIT,$0 + // set up g + MOVW m_g0(R0), g + MOVW R0, g_m(g) + BL runtime·emptyfunc(SB) // fault if stack check is wrong + BL runtime·mstart(SB) + + MOVW $2, R8 // crash (not reached) + MOVW R8, (R8) + RET + +// Exit the entire program (like C exit) +TEXT runtime·exit(SB),NOSPLIT,$-8 + MOVW 0(FP), R0 // arg 1 exit status + MOVW $SYS_exit, R7 + SWI $0 + MOVW.CS $0, R8 // crash on syscall failure + MOVW.CS R8, (R8) + RET + +TEXT runtime·exit1(SB),NOSPLIT,$-8 + MOVW 0(FP), R0 // arg 1 exit status + MOVW $SYS_thr_exit, R7 + SWI $0 + MOVW.CS $0, R8 // crash on syscall failure + MOVW.CS R8, (R8) + RET + +TEXT runtime·open(SB),NOSPLIT,$-8 + MOVW 0(FP), R0 // arg 1 name + MOVW 4(FP), R1 // arg 2 mode + MOVW 8(FP), R2 // arg 3 perm + MOVW $SYS_open, R7 + SWI $0 + MOVW R0, ret+12(FP) + RET + +TEXT runtime·read(SB),NOSPLIT,$-8 + MOVW 0(FP), R0 // arg 1 fd + MOVW 4(FP), R1 // arg 2 buf + MOVW 8(FP), R2 // arg 3 count + MOVW $SYS_read, R7 + SWI $0 + MOVW R0, ret+12(FP) + RET + +TEXT runtime·write(SB),NOSPLIT,$-8 + MOVW 0(FP), R0 // arg 1 fd + MOVW 4(FP), R1 // arg 2 buf + MOVW 8(FP), R2 // arg 3 count + MOVW $SYS_write, R7 + SWI $0 + MOVW R0, ret+12(FP) + RET + +TEXT runtime·close(SB),NOSPLIT,$-8 + MOVW 0(FP), R0 // arg 1 fd + MOVW $SYS_close, R7 + SWI $0 + MOVW R0, ret+4(FP) + RET + +TEXT 
runtime·getrlimit(SB),NOSPLIT,$-8 + MOVW 0(FP), R0 + MOVW 4(FP), R1 + MOVW $SYS_getrlimit, R7 + SWI $0 + MOVW R0, ret+8(FP) + RET + +TEXT runtime·raise(SB),NOSPLIT,$8 + // thr_self(&4(R13)) + MOVW $4(R13), R0 // arg 1 &4(R13) + MOVW $SYS_thr_self, R7 + SWI $0 + // thr_kill(self, SIGPIPE) + MOVW 4(R13), R0 // arg 1 id + MOVW sig+0(FP), R1 // arg 2 - signal + MOVW $SYS_thr_kill, R7 + SWI $0 + RET + +TEXT runtime·setitimer(SB), NOSPLIT, $-8 + MOVW 0(FP), R0 + MOVW 4(FP), R1 + MOVW 8(FP), R2 + MOVW $SYS_setitimer, R7 + SWI $0 + RET + +// func now() (sec int64, nsec int32) +TEXT time·now(SB), NOSPLIT, $32 + MOVW $0, R0 // CLOCK_REALTIME + MOVW $8(R13), R1 + MOVW $SYS_clock_gettime, R7 + SWI $0 + + MOVW 8(R13), R0 // sec.low + MOVW 12(R13), R1 // sec.high + MOVW 16(R13), R2 // nsec + + MOVW R0, 0(FP) + MOVW R1, 4(FP) + MOVW R2, 8(FP) + RET + +// int64 nanotime(void) so really +// void nanotime(int64 *nsec) +TEXT runtime·nanotime(SB), NOSPLIT, $32 + // We can use CLOCK_MONOTONIC_FAST here when we drop + // support for FreeBSD 8-STABLE. + MOVW $4, R0 // CLOCK_MONOTONIC + MOVW $8(R13), R1 + MOVW $SYS_clock_gettime, R7 + SWI $0 + + MOVW 8(R13), R0 // sec.low + MOVW 12(R13), R4 // sec.high + MOVW 16(R13), R2 // nsec + + MOVW $1000000000, R3 + MULLU R0, R3, (R1, R0) + MUL R3, R4 + ADD.S R2, R0 + ADC R4, R1 + + MOVW R0, ret_lo+0(FP) + MOVW R1, ret_hi+4(FP) + RET + +TEXT runtime·sigaction(SB),NOSPLIT,$-8 + MOVW 0(FP), R0 // arg 1 sig + MOVW 4(FP), R1 // arg 2 act + MOVW 8(FP), R2 // arg 3 oact + MOVW $SYS_sigaction, R7 + SWI $0 + MOVW.CS $0, R8 // crash on syscall failure + MOVW.CS R8, (R8) + RET + +TEXT runtime·sigtramp(SB),NOSPLIT,$24 + // this might be called in external code context, + // where g is not set. + // first save R0, because runtime·load_g will clobber it + MOVW R0, 4(R13) // signum + MOVB runtime·iscgo(SB), R0 + CMP $0, R0 + BL.NE runtime·load_g(SB) + + CMP $0, g + BNE 4(PC) + // signal number is already prepared in 4(R13) + MOVW $runtime·badsignal(SB), R11 + BL (R11) + RET + + // save g + MOVW g, R4 + MOVW g, 20(R13) + + // g = m->signal + MOVW g_m(g), R8 + MOVW m_gsignal(R8), g + + // R0 is already saved + MOVW R1, 8(R13) // info + MOVW R2, 12(R13) // context + MOVW R4, 16(R13) // oldg + + BL runtime·sighandler(SB) + + // restore g + MOVW 20(R13), g + RET + +TEXT runtime·mmap(SB),NOSPLIT,$16 + MOVW 0(FP), R0 // arg 1 addr + MOVW 4(FP), R1 // arg 2 len + MOVW 8(FP), R2 // arg 3 prot + MOVW 12(FP), R3 // arg 4 flags + // arg 5 (fid) and arg6 (offset_lo, offset_hi) are passed on stack + // note the C runtime only passes the 32-bit offset_lo to us + MOVW 16(FP), R4 // arg 5 + MOVW R4, 4(R13) + MOVW 20(FP), R5 // arg 6 lower 32-bit + // the word at 8(R13) is skipped due to 64-bit argument alignment. + MOVW R5, 12(R13) + MOVW $0, R6 // higher 32-bit for arg 6 + MOVW R6, 16(R13) + ADD $4, R13 + MOVW $SYS_mmap, R7 + SWI $0 + SUB $4, R13 + // TODO(dfc) error checking ? 
+ MOVW R0, ret+24(FP) + RET + +TEXT runtime·munmap(SB),NOSPLIT,$0 + MOVW 0(FP), R0 // arg 1 addr + MOVW 4(FP), R1 // arg 2 len + MOVW $SYS_munmap, R7 + SWI $0 + MOVW.CS $0, R8 // crash on syscall failure + MOVW.CS R8, (R8) + RET + +TEXT runtime·madvise(SB),NOSPLIT,$0 + MOVW 0(FP), R0 // arg 1 addr + MOVW 4(FP), R1 // arg 2 len + MOVW 8(FP), R2 // arg 3 flags + MOVW $SYS_madvise, R7 + SWI $0 + // ignore failure - maybe pages are locked + RET + +TEXT runtime·sigaltstack(SB),NOSPLIT,$-8 + MOVW new+0(FP), R0 + MOVW old+4(FP), R1 + MOVW $SYS_sigaltstack, R7 + SWI $0 + MOVW.CS $0, R8 // crash on syscall failure + MOVW.CS R8, (R8) + RET + +TEXT runtime·usleep(SB),NOSPLIT,$16 + MOVW usec+0(FP), R0 + MOVW R0, R2 + MOVW $1000000, R1 + DIV R1, R0 + // 0(R13) is the saved LR, don't use it + MOVW R0, 4(R13) // tv_sec.low + MOVW $0, R0 + MOVW R0, 8(R13) // tv_sec.high + MOD R1, R2 + MOVW $1000, R1 + MUL R1, R2 + MOVW R2, 12(R13) // tv_nsec + + MOVW $4(R13), R0 // arg 1 - rqtp + MOVW $0, R1 // arg 2 - rmtp + MOVW $SYS_nanosleep, R7 + SWI $0 + RET + +TEXT runtime·sysctl(SB),NOSPLIT,$0 + MOVW 0(FP), R0 // arg 1 - name + MOVW 4(FP), R1 // arg 2 - namelen + MOVW 8(FP), R2 // arg 3 - old + MOVW 12(FP), R3 // arg 4 - oldlenp + // arg 5 (newp) and arg 6 (newlen) are passed on stack + ADD $20, R13 + MOVW $SYS___sysctl, R7 + SWI $0 + SUB.CS $0, R0, R0 + SUB $20, R13 + MOVW R0, ret+24(FP) + RET + +TEXT runtime·osyield(SB),NOSPLIT,$-4 + MOVW $SYS_sched_yield, R7 + SWI $0 + RET + +TEXT runtime·sigprocmask(SB),NOSPLIT,$0 + MOVW $3, R0 // arg 1 - how (SIG_SETMASK) + MOVW 0(FP), R1 // arg 2 - set + MOVW 4(FP), R2 // arg 3 - oset + MOVW $SYS_sigprocmask, R7 + SWI $0 + MOVW.CS $0, R8 // crash on syscall failure + MOVW.CS R8, (R8) + RET + +// int32 runtime·kqueue(void) +TEXT runtime·kqueue(SB),NOSPLIT,$0 + MOVW $SYS_kqueue, R7 + SWI $0 + RSB.CS $0, R0 + MOVW R0, ret+0(FP) + RET + +// int32 runtime·kevent(int kq, Kevent *changelist, int nchanges, Kevent *eventlist, int nevents, Timespec *timeout) +TEXT runtime·kevent(SB),NOSPLIT,$0 + MOVW 0(FP), R0 // kq + MOVW 4(FP), R1 // changelist + MOVW 8(FP), R2 // nchanges + MOVW 12(FP), R3 // eventlist + ADD $20, R13 // pass arg 5 and 6 on stack + MOVW $SYS_kevent, R7 + SWI $0 + RSB.CS $0, R0 + SUB $20, R13 + MOVW R0, ret+24(FP) + RET + +// void runtime·closeonexec(int32 fd) +TEXT runtime·closeonexec(SB),NOSPLIT,$0 + MOVW 0(FP), R0 // fd + MOVW $2, R1 // F_SETFD + MOVW $1, R2 // FD_CLOEXEC + MOVW $SYS_fcntl, R7 + SWI $0 + RET + +TEXT runtime·casp(SB),NOSPLIT,$0 + B runtime·cas(SB) + +// TODO(minux): this is only valid for ARMv6+ +// bool armcas(int32 *val, int32 old, int32 new) +// Atomically: +// if(*val == old){ +// *val = new; +// return 1; +// }else +// return 0; +TEXT runtime·cas(SB),NOSPLIT,$0 + B runtime·armcas(SB) + +// TODO(minux): this only supports ARMv6K+. +TEXT runtime·read_tls_fallback(SB),NOSPLIT,$-4 + WORD $0xee1d0f70 // mrc p15, 0, r0, c13, c0, 3 + RET diff --git a/src/runtime/sys_linux_386.s b/src/runtime/sys_linux_386.s new file mode 100644 index 000000000..0f6d4bbb5 --- /dev/null +++ b/src/runtime/sys_linux_386.s @@ -0,0 +1,489 @@ +// Copyright 2009 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. 
+ +// +// System calls and other sys.stuff for 386, Linux +// + +#include "zasm_GOOS_GOARCH.h" +#include "textflag.h" + +TEXT runtime·exit(SB),NOSPLIT,$0 + MOVL $252, AX // syscall number + MOVL code+0(FP), BX + CALL *runtime·_vdso(SB) + INT $3 // not reached + RET + +TEXT runtime·exit1(SB),NOSPLIT,$0 + MOVL $1, AX // exit - exit the current os thread + MOVL code+0(FP), BX + CALL *runtime·_vdso(SB) + INT $3 // not reached + RET + +TEXT runtime·open(SB),NOSPLIT,$0 + MOVL $5, AX // syscall - open + MOVL name+0(FP), BX + MOVL mode+4(FP), CX + MOVL perm+8(FP), DX + CALL *runtime·_vdso(SB) + MOVL AX, ret+12(FP) + RET + +TEXT runtime·close(SB),NOSPLIT,$0 + MOVL $6, AX // syscall - close + MOVL fd+0(FP), BX + CALL *runtime·_vdso(SB) + MOVL AX, ret+4(FP) + RET + +TEXT runtime·write(SB),NOSPLIT,$0 + MOVL $4, AX // syscall - write + MOVL fd+0(FP), BX + MOVL p+4(FP), CX + MOVL n+8(FP), DX + CALL *runtime·_vdso(SB) + MOVL AX, ret+12(FP) + RET + +TEXT runtime·read(SB),NOSPLIT,$0 + MOVL $3, AX // syscall - read + MOVL fd+0(FP), BX + MOVL p+4(FP), CX + MOVL n+8(FP), DX + CALL *runtime·_vdso(SB) + MOVL AX, ret+12(FP) + RET + +TEXT runtime·getrlimit(SB),NOSPLIT,$0 + MOVL $191, AX // syscall - ugetrlimit + MOVL kind+0(FP), BX + MOVL limit+4(FP), CX + CALL *runtime·_vdso(SB) + MOVL AX, ret+8(FP) + RET + +TEXT runtime·usleep(SB),NOSPLIT,$8 + MOVL $0, DX + MOVL usec+0(FP), AX + MOVL $1000000, CX + DIVL CX + MOVL AX, 0(SP) + MOVL DX, 4(SP) + + // select(0, 0, 0, 0, &tv) + MOVL $142, AX + MOVL $0, BX + MOVL $0, CX + MOVL $0, DX + MOVL $0, SI + LEAL 0(SP), DI + CALL *runtime·_vdso(SB) + RET + +TEXT runtime·raise(SB),NOSPLIT,$12 + MOVL $224, AX // syscall - gettid + CALL *runtime·_vdso(SB) + MOVL AX, BX // arg 1 tid + MOVL sig+0(FP), CX // arg 2 signal + MOVL $238, AX // syscall - tkill + CALL *runtime·_vdso(SB) + RET + +TEXT runtime·setitimer(SB),NOSPLIT,$0-12 + MOVL $104, AX // syscall - setitimer + MOVL mode+0(FP), BX + MOVL new+4(FP), CX + MOVL old+8(FP), DX + CALL *runtime·_vdso(SB) + RET + +TEXT runtime·mincore(SB),NOSPLIT,$0-16 + MOVL $218, AX // syscall - mincore + MOVL addr+0(FP), BX + MOVL n+4(FP), CX + MOVL dst+8(FP), DX + CALL *runtime·_vdso(SB) + MOVL AX, ret+12(FP) + RET + +// func now() (sec int64, nsec int32) +TEXT time·now(SB), NOSPLIT, $32 + MOVL $265, AX // syscall - clock_gettime + MOVL $0, BX // CLOCK_REALTIME + LEAL 8(SP), CX + MOVL $0, DX + CALL *runtime·_vdso(SB) + MOVL 8(SP), AX // sec + MOVL 12(SP), BX // nsec + + // sec is in AX, nsec in BX + MOVL AX, sec+0(FP) + MOVL $0, sec+4(FP) + MOVL BX, nsec+8(FP) + RET + +// int64 nanotime(void) so really +// void nanotime(int64 *nsec) +TEXT runtime·nanotime(SB), NOSPLIT, $32 + MOVL $265, AX // syscall - clock_gettime + MOVL $1, BX // CLOCK_MONOTONIC + LEAL 8(SP), CX + MOVL $0, DX + CALL *runtime·_vdso(SB) + MOVL 8(SP), AX // sec + MOVL 12(SP), BX // nsec + + // sec is in AX, nsec in BX + // convert to DX:AX nsec + MOVL $1000000000, CX + MULL CX + ADDL BX, AX + ADCL $0, DX + + MOVL AX, ret_lo+0(FP) + MOVL DX, ret_hi+4(FP) + RET + +TEXT runtime·rtsigprocmask(SB),NOSPLIT,$0 + MOVL $175, AX // syscall entry + MOVL sig+0(FP), BX + MOVL new+4(FP), CX + MOVL old+8(FP), DX + MOVL size+12(FP), SI + CALL *runtime·_vdso(SB) + CMPL AX, $0xfffff001 + JLS 2(PC) + INT $3 + RET + +TEXT runtime·rt_sigaction(SB),NOSPLIT,$0 + MOVL $174, AX // syscall - rt_sigaction + MOVL sig+0(FP), BX + MOVL new+4(FP), CX + MOVL old+8(FP), DX + MOVL size+12(FP), SI + CALL *runtime·_vdso(SB) + MOVL AX, ret+16(FP) + RET + +TEXT runtime·sigtramp(SB),NOSPLIT,$44 + get_tls(CX) + + // 
check that g exists + MOVL g(CX), DI + CMPL DI, $0 + JNE 6(PC) + MOVL sig+0(FP), BX + MOVL BX, 0(SP) + MOVL $runtime·badsignal(SB), AX + CALL AX + RET + + // save g + MOVL DI, 20(SP) + + // g = m->gsignal + MOVL g_m(DI), BX + MOVL m_gsignal(BX), BX + MOVL BX, g(CX) + + // copy arguments for call to sighandler + MOVL sig+0(FP), BX + MOVL BX, 0(SP) + MOVL info+4(FP), BX + MOVL BX, 4(SP) + MOVL context+8(FP), BX + MOVL BX, 8(SP) + MOVL DI, 12(SP) + + CALL runtime·sighandler(SB) + + // restore g + get_tls(CX) + MOVL 20(SP), BX + MOVL BX, g(CX) + + RET + +TEXT runtime·sigreturn(SB),NOSPLIT,$0 + MOVL $173, AX // rt_sigreturn + // Sigreturn expects same SP as signal handler, + // so cannot CALL *runtime._vsdo(SB) here. + INT $0x80 + INT $3 // not reached + RET + +TEXT runtime·mmap(SB),NOSPLIT,$0 + MOVL $192, AX // mmap2 + MOVL addr+0(FP), BX + MOVL n+4(FP), CX + MOVL prot+8(FP), DX + MOVL flags+12(FP), SI + MOVL fd+16(FP), DI + MOVL off+20(FP), BP + SHRL $12, BP + CALL *runtime·_vdso(SB) + CMPL AX, $0xfffff001 + JLS 3(PC) + NOTL AX + INCL AX + MOVL AX, ret+24(FP) + RET + +TEXT runtime·munmap(SB),NOSPLIT,$0 + MOVL $91, AX // munmap + MOVL addr+0(FP), BX + MOVL n+4(FP), CX + CALL *runtime·_vdso(SB) + CMPL AX, $0xfffff001 + JLS 2(PC) + INT $3 + RET + +TEXT runtime·madvise(SB),NOSPLIT,$0 + MOVL $219, AX // madvise + MOVL addr+0(FP), BX + MOVL n+4(FP), CX + MOVL flags+8(FP), DX + CALL *runtime·_vdso(SB) + // ignore failure - maybe pages are locked + RET + +// int32 futex(int32 *uaddr, int32 op, int32 val, +// struct timespec *timeout, int32 *uaddr2, int32 val2); +TEXT runtime·futex(SB),NOSPLIT,$0 + MOVL $240, AX // futex + MOVL addr+0(FP), BX + MOVL op+4(FP), CX + MOVL val+8(FP), DX + MOVL ts+12(FP), SI + MOVL addr2+16(FP), DI + MOVL val3+20(FP), BP + CALL *runtime·_vdso(SB) + MOVL AX, ret+24(FP) + RET + +// int32 clone(int32 flags, void *stack, M *mp, G *gp, void (*fn)(void)); +TEXT runtime·clone(SB),NOSPLIT,$0 + MOVL $120, AX // clone + MOVL flags+4(SP), BX + MOVL stack+8(SP), CX + MOVL $0, DX // parent tid ptr + MOVL $0, DI // child tid ptr + + // Copy mp, gp, fn off parent stack for use by child. + SUBL $16, CX + MOVL mm+12(SP), SI + MOVL SI, 0(CX) + MOVL gg+16(SP), SI + MOVL SI, 4(CX) + MOVL fn+20(SP), SI + MOVL SI, 8(CX) + MOVL $1234, 12(CX) + + // cannot use CALL *runtime·_vdso(SB) here, because + // the stack changes during the system call (after + // CALL *runtime·_vdso(SB), the child is still using + // the parent's stack when executing its RET instruction). + INT $0x80 + + // In parent, return. + CMPL AX, $0 + JEQ 3(PC) + MOVL AX, ret+20(FP) + RET + + // Paranoia: check that SP is as we expect. + MOVL mm+8(FP), BP + CMPL BP, $1234 + JEQ 2(PC) + INT $3 + + // Initialize AX to Linux tid + MOVL $224, AX + CALL *runtime·_vdso(SB) + + // In child on new stack. Reload registers (paranoia). + MOVL 0(SP), BX // m + MOVL flags+0(FP), DX // g + MOVL stk+4(FP), SI // fn + + MOVL AX, m_procid(BX) // save tid as m->procid + + // set up ldt 7+id to point at m->tls. + // newosproc left the id in tls[0]. + LEAL m_tls(BX), BP + MOVL 0(BP), DI + ADDL $7, DI // m0 is LDT#7. count up. + // setldt(tls#, &tls, sizeof tls) + PUSHAL // save registers + PUSHL $32 // sizeof tls + PUSHL BP // &tls + PUSHL DI // tls # + CALL runtime·setldt(SB) + POPL AX + POPL AX + POPL AX + POPAL + + // Now segment is established. Initialize m, g. 
+ get_tls(AX) + MOVL DX, g(AX) + MOVL BX, g_m(DX) + + CALL runtime·stackcheck(SB) // smashes AX, CX + MOVL 0(DX), DX // paranoia; check they are not nil + MOVL 0(BX), BX + + // more paranoia; check that stack splitting code works + PUSHAL + CALL runtime·emptyfunc(SB) + POPAL + + CALL SI // fn() + CALL runtime·exit1(SB) + MOVL $0x1234, 0x1005 + +TEXT runtime·sigaltstack(SB),NOSPLIT,$-8 + MOVL $186, AX // sigaltstack + MOVL new+4(SP), BX + MOVL old+8(SP), CX + CALL *runtime·_vdso(SB) + CMPL AX, $0xfffff001 + JLS 2(PC) + INT $3 + RET + +// <asm-i386/ldt.h> +// struct user_desc { +// unsigned int entry_number; +// unsigned long base_addr; +// unsigned int limit; +// unsigned int seg_32bit:1; +// unsigned int contents:2; +// unsigned int read_exec_only:1; +// unsigned int limit_in_pages:1; +// unsigned int seg_not_present:1; +// unsigned int useable:1; +// }; +#define SEG_32BIT 0x01 +// contents are the 2 bits 0x02 and 0x04. +#define CONTENTS_DATA 0x00 +#define CONTENTS_STACK 0x02 +#define CONTENTS_CODE 0x04 +#define READ_EXEC_ONLY 0x08 +#define LIMIT_IN_PAGES 0x10 +#define SEG_NOT_PRESENT 0x20 +#define USEABLE 0x40 + +// setldt(int entry, int address, int limit) +TEXT runtime·setldt(SB),NOSPLIT,$32 + MOVL entry+0(FP), BX // entry + MOVL address+4(FP), CX // base address + + /* + * When linking against the system libraries, + * we use its pthread_create and let it set up %gs + * for us. When we do that, the private storage + * we get is not at 0(GS), 4(GS), but -8(GS), -4(GS). + * To insulate the rest of the tool chain from this + * ugliness, 8l rewrites 0(TLS) into -8(GS) for us. + * To accommodate that rewrite, we translate + * the address here and bump the limit to 0xffffffff (no limit) + * so that -8(GS) maps to 0(address). + * Also, the final 0(GS) (current 8(CX)) has to point + * to itself, to mimic ELF. 
+ */ + ADDL $0x8, CX // address + MOVL CX, 0(CX) + + // set up user_desc + LEAL 16(SP), AX // struct user_desc + MOVL BX, 0(AX) + MOVL CX, 4(AX) + MOVL $0xfffff, 8(AX) + MOVL $(SEG_32BIT|LIMIT_IN_PAGES|USEABLE|CONTENTS_DATA), 12(AX) // flag bits + + // call modify_ldt + MOVL $1, BX // func = 1 (write) + MOVL AX, CX // user_desc + MOVL $16, DX // sizeof(user_desc) + MOVL $123, AX // syscall - modify_ldt + CALL *runtime·_vdso(SB) + + // breakpoint on error + CMPL AX, $0xfffff001 + JLS 2(PC) + INT $3 + + // compute segment selector - (entry*8+7) + MOVL entry+0(FP), AX + SHLL $3, AX + ADDL $7, AX + MOVW AX, GS + + RET + +TEXT runtime·osyield(SB),NOSPLIT,$0 + MOVL $158, AX + CALL *runtime·_vdso(SB) + RET + +TEXT runtime·sched_getaffinity(SB),NOSPLIT,$0 + MOVL $242, AX // syscall - sched_getaffinity + MOVL pid+0(FP), BX + MOVL len+4(FP), CX + MOVL buf+8(FP), DX + CALL *runtime·_vdso(SB) + MOVL AX, ret+12(FP) + RET + +// int32 runtime·epollcreate(int32 size); +TEXT runtime·epollcreate(SB),NOSPLIT,$0 + MOVL $254, AX + MOVL size+0(FP), BX + CALL *runtime·_vdso(SB) + MOVL AX, ret+4(FP) + RET + +// int32 runtime·epollcreate1(int32 flags); +TEXT runtime·epollcreate1(SB),NOSPLIT,$0 + MOVL $329, AX + MOVL flags+0(FP), BX + CALL *runtime·_vdso(SB) + MOVL AX, ret+4(FP) + RET + +// func epollctl(epfd, op, fd int32, ev *epollEvent) int +TEXT runtime·epollctl(SB),NOSPLIT,$0 + MOVL $255, AX + MOVL epfd+0(FP), BX + MOVL op+4(FP), CX + MOVL fd+8(FP), DX + MOVL ev+12(FP), SI + CALL *runtime·_vdso(SB) + MOVL AX, ret+16(FP) + RET + +// int32 runtime·epollwait(int32 epfd, EpollEvent *ev, int32 nev, int32 timeout); +TEXT runtime·epollwait(SB),NOSPLIT,$0 + MOVL $256, AX + MOVL epfd+0(FP), BX + MOVL ev+4(FP), CX + MOVL nev+8(FP), DX + MOVL timeout+12(FP), SI + CALL *runtime·_vdso(SB) + MOVL AX, ret+16(FP) + RET + +// void runtime·closeonexec(int32 fd); +TEXT runtime·closeonexec(SB),NOSPLIT,$0 + MOVL $55, AX // fcntl + MOVL fd+0(FP), BX // fd + MOVL $2, CX // F_SETFD + MOVL $1, DX // FD_CLOEXEC + CALL *runtime·_vdso(SB) + RET diff --git a/src/runtime/sys_linux_amd64.s b/src/runtime/sys_linux_amd64.s new file mode 100644 index 000000000..33b91e872 --- /dev/null +++ b/src/runtime/sys_linux_amd64.s @@ -0,0 +1,410 @@ +// Copyright 2009 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. 
+ +// +// System calls and other sys.stuff for AMD64, Linux +// + +#include "zasm_GOOS_GOARCH.h" +#include "textflag.h" + +TEXT runtime·exit(SB),NOSPLIT,$0-4 + MOVL code+0(FP), DI + MOVL $231, AX // exitgroup - force all os threads to exit + SYSCALL + RET + +TEXT runtime·exit1(SB),NOSPLIT,$0-4 + MOVL code+0(FP), DI + MOVL $60, AX // exit - exit the current os thread + SYSCALL + RET + +TEXT runtime·open(SB),NOSPLIT,$0-20 + MOVQ name+0(FP), DI + MOVL mode+8(FP), SI + MOVL perm+12(FP), DX + MOVL $2, AX // syscall entry + SYSCALL + MOVL AX, ret+16(FP) + RET + +TEXT runtime·close(SB),NOSPLIT,$0-12 + MOVL fd+0(FP), DI + MOVL $3, AX // syscall entry + SYSCALL + MOVL AX, ret+8(FP) + RET + +TEXT runtime·write(SB),NOSPLIT,$0-28 + MOVQ fd+0(FP), DI + MOVQ p+8(FP), SI + MOVL n+16(FP), DX + MOVL $1, AX // syscall entry + SYSCALL + MOVL AX, ret+24(FP) + RET + +TEXT runtime·read(SB),NOSPLIT,$0-28 + MOVL fd+0(FP), DI + MOVQ p+8(FP), SI + MOVL n+16(FP), DX + MOVL $0, AX // syscall entry + SYSCALL + MOVL AX, ret+24(FP) + RET + +TEXT runtime·getrlimit(SB),NOSPLIT,$0-20 + MOVL kind+0(FP), DI + MOVQ limit+8(FP), SI + MOVL $97, AX // syscall entry + SYSCALL + MOVL AX, ret+16(FP) + RET + +TEXT runtime·usleep(SB),NOSPLIT,$16 + MOVL $0, DX + MOVL usec+0(FP), AX + MOVL $1000000, CX + DIVL CX + MOVQ AX, 0(SP) + MOVQ DX, 8(SP) + + // select(0, 0, 0, 0, &tv) + MOVL $0, DI + MOVL $0, SI + MOVL $0, DX + MOVL $0, R10 + MOVQ SP, R8 + MOVL $23, AX + SYSCALL + RET + +TEXT runtime·raise(SB),NOSPLIT,$0 + MOVL $186, AX // syscall - gettid + SYSCALL + MOVL AX, DI // arg 1 tid + MOVL sig+0(FP), SI // arg 2 + MOVL $200, AX // syscall - tkill + SYSCALL + RET + +TEXT runtime·setitimer(SB),NOSPLIT,$0-24 + MOVL mode+0(FP), DI + MOVQ new+8(FP), SI + MOVQ old+16(FP), DX + MOVL $38, AX // syscall entry + SYSCALL + RET + +TEXT runtime·mincore(SB),NOSPLIT,$0-28 + MOVQ addr+0(FP), DI + MOVQ n+8(FP), SI + MOVQ dst+16(FP), DX + MOVL $27, AX // syscall entry + SYSCALL + MOVL AX, ret+24(FP) + RET + +// func now() (sec int64, nsec int32) +TEXT time·now(SB),NOSPLIT,$16 + // Be careful. We're calling a function with gcc calling convention here. + // We're guaranteed 128 bytes on entry, and we've taken 16, and the + // call uses another 8. + // That leaves 104 for the gettime code to use. Hope that's enough! + MOVQ runtime·__vdso_clock_gettime_sym(SB), AX + CMPQ AX, $0 + JEQ fallback_gtod + MOVL $0, DI // CLOCK_REALTIME + LEAQ 0(SP), SI + CALL AX + MOVQ 0(SP), AX // sec + MOVQ 8(SP), DX // nsec + MOVQ AX, sec+0(FP) + MOVL DX, nsec+8(FP) + RET +fallback_gtod: + LEAQ 0(SP), DI + MOVQ $0, SI + MOVQ runtime·__vdso_gettimeofday_sym(SB), AX + CALL AX + MOVQ 0(SP), AX // sec + MOVL 8(SP), DX // usec + IMULQ $1000, DX + MOVQ AX, sec+0(FP) + MOVL DX, nsec+8(FP) + RET + +TEXT runtime·nanotime(SB),NOSPLIT,$16 + // Duplicate time.now here to avoid using up precious stack space. + // See comment above in time.now. 
+ MOVQ runtime·__vdso_clock_gettime_sym(SB), AX + CMPQ AX, $0 + JEQ fallback_gtod_nt + MOVL $1, DI // CLOCK_MONOTONIC + LEAQ 0(SP), SI + CALL AX + MOVQ 0(SP), AX // sec + MOVQ 8(SP), DX // nsec + // sec is in AX, nsec in DX + // return nsec in AX + IMULQ $1000000000, AX + ADDQ DX, AX + MOVQ AX, ret+0(FP) + RET +fallback_gtod_nt: + LEAQ 0(SP), DI + MOVQ $0, SI + MOVQ runtime·__vdso_gettimeofday_sym(SB), AX + CALL AX + MOVQ 0(SP), AX // sec + MOVL 8(SP), DX // usec + IMULQ $1000, DX + // sec is in AX, nsec in DX + // return nsec in AX + IMULQ $1000000000, AX + ADDQ DX, AX + MOVQ AX, ret+0(FP) + RET + +TEXT runtime·rtsigprocmask(SB),NOSPLIT,$0-28 + MOVL sig+0(FP), DI + MOVQ new+8(FP), SI + MOVQ old+16(FP), DX + MOVL size+24(FP), R10 + MOVL $14, AX // syscall entry + SYSCALL + CMPQ AX, $0xfffffffffffff001 + JLS 2(PC) + MOVL $0xf1, 0xf1 // crash + RET + +TEXT runtime·rt_sigaction(SB),NOSPLIT,$0-36 + MOVQ sig+0(FP), DI + MOVQ new+8(FP), SI + MOVQ old+16(FP), DX + MOVQ size+24(FP), R10 + MOVL $13, AX // syscall entry + SYSCALL + MOVL AX, ret+32(FP) + RET + +TEXT runtime·sigtramp(SB),NOSPLIT,$64 + get_tls(BX) + + // check that g exists + MOVQ g(BX), R10 + CMPQ R10, $0 + JNE 5(PC) + MOVQ DI, 0(SP) + MOVQ $runtime·badsignal(SB), AX + CALL AX + RET + + // save g + MOVQ R10, 40(SP) + + // g = m->gsignal + MOVQ g_m(R10), BP + MOVQ m_gsignal(BP), BP + MOVQ BP, g(BX) + + MOVQ DI, 0(SP) + MOVQ SI, 8(SP) + MOVQ DX, 16(SP) + MOVQ R10, 24(SP) + + CALL runtime·sighandler(SB) + + // restore g + get_tls(BX) + MOVQ 40(SP), R10 + MOVQ R10, g(BX) + RET + +TEXT runtime·sigreturn(SB),NOSPLIT,$0 + MOVL $15, AX // rt_sigreturn + SYSCALL + INT $3 // not reached + +TEXT runtime·mmap(SB),NOSPLIT,$0 + MOVQ addr+0(FP), DI + MOVQ n+8(FP), SI + MOVL prot+16(FP), DX + MOVL flags+20(FP), R10 + MOVL fd+24(FP), R8 + MOVL off+28(FP), R9 + + MOVL $9, AX // mmap + SYSCALL + CMPQ AX, $0xfffffffffffff001 + JLS 3(PC) + NOTQ AX + INCQ AX + MOVQ AX, ret+32(FP) + RET + +TEXT runtime·munmap(SB),NOSPLIT,$0 + MOVQ addr+0(FP), DI + MOVQ n+8(FP), SI + MOVQ $11, AX // munmap + SYSCALL + CMPQ AX, $0xfffffffffffff001 + JLS 2(PC) + MOVL $0xf1, 0xf1 // crash + RET + +TEXT runtime·madvise(SB),NOSPLIT,$0 + MOVQ addr+0(FP), DI + MOVQ n+8(FP), SI + MOVL flags+16(FP), DX + MOVQ $28, AX // madvise + SYSCALL + // ignore failure - maybe pages are locked + RET + +// int64 futex(int32 *uaddr, int32 op, int32 val, +// struct timespec *timeout, int32 *uaddr2, int32 val2); +TEXT runtime·futex(SB),NOSPLIT,$0 + MOVQ addr+0(FP), DI + MOVL op+8(FP), SI + MOVL val+12(FP), DX + MOVQ ts+16(FP), R10 + MOVQ addr2+24(FP), R8 + MOVL val3+32(FP), R9 + MOVL $202, AX + SYSCALL + MOVL AX, ret+40(FP) + RET + +// int32 clone(int32 flags, void *stack, M *mp, G *gp, void (*fn)(void)); +TEXT runtime·clone(SB),NOSPLIT,$0 + MOVL flags+8(SP), DI + MOVQ stack+16(SP), SI + + // Copy mp, gp, fn off parent stack for use by child. + // Careful: Linux system call clobbers CX and R11. + MOVQ mm+24(SP), R8 + MOVQ gg+32(SP), R9 + MOVQ fn+40(SP), R12 + + MOVL $56, AX + SYSCALL + + // In parent, return. + CMPQ AX, $0 + JEQ 3(PC) + MOVL AX, ret+40(FP) + RET + + // In child, on new stack. + MOVQ SI, SP + + // Initialize m->procid to Linux tid + MOVL $186, AX // gettid + SYSCALL + MOVQ AX, m_procid(R8) + + // Set FS to point at m->tls. + LEAQ m_tls(R8), DI + CALL runtime·settls(SB) + + // In child, set up new stack + get_tls(CX) + MOVQ R8, g_m(R9) + MOVQ R9, g(CX) + CALL runtime·stackcheck(SB) + + // Call fn + CALL R12 + + // It shouldn't return. 
If it does, exit + MOVL $111, DI + MOVL $60, AX + SYSCALL + JMP -3(PC) // keep exiting + +TEXT runtime·sigaltstack(SB),NOSPLIT,$-8 + MOVQ new+8(SP), DI + MOVQ old+16(SP), SI + MOVQ $131, AX + SYSCALL + CMPQ AX, $0xfffffffffffff001 + JLS 2(PC) + MOVL $0xf1, 0xf1 // crash + RET + +// set tls base to DI +TEXT runtime·settls(SB),NOSPLIT,$32 + ADDQ $16, DI // ELF wants to use -16(FS), -8(FS) + + MOVQ DI, SI + MOVQ $0x1002, DI // ARCH_SET_FS + MOVQ $158, AX // arch_prctl + SYSCALL + CMPQ AX, $0xfffffffffffff001 + JLS 2(PC) + MOVL $0xf1, 0xf1 // crash + RET + +TEXT runtime·osyield(SB),NOSPLIT,$0 + MOVL $24, AX + SYSCALL + RET + +TEXT runtime·sched_getaffinity(SB),NOSPLIT,$0 + MOVQ pid+0(FP), DI + MOVQ len+8(FP), SI + MOVQ buf+16(FP), DX + MOVL $204, AX // syscall entry + SYSCALL + MOVL AX, ret+24(FP) + RET + +// int32 runtime·epollcreate(int32 size); +TEXT runtime·epollcreate(SB),NOSPLIT,$0 + MOVL size+0(FP), DI + MOVL $213, AX // syscall entry + SYSCALL + MOVL AX, ret+8(FP) + RET + +// int32 runtime·epollcreate1(int32 flags); +TEXT runtime·epollcreate1(SB),NOSPLIT,$0 + MOVL flags+0(FP), DI + MOVL $291, AX // syscall entry + SYSCALL + MOVL AX, ret+8(FP) + RET + +// func epollctl(epfd, op, fd int32, ev *epollEvent) int +TEXT runtime·epollctl(SB),NOSPLIT,$0 + MOVL epfd+0(FP), DI + MOVL op+4(FP), SI + MOVL fd+8(FP), DX + MOVQ ev+16(FP), R10 + MOVL $233, AX // syscall entry + SYSCALL + MOVL AX, ret+24(FP) + RET + +// int32 runtime·epollwait(int32 epfd, EpollEvent *ev, int32 nev, int32 timeout); +TEXT runtime·epollwait(SB),NOSPLIT,$0 + MOVL epfd+0(FP), DI + MOVQ ev+8(FP), SI + MOVL nev+16(FP), DX + MOVL timeout+20(FP), R10 + MOVL $232, AX // syscall entry + SYSCALL + MOVL AX, ret+24(FP) + RET + +// void runtime·closeonexec(int32 fd); +TEXT runtime·closeonexec(SB),NOSPLIT,$0 + MOVL fd+0(FP), DI // fd + MOVQ $2, SI // F_SETFD + MOVQ $1, DX // FD_CLOEXEC + MOVL $72, AX // fcntl + SYSCALL + RET diff --git a/src/runtime/sys_linux_arm.s b/src/runtime/sys_linux_arm.s new file mode 100644 index 000000000..bd285f399 --- /dev/null +++ b/src/runtime/sys_linux_arm.s @@ -0,0 +1,461 @@ +// Copyright 2009 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. 
+ +// +// System calls and other sys.stuff for arm, Linux +// + +#include "zasm_GOOS_GOARCH.h" +#include "textflag.h" + +// for EABI, as we don't support OABI +#define SYS_BASE 0x0 + +#define SYS_exit (SYS_BASE + 1) +#define SYS_read (SYS_BASE + 3) +#define SYS_write (SYS_BASE + 4) +#define SYS_open (SYS_BASE + 5) +#define SYS_close (SYS_BASE + 6) +#define SYS_gettimeofday (SYS_BASE + 78) +#define SYS_clone (SYS_BASE + 120) +#define SYS_rt_sigreturn (SYS_BASE + 173) +#define SYS_rt_sigaction (SYS_BASE + 174) +#define SYS_rt_sigprocmask (SYS_BASE + 175) +#define SYS_sigaltstack (SYS_BASE + 186) +#define SYS_mmap2 (SYS_BASE + 192) +#define SYS_futex (SYS_BASE + 240) +#define SYS_exit_group (SYS_BASE + 248) +#define SYS_munmap (SYS_BASE + 91) +#define SYS_madvise (SYS_BASE + 220) +#define SYS_setitimer (SYS_BASE + 104) +#define SYS_mincore (SYS_BASE + 219) +#define SYS_gettid (SYS_BASE + 224) +#define SYS_tkill (SYS_BASE + 238) +#define SYS_sched_yield (SYS_BASE + 158) +#define SYS_select (SYS_BASE + 142) // newselect +#define SYS_ugetrlimit (SYS_BASE + 191) +#define SYS_sched_getaffinity (SYS_BASE + 242) +#define SYS_clock_gettime (SYS_BASE + 263) +#define SYS_epoll_create (SYS_BASE + 250) +#define SYS_epoll_ctl (SYS_BASE + 251) +#define SYS_epoll_wait (SYS_BASE + 252) +#define SYS_epoll_create1 (SYS_BASE + 357) +#define SYS_fcntl (SYS_BASE + 55) + +#define ARM_BASE (SYS_BASE + 0x0f0000) + +TEXT runtime·open(SB),NOSPLIT,$0 + MOVW 0(FP), R0 + MOVW 4(FP), R1 + MOVW 8(FP), R2 + MOVW $SYS_open, R7 + SWI $0 + MOVW R0, ret+12(FP) + RET + +TEXT runtime·close(SB),NOSPLIT,$0 + MOVW 0(FP), R0 + MOVW $SYS_close, R7 + SWI $0 + MOVW R0, ret+4(FP) + RET + +TEXT runtime·write(SB),NOSPLIT,$0 + MOVW 0(FP), R0 + MOVW 4(FP), R1 + MOVW 8(FP), R2 + MOVW $SYS_write, R7 + SWI $0 + MOVW R0, ret+12(FP) + RET + +TEXT runtime·read(SB),NOSPLIT,$0 + MOVW 0(FP), R0 + MOVW 4(FP), R1 + MOVW 8(FP), R2 + MOVW $SYS_read, R7 + SWI $0 + MOVW R0, ret+12(FP) + RET + +TEXT runtime·getrlimit(SB),NOSPLIT,$0 + MOVW 0(FP), R0 + MOVW 4(FP), R1 + MOVW $SYS_ugetrlimit, R7 + SWI $0 + MOVW R0, ret+8(FP) + RET + +TEXT runtime·exit(SB),NOSPLIT,$-4 + MOVW 0(FP), R0 + MOVW $SYS_exit_group, R7 + SWI $0 + MOVW $1234, R0 + MOVW $1002, R1 + MOVW R0, (R1) // fail hard + +TEXT runtime·exit1(SB),NOSPLIT,$-4 + MOVW 0(FP), R0 + MOVW $SYS_exit, R7 + SWI $0 + MOVW $1234, R0 + MOVW $1003, R1 + MOVW R0, (R1) // fail hard + +TEXT runtime·raise(SB),NOSPLIT,$-4 + MOVW $SYS_gettid, R7 + SWI $0 + // arg 1 tid already in R0 from gettid + MOVW sig+0(FP), R1 // arg 2 - signal + MOVW $SYS_tkill, R7 + SWI $0 + RET + +TEXT runtime·mmap(SB),NOSPLIT,$0 + MOVW 0(FP), R0 + MOVW 4(FP), R1 + MOVW 8(FP), R2 + MOVW 12(FP), R3 + MOVW 16(FP), R4 + MOVW 20(FP), R5 + MOVW $SYS_mmap2, R7 + SWI $0 + MOVW $0xfffff001, R6 + CMP R6, R0 + RSB.HI $0, R0 + MOVW R0, ret+24(FP) + RET + +TEXT runtime·munmap(SB),NOSPLIT,$0 + MOVW 0(FP), R0 + MOVW 4(FP), R1 + MOVW $SYS_munmap, R7 + SWI $0 + MOVW $0xfffff001, R6 + CMP R6, R0 + MOVW.HI $0, R8 // crash on syscall failure + MOVW.HI R8, (R8) + RET + +TEXT runtime·madvise(SB),NOSPLIT,$0 + MOVW 0(FP), R0 + MOVW 4(FP), R1 + MOVW 8(FP), R2 + MOVW $SYS_madvise, R7 + SWI $0 + // ignore failure - maybe pages are locked + RET + +TEXT runtime·setitimer(SB),NOSPLIT,$0 + MOVW 0(FP), R0 + MOVW 4(FP), R1 + MOVW 8(FP), R2 + MOVW $SYS_setitimer, R7 + SWI $0 + RET + +TEXT runtime·mincore(SB),NOSPLIT,$0 + MOVW 0(FP), R0 + MOVW 4(FP), R1 + MOVW 8(FP), R2 + MOVW $SYS_mincore, R7 + SWI $0 + MOVW R0, ret+12(FP) + RET + +TEXT time·now(SB), NOSPLIT, $32 + MOVW $0, R0 
// CLOCK_REALTIME + MOVW $8(R13), R1 // timespec + MOVW $SYS_clock_gettime, R7 + SWI $0 + + MOVW 8(R13), R0 // sec + MOVW 12(R13), R2 // nsec + + MOVW R0, 0(FP) + MOVW $0, R1 + MOVW R1, 4(FP) + MOVW R2, 8(FP) + RET + +// int64 nanotime(void) +TEXT runtime·nanotime(SB),NOSPLIT,$32 + MOVW $1, R0 // CLOCK_MONOTONIC + MOVW $8(R13), R1 // timespec + MOVW $SYS_clock_gettime, R7 + SWI $0 + + MOVW 8(R13), R0 // sec + MOVW 12(R13), R2 // nsec + + MOVW $1000000000, R3 + MULLU R0, R3, (R1, R0) + MOVW $0, R4 + ADD.S R2, R0 + ADC R4, R1 + + MOVW R0, ret_lo+0(FP) + MOVW R1, ret_hi+4(FP) + RET + +// int32 futex(int32 *uaddr, int32 op, int32 val, +// struct timespec *timeout, int32 *uaddr2, int32 val2); +TEXT runtime·futex(SB),NOSPLIT,$0 + MOVW 4(SP), R0 + MOVW 8(SP), R1 + MOVW 12(SP), R2 + MOVW 16(SP), R3 + MOVW 20(SP), R4 + MOVW 24(SP), R5 + MOVW $SYS_futex, R7 + SWI $0 + MOVW R0, ret+24(FP) + RET + + +// int32 clone(int32 flags, void *stack, M *mp, G *gp, void (*fn)(void)); +TEXT runtime·clone(SB),NOSPLIT,$0 + MOVW flags+0(FP), R0 + MOVW stk+4(FP), R1 + MOVW $0, R2 // parent tid ptr + MOVW $0, R3 // tls_val + MOVW $0, R4 // child tid ptr + MOVW $0, R5 + + // Copy mp, gp, fn off parent stack for use by child. + // TODO(kaib): figure out which registers are clobbered by clone and avoid stack copying + MOVW $-16(R1), R1 + MOVW mm+8(FP), R6 + MOVW R6, 0(R1) + MOVW gg+12(FP), R6 + MOVW R6, 4(R1) + MOVW fn+16(FP), R6 + MOVW R6, 8(R1) + MOVW $1234, R6 + MOVW R6, 12(R1) + + MOVW $SYS_clone, R7 + SWI $0 + + // In parent, return. + CMP $0, R0 + BEQ 3(PC) + MOVW R0, ret+20(FP) + RET + + // Paranoia: check that SP is as we expect. Use R13 to avoid linker 'fixup' + MOVW 12(R13), R0 + MOVW $1234, R1 + CMP R0, R1 + BEQ 2(PC) + BL runtime·abort(SB) + + MOVW 4(R13), g + MOVW 0(R13), R8 + MOVW R8, g_m(g) + + // paranoia; check they are not nil + MOVW 0(R8), R0 + MOVW 0(g), R0 + + BL runtime·emptyfunc(SB) // fault if stack check is wrong + + // Initialize m->procid to Linux tid + MOVW $SYS_gettid, R7 + SWI $0 + MOVW g_m(g), R8 + MOVW R0, m_procid(R8) + + // Call fn + MOVW 8(R13), R0 + MOVW $16(R13), R13 + BL (R0) + + MOVW $0, R0 + MOVW R0, 4(R13) + BL runtime·exit1(SB) + + // It shouldn't return + MOVW $1234, R0 + MOVW $1005, R1 + MOVW R0, (R1) + +TEXT runtime·sigaltstack(SB),NOSPLIT,$0 + MOVW 0(FP), R0 + MOVW 4(FP), R1 + MOVW $SYS_sigaltstack, R7 + SWI $0 + MOVW $0xfffff001, R6 + CMP R6, R0 + MOVW.HI $0, R8 // crash on syscall failure + MOVW.HI R8, (R8) + RET + +TEXT runtime·sigtramp(SB),NOSPLIT,$24 + // this might be called in external code context, + // where g is not set. 
+ // first save R0, because runtime·load_g will clobber it + MOVW R0, 4(R13) + MOVB runtime·iscgo(SB), R0 + CMP $0, R0 + BL.NE runtime·load_g(SB) + + CMP $0, g + BNE 4(PC) + // signal number is already prepared in 4(R13) + MOVW $runtime·badsignal(SB), R11 + BL (R11) + RET + + // save g + MOVW g, R3 + MOVW g, 20(R13) + + // g = m->gsignal + MOVW g_m(g), R8 + MOVW m_gsignal(R8), g + + // copy arguments for call to sighandler + // R0 is already saved above + MOVW R1, 8(R13) + MOVW R2, 12(R13) + MOVW R3, 16(R13) + + BL runtime·sighandler(SB) + + // restore g + MOVW 20(R13), g + + RET + +TEXT runtime·rtsigprocmask(SB),NOSPLIT,$0 + MOVW 0(FP), R0 + MOVW 4(FP), R1 + MOVW 8(FP), R2 + MOVW 12(FP), R3 + MOVW $SYS_rt_sigprocmask, R7 + SWI $0 + RET + +TEXT runtime·rt_sigaction(SB),NOSPLIT,$0 + MOVW 0(FP), R0 + MOVW 4(FP), R1 + MOVW 8(FP), R2 + MOVW 12(FP), R3 + MOVW $SYS_rt_sigaction, R7 + SWI $0 + MOVW R0, ret+16(FP) + RET + +TEXT runtime·usleep(SB),NOSPLIT,$12 + MOVW usec+0(FP), R0 + MOVW R0, R1 + MOVW $1000000, R2 + DIV R2, R0 + MOD R2, R1 + MOVW R0, 4(SP) + MOVW R1, 8(SP) + MOVW $0, R0 + MOVW $0, R1 + MOVW $0, R2 + MOVW $0, R3 + MOVW $4(SP), R4 + MOVW $SYS_select, R7 + SWI $0 + RET + +// Use kernel version instead of native armcas in asm_arm.s. +// See ../sync/atomic/asm_linux_arm.s for details. +TEXT cas<>(SB),NOSPLIT,$0 + MOVW $0xffff0fc0, PC + +TEXT runtime·cas(SB),NOSPLIT,$0 + MOVW ptr+0(FP), R2 + MOVW old+4(FP), R0 +casagain: + MOVW new+8(FP), R1 + BL cas<>(SB) + BCC cascheck + MOVW $1, R0 + MOVB R0, ret+12(FP) + RET +cascheck: + // Kernel lies; double-check. + MOVW ptr+0(FP), R2 + MOVW old+4(FP), R0 + MOVW 0(R2), R3 + CMP R0, R3 + BEQ casagain + MOVW $0, R0 + MOVB R0, ret+12(FP) + RET + +TEXT runtime·casp(SB),NOSPLIT,$0 + B runtime·cas(SB) + +TEXT runtime·osyield(SB),NOSPLIT,$0 + MOVW $SYS_sched_yield, R7 + SWI $0 + RET + +TEXT runtime·sched_getaffinity(SB),NOSPLIT,$0 + MOVW 0(FP), R0 + MOVW 4(FP), R1 + MOVW 8(FP), R2 + MOVW $SYS_sched_getaffinity, R7 + SWI $0 + MOVW R0, ret+12(FP) + RET + +// int32 runtime·epollcreate(int32 size) +TEXT runtime·epollcreate(SB),NOSPLIT,$0 + MOVW 0(FP), R0 + MOVW $SYS_epoll_create, R7 + SWI $0 + MOVW R0, ret+4(FP) + RET + +// int32 runtime·epollcreate1(int32 flags) +TEXT runtime·epollcreate1(SB),NOSPLIT,$0 + MOVW 0(FP), R0 + MOVW $SYS_epoll_create1, R7 + SWI $0 + MOVW R0, ret+4(FP) + RET + +// func epollctl(epfd, op, fd int32, ev *epollEvent) int +TEXT runtime·epollctl(SB),NOSPLIT,$0 + MOVW epfd+0(FP), R0 + MOVW op+4(FP), R1 + MOVW fd+8(FP), R2 + MOVW ev+12(FP), R3 + MOVW $SYS_epoll_ctl, R7 + SWI $0 + MOVW R0, ret+16(FP) + RET + +// int32 runtime·epollwait(int32 epfd, EpollEvent *ev, int32 nev, int32 timeout) +TEXT runtime·epollwait(SB),NOSPLIT,$0 + MOVW 0(FP), R0 + MOVW 4(FP), R1 + MOVW 8(FP), R2 + MOVW 12(FP), R3 + MOVW $SYS_epoll_wait, R7 + SWI $0 + MOVW R0, ret+16(FP) + RET + +// void runtime·closeonexec(int32 fd) +TEXT runtime·closeonexec(SB),NOSPLIT,$0 + MOVW 0(FP), R0 // fd + MOVW $2, R1 // F_SETFD + MOVW $1, R2 // FD_CLOEXEC + MOVW $SYS_fcntl, R7 + SWI $0 + RET + +// b __kuser_get_tls @ 0xffff0fe0 +TEXT runtime·read_tls_fallback(SB),NOSPLIT,$-4 + MOVW $0xffff0fe0, R0 + B (R0) diff --git a/src/runtime/sys_nacl_386.s b/src/runtime/sys_nacl_386.s new file mode 100644 index 000000000..47985f31f --- /dev/null +++ b/src/runtime/sys_nacl_386.s @@ -0,0 +1,363 @@ +// Copyright 2013 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. 
+ +#include "zasm_GOOS_GOARCH.h" +#include "textflag.h" +#include "syscall_nacl.h" + +#define NACL_SYSCALL(code) \ + MOVL $(0x10000 + ((code)<<5)), AX; CALL AX + +TEXT runtime·exit(SB),NOSPLIT,$4 + MOVL code+0(FP), AX + MOVL AX, 0(SP) + NACL_SYSCALL(SYS_exit) + JMP 0(PC) + +TEXT runtime·exit1(SB),NOSPLIT,$4 + MOVL code+0(FP), AX + MOVL AX, 0(SP) + NACL_SYSCALL(SYS_thread_exit) + RET + +TEXT runtime·open(SB),NOSPLIT,$12 + MOVL name+0(FP), AX + MOVL AX, 0(SP) + MOVL mode+4(FP), AX + MOVL AX, 4(SP) + MOVL perm+8(FP), AX + MOVL AX, 8(SP) + NACL_SYSCALL(SYS_open) + MOVL AX, ret+12(FP) + RET + +TEXT runtime·close(SB),NOSPLIT,$4 + MOVL fd+0(FP), AX + MOVL AX, 0(SP) + NACL_SYSCALL(SYS_close) + MOVL AX, ret+4(FP) + RET + +TEXT runtime·read(SB),NOSPLIT,$12 + MOVL fd+0(FP), AX + MOVL AX, 0(SP) + MOVL p+4(FP), AX + MOVL AX, 4(SP) + MOVL n+8(FP), AX + MOVL AX, 8(SP) + NACL_SYSCALL(SYS_read) + MOVL AX, ret+12(FP) + RET + +TEXT syscall·naclWrite(SB), NOSPLIT, $16-16 + MOVL arg1+0(FP), DI + MOVL arg2+4(FP), SI + MOVL arg3+8(FP), DX + MOVL DI, 0(SP) + MOVL SI, 4(SP) + MOVL DX, 8(SP) + CALL runtime·write(SB) + MOVL AX, ret+16(FP) + RET + +TEXT runtime·write(SB),NOSPLIT,$12 + MOVL fd+0(FP), AX + MOVL AX, 0(SP) + MOVL p+4(FP), AX + MOVL AX, 4(SP) + MOVL n+8(FP), AX + MOVL AX, 8(SP) + NACL_SYSCALL(SYS_write) + MOVL AX, ret+12(FP) + RET + +TEXT runtime·nacl_exception_stack(SB),NOSPLIT,$8 + MOVL p+0(FP), AX + MOVL AX, 0(SP) + MOVL size+4(FP), AX + MOVL AX, 4(SP) + NACL_SYSCALL(SYS_exception_stack) + MOVL AX, ret+8(FP) + RET + +TEXT runtime·nacl_exception_handler(SB),NOSPLIT,$8 + MOVL fn+0(FP), AX + MOVL AX, 0(SP) + MOVL arg+4(FP), AX + MOVL AX, 4(SP) + NACL_SYSCALL(SYS_exception_handler) + MOVL AX, ret+8(FP) + RET + +TEXT runtime·nacl_sem_create(SB),NOSPLIT,$4 + MOVL flag+0(FP), AX + MOVL AX, 0(SP) + NACL_SYSCALL(SYS_sem_create) + MOVL AX, ret+4(FP) + RET + +TEXT runtime·nacl_sem_wait(SB),NOSPLIT,$4 + MOVL sem+0(FP), AX + MOVL AX, 0(SP) + NACL_SYSCALL(SYS_sem_wait) + MOVL AX, ret+4(FP) + RET + +TEXT runtime·nacl_sem_post(SB),NOSPLIT,$4 + MOVL sem+0(FP), AX + MOVL AX, 0(SP) + NACL_SYSCALL(SYS_sem_post) + MOVL AX, ret+4(FP) + RET + +TEXT runtime·nacl_mutex_create(SB),NOSPLIT,$4 + MOVL flag+0(FP), AX + MOVL AX, 0(SP) + NACL_SYSCALL(SYS_mutex_create) + MOVL AX, ret+4(FP) + RET + +TEXT runtime·nacl_mutex_lock(SB),NOSPLIT,$4 + MOVL mutex+0(FP), AX + MOVL AX, 0(SP) + NACL_SYSCALL(SYS_mutex_lock) + MOVL AX, ret+4(FP) + RET + +TEXT runtime·nacl_mutex_trylock(SB),NOSPLIT,$4 + MOVL mutex+0(FP), AX + MOVL AX, 0(SP) + NACL_SYSCALL(SYS_mutex_trylock) + MOVL AX, ret+4(FP) + RET + +TEXT runtime·nacl_mutex_unlock(SB),NOSPLIT,$4 + MOVL mutex+0(FP), AX + MOVL AX, 0(SP) + NACL_SYSCALL(SYS_mutex_unlock) + MOVL AX, ret+4(FP) + RET + +TEXT runtime·nacl_cond_create(SB),NOSPLIT,$4 + MOVL flag+0(FP), AX + MOVL AX, 0(SP) + NACL_SYSCALL(SYS_cond_create) + MOVL AX, ret+4(FP) + RET + +TEXT runtime·nacl_cond_wait(SB),NOSPLIT,$8 + MOVL cond+0(FP), AX + MOVL AX, 0(SP) + MOVL n+4(FP), AX + MOVL AX, 4(SP) + NACL_SYSCALL(SYS_cond_wait) + MOVL AX, ret+8(FP) + RET + +TEXT runtime·nacl_cond_signal(SB),NOSPLIT,$4 + MOVL cond+0(FP), AX + MOVL AX, 0(SP) + NACL_SYSCALL(SYS_cond_signal) + MOVL AX, ret+4(FP) + RET + +TEXT runtime·nacl_cond_broadcast(SB),NOSPLIT,$4 + MOVL cond+0(FP), AX + MOVL AX, 0(SP) + NACL_SYSCALL(SYS_cond_broadcast) + MOVL AX, ret+4(FP) + RET + +TEXT runtime·nacl_cond_timed_wait_abs(SB),NOSPLIT,$12 + MOVL cond+0(FP), AX + MOVL AX, 0(SP) + MOVL lock+4(FP), AX + MOVL AX, 4(SP) + MOVL ts+8(FP), AX + MOVL AX, 8(SP) + 
NACL_SYSCALL(SYS_cond_timed_wait_abs)
+ MOVL AX, ret+12(FP)
+ RET
+
+TEXT runtime·nacl_thread_create(SB),NOSPLIT,$16
+ MOVL fn+0(FP), AX
+ MOVL AX, 0(SP)
+ MOVL stk+4(FP), AX
+ MOVL AX, 4(SP)
+ MOVL tls+8(FP), AX
+ MOVL AX, 8(SP)
+ MOVL xx+12(FP), AX
+ MOVL AX, 12(SP)
+ NACL_SYSCALL(SYS_thread_create)
+ MOVL AX, ret+16(FP)
+ RET
+
+TEXT runtime·mstart_nacl(SB),NOSPLIT,$0
+ JMP runtime·mstart(SB)
+
+TEXT runtime·nacl_nanosleep(SB),NOSPLIT,$8
+ MOVL ts+0(FP), AX
+ MOVL AX, 0(SP)
+ MOVL extra+4(FP), AX
+ MOVL AX, 4(SP)
+ NACL_SYSCALL(SYS_nanosleep)
+ MOVL AX, ret+8(FP)
+ RET
+
+TEXT runtime·osyield(SB),NOSPLIT,$0
+ NACL_SYSCALL(SYS_sched_yield)
+ RET
+
+TEXT runtime·mmap(SB),NOSPLIT,$32
+ MOVL addr+0(FP), AX
+ MOVL AX, 0(SP)
+ MOVL n+4(FP), AX
+ MOVL AX, 4(SP)
+ MOVL prot+8(FP), AX
+ MOVL AX, 8(SP)
+ MOVL flags+12(FP), AX
+ MOVL AX, 12(SP)
+ MOVL fd+16(FP), AX
+ MOVL AX, 16(SP)
+ MOVL off+20(FP), AX
+ MOVL AX, 24(SP)
+ MOVL $0, 28(SP)
+ LEAL 24(SP), AX
+ MOVL AX, 20(SP)
+ NACL_SYSCALL(SYS_mmap)
+ MOVL AX, ret+24(FP)
+ RET
+
+TEXT time·now(SB),NOSPLIT,$20
+ MOVL $0, 0(SP) // real time clock
+ LEAL 8(SP), AX
+ MOVL AX, 4(SP) // timespec
+ NACL_SYSCALL(SYS_clock_gettime)
+ MOVL 8(SP), AX // low 32 sec
+ MOVL 12(SP), CX // high 32 sec
+ MOVL 16(SP), BX // nsec
+
+ // sec is in AX, nsec in BX
+ MOVL AX, sec+0(FP)
+ MOVL CX, sec+4(FP)
+ MOVL BX, nsec+8(FP)
+ RET
+
+TEXT syscall·now(SB),NOSPLIT,$0
+ JMP time·now(SB)
+
+TEXT runtime·nacl_clock_gettime(SB),NOSPLIT,$8
+ MOVL arg1+0(FP), AX
+ MOVL AX, 0(SP)
+ MOVL arg2+4(FP), AX
+ MOVL AX, 4(SP)
+ NACL_SYSCALL(SYS_clock_gettime)
+ MOVL AX, ret+8(FP)
+ RET
+
+TEXT runtime·nanotime(SB),NOSPLIT,$20
+ MOVL $0, 0(SP) // real time clock
+ LEAL 8(SP), AX
+ MOVL AX, 4(SP) // timespec
+ NACL_SYSCALL(SYS_clock_gettime)
+ MOVL 8(SP), AX // low 32 sec
+ MOVL 16(SP), BX // nsec
+
+ // sec is in AX, nsec in BX
+ // convert to DX:AX nsec
+ MOVL $1000000000, CX
+ MULL CX
+ ADDL BX, AX
+ ADCL $0, DX
+
+ MOVL AX, ret_lo+0(FP)
+ MOVL DX, ret_hi+4(FP)
+ RET
+
+TEXT runtime·setldt(SB),NOSPLIT,$8
+ MOVL addr+4(FP), BX // aka base
+ ADDL $0x8, BX
+ MOVL BX, 0(SP)
+ NACL_SYSCALL(SYS_tls_init)
+ RET
+
+TEXT runtime·sigtramp(SB),NOSPLIT,$0
+ get_tls(CX)
+
+ // check that g exists
+ MOVL g(CX), DI
+ CMPL DI, $0
+ JNE 6(PC)
+ MOVL $11, BX
+ MOVL $0, 0(SP)
+ MOVL $runtime·badsignal(SB), AX
+ CALL AX
+ JMP sigtramp_ret
+
+ // save g
+ MOVL DI, 20(SP)
+
+ // g = m->gsignal
+ MOVL g_m(DI), BX
+ MOVL m_gsignal(BX), BX
+ MOVL BX, g(CX)
+
+ // copy arguments for sighandler
+ MOVL $11, 0(SP) // signal
+ MOVL $0, 4(SP) // siginfo
+ LEAL ctxt+4(FP), AX
+ MOVL AX, 8(SP) // context
+ MOVL DI, 12(SP) // g
+
+ CALL runtime·sighandler(SB)
+
+ // restore g
+ get_tls(CX)
+ MOVL 20(SP), BX
+ MOVL BX, g(CX)
+
+sigtramp_ret:
+ // Enable exceptions again.
+ NACL_SYSCALL(SYS_exception_clear_flag)
+
+ // NaCl has abdicated its traditional operating system responsibility
+ // and declined to implement 'sigreturn'. Instead the only way to return
+ // to the execution of our program is to restore the registers ourselves.
+ // Unfortunately, that is impossible to do with strict fidelity, because
+ // there is no way to do the final update of PC that ends the sequence
+ // without either (1) jumping to a register, in which case the register ends
+ // holding the PC value instead of its intended value or (2) storing the PC
+ // on the stack and using RET, which imposes the requirement that SP is
+ // valid and that is okay to smash the word below it. 
The second would + // normally be the lesser of the two evils, except that on NaCl, the linker + // must rewrite RET into "POP reg; AND $~31, reg; JMP reg", so either way + // we are going to lose a register as a result of the incoming signal. + // Similarly, there is no way to restore EFLAGS; the usual way is to use + // POPFL, but NaCl rejects that instruction. We could inspect the bits and + // execute a sequence of instructions designed to recreate those flag + // settings, but that's a lot of work. + // + // Thankfully, Go's signal handlers never try to return directly to the + // executing code, so all the registers and EFLAGS are dead and can be + // smashed. The only registers that matter are the ones that are setting + // up for the simulated call that the signal handler has created. + // Today those registers are just PC and SP, but in case additional registers + // are relevant in the future (for example DX is the Go func context register) + // we restore as many registers as possible. + // + // We smash BP, because that's what the linker smashes during RET. + // + LEAL ctxt+4(FP), BP + ADDL $64, BP + MOVL 0(BP), AX + MOVL 4(BP), CX + MOVL 8(BP), DX + MOVL 12(BP), BX + MOVL 16(BP), SP + // 20(BP) is saved BP, never to be seen again + MOVL 24(BP), SI + MOVL 28(BP), DI + // 36(BP) is saved EFLAGS, never to be seen again + MOVL 32(BP), BP // saved PC + JMP BP diff --git a/src/runtime/sys_nacl_amd64p32.s b/src/runtime/sys_nacl_amd64p32.s new file mode 100644 index 000000000..4eb4aacdd --- /dev/null +++ b/src/runtime/sys_nacl_amd64p32.s @@ -0,0 +1,459 @@ +// Copyright 2013 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#include "zasm_GOOS_GOARCH.h" +#include "textflag.h" +#include "syscall_nacl.h" + +#define NACL_SYSCALL(code) \ + MOVL $(0x10000 + ((code)<<5)), AX; CALL AX + +TEXT runtime·settls(SB),NOSPLIT,$0 + MOVL DI, TLS // really BP + RET + +TEXT runtime·exit(SB),NOSPLIT,$0 + MOVL code+0(FP), DI + NACL_SYSCALL(SYS_exit) + RET + +TEXT runtime·exit1(SB),NOSPLIT,$0 + MOVL code+0(FP), DI + NACL_SYSCALL(SYS_thread_exit) + RET + +TEXT runtime·open(SB),NOSPLIT,$0 + MOVL name+0(FP), DI + MOVL mode+4(FP), SI + MOVL perm+8(FP), DX + NACL_SYSCALL(SYS_open) + MOVL AX, ret+16(FP) + RET + +TEXT runtime·close(SB),NOSPLIT,$0 + MOVL fd+0(FP), DI + NACL_SYSCALL(SYS_close) + MOVL AX, ret+8(FP) + RET + +TEXT runtime·read(SB),NOSPLIT,$0 + MOVL fd+0(FP), DI + MOVL p+4(FP), SI + MOVL n+8(FP), DX + NACL_SYSCALL(SYS_read) + MOVL AX, ret+16(FP) + RET + +TEXT syscall·naclWrite(SB), NOSPLIT, $24-20 + MOVL arg1+0(FP), DI + MOVL arg2+4(FP), SI + MOVL arg3+8(FP), DX + MOVL DI, 0(SP) + MOVL SI, 4(SP) + MOVL DX, 8(SP) + CALL runtime·write(SB) + MOVL 16(SP), AX + MOVL AX, ret+16(FP) + RET + +TEXT runtime·write(SB),NOSPLIT,$16-20 + // If using fake time and writing to stdout or stderr, + // emit playback header before actual data. + MOVQ runtime·faketime(SB), AX + CMPQ AX, $0 + JEQ write + MOVL fd+0(FP), DI + CMPL DI, $1 + JEQ playback + CMPL DI, $2 + JEQ playback + +write: + // Ordinary write. + MOVL fd+0(FP), DI + MOVL p+4(FP), SI + MOVL n+8(FP), DX + NACL_SYSCALL(SYS_write) + MOVL AX, ret+16(FP) + RET + + // Write with playback header. + // First, lock to avoid interleaving writes. 
+playback: + MOVL $1, BX + XCHGL runtime·writelock(SB), BX + CMPL BX, $0 + JNE playback + + // Playback header: 0 0 P B <8-byte time> <4-byte data length> + MOVL $(('B'<<24) | ('P'<<16)), 0(SP) + BSWAPQ AX + MOVQ AX, 4(SP) + MOVL n+8(FP), DX + BSWAPL DX + MOVL DX, 12(SP) + MOVL $1, DI // standard output + MOVL SP, SI + MOVL $16, DX + NACL_SYSCALL(SYS_write) + + // Write actual data. + MOVL $1, DI // standard output + MOVL p+4(FP), SI + MOVL n+8(FP), DX + NACL_SYSCALL(SYS_write) + + // Unlock. + MOVL $0, runtime·writelock(SB) + + MOVL AX, ret+16(FP) + RET + +TEXT runtime·nacl_exception_stack(SB),NOSPLIT,$0 + MOVL p+0(FP), DI + MOVL size+4(FP), SI + NACL_SYSCALL(SYS_exception_stack) + MOVL AX, ret+8(FP) + RET + +TEXT runtime·nacl_exception_handler(SB),NOSPLIT,$0 + MOVL fn+0(FP), DI + MOVL arg+4(FP), SI + NACL_SYSCALL(SYS_exception_handler) + MOVL AX, ret+8(FP) + RET + +TEXT runtime·nacl_sem_create(SB),NOSPLIT,$0 + MOVL flag+0(FP), DI + NACL_SYSCALL(SYS_sem_create) + MOVL AX, ret+8(FP) + RET + +TEXT runtime·nacl_sem_wait(SB),NOSPLIT,$0 + MOVL sem+0(FP), DI + NACL_SYSCALL(SYS_sem_wait) + MOVL AX, ret+8(FP) + RET + +TEXT runtime·nacl_sem_post(SB),NOSPLIT,$0 + MOVL sem+0(FP), DI + NACL_SYSCALL(SYS_sem_post) + MOVL AX, ret+8(FP) + RET + +TEXT runtime·nacl_mutex_create(SB),NOSPLIT,$0 + MOVL flag+0(FP), DI + NACL_SYSCALL(SYS_mutex_create) + MOVL AX, ret+8(FP) + RET + +TEXT runtime·nacl_mutex_lock(SB),NOSPLIT,$0 + MOVL mutex+0(FP), DI + NACL_SYSCALL(SYS_mutex_lock) + MOVL AX, ret+8(FP) + RET + +TEXT runtime·nacl_mutex_trylock(SB),NOSPLIT,$0 + MOVL mutex+0(FP), DI + NACL_SYSCALL(SYS_mutex_trylock) + MOVL AX, ret+8(FP) + RET + +TEXT runtime·nacl_mutex_unlock(SB),NOSPLIT,$0 + MOVL mutex+0(FP), DI + NACL_SYSCALL(SYS_mutex_unlock) + MOVL AX, ret+8(FP) + RET + +TEXT runtime·nacl_cond_create(SB),NOSPLIT,$0 + MOVL flag+0(FP), DI + NACL_SYSCALL(SYS_cond_create) + MOVL AX, ret+8(FP) + RET + +TEXT runtime·nacl_cond_wait(SB),NOSPLIT,$0 + MOVL cond+0(FP), DI + MOVL n+4(FP), SI + NACL_SYSCALL(SYS_cond_wait) + MOVL AX, ret+8(FP) + RET + +TEXT runtime·nacl_cond_signal(SB),NOSPLIT,$0 + MOVL cond+0(FP), DI + NACL_SYSCALL(SYS_cond_signal) + MOVL AX, ret+8(FP) + RET + +TEXT runtime·nacl_cond_broadcast(SB),NOSPLIT,$0 + MOVL cond+0(FP), DI + NACL_SYSCALL(SYS_cond_broadcast) + MOVL AX, ret+8(FP) + RET + +TEXT runtime·nacl_cond_timed_wait_abs(SB),NOSPLIT,$0 + MOVL cond+0(FP), DI + MOVL lock+4(FP), SI + MOVL ts+8(FP), DX + NACL_SYSCALL(SYS_cond_timed_wait_abs) + MOVL AX, ret+16(FP) + RET + +TEXT runtime·nacl_thread_create(SB),NOSPLIT,$0 + MOVL fn+0(FP), DI + MOVL stk+4(FP), SI + MOVL tls+8(FP), DX + MOVL xx+12(FP), CX + NACL_SYSCALL(SYS_thread_create) + MOVL AX, ret+16(FP) + RET + +TEXT runtime·mstart_nacl(SB),NOSPLIT,$0 + NACL_SYSCALL(SYS_tls_get) + SUBL $8, AX + MOVL AX, TLS + JMP runtime·mstart(SB) + +TEXT runtime·nacl_nanosleep(SB),NOSPLIT,$0 + MOVL ts+0(FP), DI + MOVL extra+4(FP), SI + NACL_SYSCALL(SYS_nanosleep) + MOVL AX, ret+8(FP) + RET + +TEXT runtime·osyield(SB),NOSPLIT,$0 + NACL_SYSCALL(SYS_sched_yield) + RET + +TEXT runtime·mmap(SB),NOSPLIT,$8 + MOVL addr+0(FP), DI + MOVL n+4(FP), SI + MOVL prot+8(FP), DX + MOVL flags+12(FP), CX + MOVL fd+16(FP), R8 + MOVL off+20(FP), AX + MOVQ AX, 0(SP) + MOVL SP, R9 + NACL_SYSCALL(SYS_mmap) + CMPL AX, $-4095 + JNA 2(PC) + NEGL AX + MOVL AX, ret+24(FP) + RET + +TEXT time·now(SB),NOSPLIT,$16 + MOVQ runtime·faketime(SB), AX + CMPQ AX, $0 + JEQ realtime + MOVQ $0, DX + MOVQ $1000000000, CX + DIVQ CX + MOVQ AX, sec+0(FP) + MOVL DX, nsec+8(FP) + RET +realtime: + MOVL $0, DI // real 
time clock + LEAL 0(SP), AX + MOVL AX, SI // timespec + NACL_SYSCALL(SYS_clock_gettime) + MOVL 0(SP), AX // low 32 sec + MOVL 4(SP), CX // high 32 sec + MOVL 8(SP), BX // nsec + + // sec is in AX, nsec in BX + MOVL AX, sec+0(FP) + MOVL CX, sec+4(FP) + MOVL BX, nsec+8(FP) + RET + +TEXT syscall·now(SB),NOSPLIT,$0 + JMP time·now(SB) + +TEXT runtime·nacl_clock_gettime(SB),NOSPLIT,$0 + MOVL arg1+0(FP), DI + MOVL arg2+4(FP), SI + NACL_SYSCALL(SYS_clock_gettime) + MOVL AX, ret+8(FP) + RET + +TEXT runtime·nanotime(SB),NOSPLIT,$16 + MOVQ runtime·faketime(SB), AX + CMPQ AX, $0 + JEQ 3(PC) + MOVQ AX, ret+0(FP) + RET + MOVL $0, DI // real time clock + LEAL 0(SP), AX + MOVL AX, SI // timespec + NACL_SYSCALL(SYS_clock_gettime) + MOVQ 0(SP), AX // sec + MOVL 8(SP), DX // nsec + + // sec is in AX, nsec in DX + // return nsec in AX + IMULQ $1000000000, AX + ADDQ DX, AX + MOVQ AX, ret+0(FP) + RET + +TEXT runtime·sigtramp(SB),NOSPLIT,$80 + // restore TLS register at time of execution, + // in case it's been smashed. + // the TLS register is really BP, but for consistency + // with non-NaCl systems it is referred to here as TLS. + // NOTE: Cannot use SYS_tls_get here (like we do in mstart_nacl), + // because the main thread never calls tls_set. + LEAL ctxt+0(FP), AX + MOVL (16*4+5*8)(AX), AX + MOVL AX, TLS + + // check that g exists + get_tls(CX) + MOVL g(CX), DI + + CMPL DI, $0 + JEQ nog + + // save g + MOVL DI, 20(SP) + + // g = m->gsignal + MOVL g_m(DI), BX + MOVL m_gsignal(BX), BX + MOVL BX, g(CX) + +//JMP debughandler + + // copy arguments for sighandler + MOVL $11, 0(SP) // signal + MOVL $0, 4(SP) // siginfo + LEAL ctxt+0(FP), AX + MOVL AX, 8(SP) // context + MOVL DI, 12(SP) // g + + CALL runtime·sighandler(SB) + + // restore g + get_tls(CX) + MOVL 20(SP), BX + MOVL BX, g(CX) + +sigtramp_ret: + // Enable exceptions again. + NACL_SYSCALL(SYS_exception_clear_flag) + + // Restore registers as best we can. Impossible to do perfectly. + // See comment in sys_nacl_386.s for extended rationale. + LEAL ctxt+0(FP), SI + ADDL $64, SI + MOVQ 0(SI), AX + MOVQ 8(SI), CX + MOVQ 16(SI), DX + MOVQ 24(SI), BX + MOVL 32(SI), SP // MOVL for SP sandboxing + // 40(SI) is saved BP aka TLS, already restored above + // 48(SI) is saved SI, never to be seen again + MOVQ 56(SI), DI + MOVQ 64(SI), R8 + MOVQ 72(SI), R9 + MOVQ 80(SI), R10 + MOVQ 88(SI), R11 + MOVQ 96(SI), R12 + MOVQ 104(SI), R13 + MOVQ 112(SI), R14 + // 120(SI) is R15, which is owned by Native Client and must not be modified + MOVQ 128(SI), SI // saved PC + // 136(SI) is saved EFLAGS, never to be seen again + JMP SI + +debughandler: + // print basic information + LEAL ctxt+0(FP), DI + MOVL $runtime·sigtrampf(SB), AX + MOVL AX, 0(SP) + MOVQ (16*4+16*8)(DI), BX // rip + MOVQ BX, 8(SP) + MOVQ (16*4+0*8)(DI), BX // rax + MOVQ BX, 16(SP) + MOVQ (16*4+1*8)(DI), BX // rcx + MOVQ BX, 24(SP) + MOVQ (16*4+2*8)(DI), BX // rdx + MOVQ BX, 32(SP) + MOVQ (16*4+3*8)(DI), BX // rbx + MOVQ BX, 40(SP) + MOVQ (16*4+7*8)(DI), BX // rdi + MOVQ BX, 48(SP) + MOVQ (16*4+15*8)(DI), BX // r15 + MOVQ BX, 56(SP) + MOVQ (16*4+4*8)(DI), BX // rsp + MOVQ 0(BX), BX + MOVQ BX, 64(SP) + CALL runtime·printf(SB) + + LEAL ctxt+0(FP), DI + MOVQ (16*4+16*8)(DI), BX // rip + MOVL BX, 0(SP) + MOVQ (16*4+4*8)(DI), BX // rsp + MOVL BX, 4(SP) + MOVL $0, 8(SP) // lr + get_tls(CX) + MOVL g(CX), BX + MOVL BX, 12(SP) // gp + CALL runtime·traceback(SB) + +notls: + MOVL 0, AX + RET + +nog: + MOVL 0, AX + RET + +// cannot do real signal handling yet, because gsignal has not been allocated. 
+MOVL $1, DI; NACL_SYSCALL(SYS_exit) + +TEXT runtime·nacl_sysinfo(SB),NOSPLIT,$16 +/* + MOVL di+0(FP), DI + LEAL 12(DI), BX + MOVL 8(DI), AX + ADDL 4(DI), AX + ADDL $2, AX + LEAL (BX)(AX*4), BX + MOVL BX, runtime·nacl_irt_query(SB) +auxloop: + MOVL 0(BX), DX + CMPL DX, $0 + JNE 2(PC) + RET + CMPL DX, $32 + JEQ auxfound + ADDL $8, BX + JMP auxloop +auxfound: + MOVL 4(BX), BX + MOVL BX, runtime·nacl_irt_query(SB) + + LEAL runtime·nacl_irt_basic_v0_1_str(SB), DI + LEAL runtime·nacl_irt_basic_v0_1(SB), SI + MOVL runtime·nacl_irt_basic_v0_1_size(SB), DX + MOVL runtime·nacl_irt_query(SB), BX + CALL BX + + LEAL runtime·nacl_irt_memory_v0_3_str(SB), DI + LEAL runtime·nacl_irt_memory_v0_3(SB), SI + MOVL runtime·nacl_irt_memory_v0_3_size(SB), DX + MOVL runtime·nacl_irt_query(SB), BX + CALL BX + + LEAL runtime·nacl_irt_thread_v0_1_str(SB), DI + LEAL runtime·nacl_irt_thread_v0_1(SB), SI + MOVL runtime·nacl_irt_thread_v0_1_size(SB), DX + MOVL runtime·nacl_irt_query(SB), BX + CALL BX + + // TODO: Once we have a NaCl SDK with futex syscall support, + // try switching to futex syscalls and here load the + // nacl-irt-futex-0.1 table. +*/ + RET diff --git a/src/runtime/sys_nacl_arm.s b/src/runtime/sys_nacl_arm.s new file mode 100644 index 000000000..d354ab483 --- /dev/null +++ b/src/runtime/sys_nacl_arm.s @@ -0,0 +1,320 @@ +// Copyright 2014 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#include "zasm_GOOS_GOARCH.h" +#include "textflag.h" +#include "syscall_nacl.h" + +#define NACL_SYSCALL(code) \ + MOVW $(0x10000 + ((code)<<5)), R8; BL (R8) + +TEXT runtime·exit(SB),NOSPLIT,$0 + MOVW code+0(FP), R0 + NACL_SYSCALL(SYS_exit) + RET + +TEXT runtime·exit1(SB),NOSPLIT,$0 + MOVW code+0(FP), R0 + NACL_SYSCALL(SYS_thread_exit) + RET + +TEXT runtime·open(SB),NOSPLIT,$0 + MOVW name+0(FP), R0 + MOVW name+0(FP), R1 + MOVW name+0(FP), R2 + NACL_SYSCALL(SYS_open) + MOVW R0, ret+12(FP) + RET + +TEXT runtime·close(SB),NOSPLIT,$0 + MOVW fd+0(FP), R0 + NACL_SYSCALL(SYS_close) + MOVW R0, ret+4(FP) + RET + +TEXT runtime·read(SB),NOSPLIT,$0 + MOVW fd+0(FP), R0 + MOVW p+4(FP), R1 + MOVW n+8(FP), R2 + NACL_SYSCALL(SYS_read) + MOVW R0, ret+12(FP) + RET + +// func naclWrite(fd int, b []byte) int +TEXT syscall·naclWrite(SB),NOSPLIT,$0 + MOVW arg1+0(FP), R0 + MOVW arg2+4(FP), R1 + MOVW arg3+8(FP), R2 + NACL_SYSCALL(SYS_write) + MOVW R0, ret+16(FP) + RET + +TEXT runtime·write(SB),NOSPLIT,$0 + MOVW fd+0(FP), R0 + MOVW p+4(FP), R1 + MOVW n+8(FP), R2 + NACL_SYSCALL(SYS_write) + MOVW R0, ret+12(FP) + RET + +TEXT runtime·nacl_exception_stack(SB),NOSPLIT,$0 + MOVW p+0(FP), R0 + MOVW size+4(FP), R1 + NACL_SYSCALL(SYS_exception_stack) + MOVW R0, ret+8(FP) + RET + +TEXT runtime·nacl_exception_handler(SB),NOSPLIT,$0 + MOVW fn+0(FP), R0 + MOVW arg+4(FP), R1 + NACL_SYSCALL(SYS_exception_handler) + MOVW R0, ret+8(FP) + RET + +TEXT runtime·nacl_sem_create(SB),NOSPLIT,$0 + MOVW flag+0(FP), R0 + NACL_SYSCALL(SYS_sem_create) + MOVW R0, ret+4(FP) + RET + +TEXT runtime·nacl_sem_wait(SB),NOSPLIT,$0 + MOVW sem+0(FP), R0 + NACL_SYSCALL(SYS_sem_wait) + MOVW R0, ret+4(FP) + RET + +TEXT runtime·nacl_sem_post(SB),NOSPLIT,$0 + MOVW sem+0(FP), R0 + NACL_SYSCALL(SYS_sem_post) + MOVW R0, ret+4(FP) + RET + +TEXT runtime·nacl_mutex_create(SB),NOSPLIT,$0 + MOVW flag+0(FP), R0 + NACL_SYSCALL(SYS_mutex_create) + MOVW R0, ret+4(FP) + RET + +TEXT runtime·nacl_mutex_lock(SB),NOSPLIT,$0 + MOVW mutex+0(FP), R0 + NACL_SYSCALL(SYS_mutex_lock) + MOVW R0, ret+4(FP) + RET + +TEXT 
runtime·nacl_mutex_trylock(SB),NOSPLIT,$0 + MOVW mutex+0(FP), R0 + NACL_SYSCALL(SYS_mutex_trylock) + MOVW R0, ret+4(FP) + RET + +TEXT runtime·nacl_mutex_unlock(SB),NOSPLIT,$0 + MOVW mutex+0(FP), R0 + NACL_SYSCALL(SYS_mutex_unlock) + MOVW R0, ret+4(FP) + RET + +TEXT runtime·nacl_cond_create(SB),NOSPLIT,$0 + MOVW flag+0(FP), R0 + NACL_SYSCALL(SYS_cond_create) + MOVW R0, ret+4(FP) + RET + +TEXT runtime·nacl_cond_wait(SB),NOSPLIT,$0 + MOVW cond+0(FP), R0 + MOVW n+4(FP), R1 + NACL_SYSCALL(SYS_cond_wait) + MOVW R0, ret+8(FP) + RET + +TEXT runtime·nacl_cond_signal(SB),NOSPLIT,$0 + MOVW cond+0(FP), R0 + NACL_SYSCALL(SYS_cond_signal) + MOVW R0, ret+4(FP) + RET + +TEXT runtime·nacl_cond_broadcast(SB),NOSPLIT,$0 + MOVW cond+0(FP), R0 + NACL_SYSCALL(SYS_cond_broadcast) + MOVW R0, ret+4(FP) + RET + +TEXT runtime·nacl_cond_timed_wait_abs(SB),NOSPLIT,$0 + MOVW cond+0(FP), R0 + MOVW lock+4(FP), R1 + MOVW ts+8(FP), R2 + NACL_SYSCALL(SYS_cond_timed_wait_abs) + MOVW R0, ret+12(FP) + RET + +TEXT runtime·nacl_thread_create(SB),NOSPLIT,$0 + MOVW fn+0(FP), R0 + MOVW stk+4(FP), R1 + MOVW tls+8(FP), R2 + MOVW xx+12(FP), R3 + NACL_SYSCALL(SYS_thread_create) + MOVW R0, ret+16(FP) + RET + +TEXT runtime·mstart_nacl(SB),NOSPLIT,$0 + MOVW 0(R9), R0 // TLS + MOVW -8(R0), R1 // g + MOVW -4(R0), R2 // m + MOVW R2, g_m(R1) + MOVW R1, g + B runtime·mstart(SB) + +TEXT runtime·nacl_nanosleep(SB),NOSPLIT,$0 + MOVW ts+0(FP), R0 + MOVW extra+4(FP), R1 + NACL_SYSCALL(SYS_nanosleep) + MOVW R0, ret+8(FP) + RET + +TEXT runtime·osyield(SB),NOSPLIT,$0 + NACL_SYSCALL(SYS_sched_yield) + RET + +TEXT runtime·mmap(SB),NOSPLIT,$8 + MOVW addr+0(FP), R0 + MOVW n+4(FP), R1 + MOVW prot+8(FP), R2 + MOVW flags+12(FP), R3 + MOVW fd+16(FP), R4 + // arg6:offset should be passed as a pointer (to int64) + MOVW off+20(FP), R5 + MOVW R5, 4(R13) + MOVW $0, R6 + MOVW R6, 8(R13) + MOVW $4(R13), R5 + MOVM.DB.W [R4,R5], (R13) // arg5 and arg6 are passed on stack + NACL_SYSCALL(SYS_mmap) + MOVM.IA.W (R13), [R4, R5] + CMP $-4095, R0 + RSB.HI $0, R0 + MOVW R0, ret+24(FP) + RET + +TEXT time·now(SB),NOSPLIT,$16 + MOVW $0, R0 // real time clock + MOVW $4(R13), R1 + NACL_SYSCALL(SYS_clock_gettime) + MOVW 4(R13), R0 // low 32-bit sec + MOVW 8(R13), R1 // high 32-bit sec + MOVW 12(R13), R2 // nsec + MOVW R0, sec+0(FP) + MOVW R1, sec+4(FP) + MOVW R2, sec+8(FP) + RET + +TEXT syscall·now(SB),NOSPLIT,$0 + B time·now(SB) + +TEXT runtime·nacl_clock_gettime(SB),NOSPLIT,$0 + MOVW arg1+0(FP), R0 + MOVW arg2+4(FP), R1 + NACL_SYSCALL(SYS_clock_gettime) + MOVW R0, ret+8(FP) + RET + +// int64 nanotime(void) so really +// void nanotime(int64 *nsec) +TEXT runtime·nanotime(SB),NOSPLIT,$16 + MOVW $0, R0 // real time clock + MOVW $4(R13), R1 + NACL_SYSCALL(SYS_clock_gettime) + MOVW 4(R13), R0 // low 32-bit sec + MOVW 8(R13), R1 // high 32-bit sec (ignored for now) + MOVW 12(R13), R2 // nsec + MOVW $1000000000, R3 + MULLU R0, R3, (R1, R0) + MOVW $0, R4 + ADD.S R2, R0 + ADC R4, R1 + MOVW R0, ret_lo+0(FP) + MOVW R1, ret_hi+4(FP) + RET + +TEXT runtime·sigtramp(SB),NOSPLIT,$80 + // load g from thread context + MOVW $ctxt+-4(FP), R0 + MOVW (16*4+10*4)(R0), g + + // check that g exists + CMP $0, g + BNE 4(PC) + MOVW $runtime·badsignal2(SB), R11 + BL (R11) + RET + + // save g + MOVW g, R3 + MOVW g, 20(R13) + + // g = m->gsignal + MOVW g_m(g), R8 + MOVW m_gsignal(R8), g + + // copy arguments for call to sighandler + MOVW $11, R0 + MOVW R0, 4(R13) // signal + MOVW $0, R0 + MOVW R0, 8(R13) // siginfo + MOVW $ctxt+-4(FP), R0 + MOVW R0, 12(R13) // context + MOVW R3, 16(R13) // g + + BL 
runtime·sighandler(SB)
+
+ // restore g
+ MOVW 20(R13), g
+
+sigtramp_ret:
+ // Enable exceptions again.
+ NACL_SYSCALL(SYS_exception_clear_flag)
+
+ // Restore registers as best we can. Impossible to do perfectly.
+ // See comment in sys_nacl_386.s for extended rationale.
+ MOVW $ctxt+-4(FP), R1
+ ADD $64, R1
+ MOVW (0*4)(R1), R0
+ MOVW (2*4)(R1), R2
+ MOVW (3*4)(R1), R3
+ MOVW (4*4)(R1), R4
+ MOVW (5*4)(R1), R5
+ MOVW (6*4)(R1), R6
+ MOVW (7*4)(R1), R7
+ MOVW (8*4)(R1), R8
+ // cannot write to R9
+ MOVW (10*4)(R1), g
+ MOVW (11*4)(R1), R11
+ MOVW (12*4)(R1), R12
+ MOVW (13*4)(R1), R13
+ MOVW (14*4)(R1), R14
+ MOVW (15*4)(R1), R1
+ B (R1)
+
+nog:
+ MOVW $0, R0
+ RET
+
+TEXT runtime·nacl_sysinfo(SB),NOSPLIT,$16
+ RET
+
+TEXT runtime·casp(SB),NOSPLIT,$0
+ B runtime·cas(SB)
+
+// This is only valid for ARMv6+, however, NaCl/ARM is only defined
+// for ARMv7A anyway.
+// bool armcas(int32 *val, int32 old, int32 new)
+// Atomically:
+// if(*val == old){
+// *val = new;
+// return 1;
+// }else
+// return 0;
+TEXT runtime·cas(SB),NOSPLIT,$0
+ B runtime·armcas(SB)
+
+TEXT runtime·read_tls_fallback(SB),NOSPLIT,$-4
+ WORD $0xe7fedef0 // NACL_INSTR_ARM_ABORT_NOW (UDF #0xEDE0)
diff --git a/src/runtime/sys_netbsd_386.s b/src/runtime/sys_netbsd_386.s
new file mode 100644
index 000000000..23f2f6bd1
--- /dev/null
+++ b/src/runtime/sys_netbsd_386.s
@@ -0,0 +1,384 @@
+// Copyright 2009 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+//
+// System calls and other sys.stuff for 386, NetBSD
+// /usr/src/sys/kern/syscalls.master for syscall numbers.
+//
+
+#include "zasm_GOOS_GOARCH.h"
+#include "textflag.h"
+
+// Exit the entire program (like C exit)
+TEXT runtime·exit(SB),NOSPLIT,$-4
+ MOVL $1, AX
+ INT $0x80
+ MOVL $0xf1, 0xf1 // crash
+ RET
+
+TEXT runtime·exit1(SB),NOSPLIT,$-4
+ MOVL $310, AX // sys__lwp_exit
+ INT $0x80
+ JAE 2(PC)
+ MOVL $0xf1, 0xf1 // crash
+ RET
+
+TEXT runtime·open(SB),NOSPLIT,$-4
+ MOVL $5, AX
+ INT $0x80
+ MOVL AX, ret+12(FP)
+ RET
+
+TEXT runtime·close(SB),NOSPLIT,$-4
+ MOVL $6, AX
+ INT $0x80
+ MOVL AX, ret+4(FP)
+ RET
+
+TEXT runtime·read(SB),NOSPLIT,$-4
+ MOVL $3, AX
+ INT $0x80
+ MOVL AX, ret+12(FP)
+ RET
+
+TEXT runtime·write(SB),NOSPLIT,$-4
+ MOVL $4, AX // sys_write
+ INT $0x80
+ MOVL AX, ret+12(FP)
+ RET
+
+TEXT runtime·usleep(SB),NOSPLIT,$24
+ MOVL $0, DX
+ MOVL usec+0(FP), AX
+ MOVL $1000000, CX
+ DIVL CX
+ MOVL AX, 12(SP) // tv_sec - l32
+ MOVL $0, 16(SP) // tv_sec - h32
+ MOVL $1000, AX
+ MULL DX
+ MOVL AX, 20(SP) // tv_nsec
+
+ MOVL $0, 0(SP)
+ LEAL 12(SP), AX
+ MOVL AX, 4(SP) // arg 1 - rqtp
+ MOVL $0, 8(SP) // arg 2 - rmtp
+ MOVL $430, AX // sys_nanosleep
+ INT $0x80
+ RET
+
+TEXT runtime·raise(SB),NOSPLIT,$12
+ MOVL $311, AX // sys__lwp_self
+ INT $0x80
+ MOVL $0, 0(SP)
+ MOVL AX, 4(SP) // arg 1 - target
+ MOVL sig+0(FP), AX
+ MOVL AX, 8(SP) // arg 2 - signo
+ MOVL $318, AX // sys__lwp_kill
+ INT $0x80
+ RET
+
+TEXT runtime·mmap(SB),NOSPLIT,$36
+ LEAL addr+0(FP), SI
+ LEAL 4(SP), DI
+ CLD
+ MOVSL // arg 1 - addr
+ MOVSL // arg 2 - len
+ MOVSL // arg 3 - prot
+ MOVSL // arg 4 - flags
+ MOVSL // arg 5 - fd
+ MOVL $0, AX
+ STOSL // arg 6 - pad
+ MOVSL // arg 7 - offset
+ MOVL $0, AX // top 32 bits of file offset
+ STOSL
+ MOVL $197, AX // sys_mmap
+ INT $0x80
+ MOVL AX, ret+24(FP)
+ RET
+
+TEXT runtime·munmap(SB),NOSPLIT,$-4
+ MOVL $73, AX // sys_munmap
+ INT $0x80
+ JAE 2(PC)
+ MOVL $0xf1, 0xf1 // crash
+ RET
+
+TEXT runtime·madvise(SB),NOSPLIT,$-4
+ MOVL $75, AX // 
sys_madvise + INT $0x80 + // ignore failure - maybe pages are locked + RET + +TEXT runtime·setitimer(SB),NOSPLIT,$-4 + MOVL $425, AX // sys_setitimer + INT $0x80 + RET + +// func now() (sec int64, nsec int32) +TEXT time·now(SB), NOSPLIT, $32 + LEAL 12(SP), BX + MOVL $0, 4(SP) // arg 1 - clock_id + MOVL BX, 8(SP) // arg 2 - tp + MOVL $427, AX // sys_clock_gettime + INT $0x80 + + MOVL 12(SP), AX // sec - l32 + MOVL AX, sec+0(FP) + MOVL 16(SP), AX // sec - h32 + MOVL AX, sec+4(FP) + + MOVL 20(SP), BX // nsec + MOVL BX, nsec+8(FP) + RET + +// int64 nanotime(void) so really +// void nanotime(int64 *nsec) +TEXT runtime·nanotime(SB),NOSPLIT,$32 + LEAL 12(SP), BX + MOVL $0, 4(SP) // arg 1 - clock_id + MOVL BX, 8(SP) // arg 2 - tp + MOVL $427, AX // sys_clock_gettime + INT $0x80 + + MOVL 16(SP), CX // sec - h32 + IMULL $1000000000, CX + + MOVL 12(SP), AX // sec - l32 + MOVL $1000000000, BX + MULL BX // result in dx:ax + + MOVL 20(SP), BX // nsec + ADDL BX, AX + ADCL CX, DX // add high bits with carry + + MOVL AX, ret_lo+0(FP) + MOVL DX, ret_hi+4(FP) + RET + +TEXT runtime·getcontext(SB),NOSPLIT,$-4 + MOVL $307, AX // sys_getcontext + INT $0x80 + JAE 2(PC) + MOVL $0xf1, 0xf1 // crash + RET + +TEXT runtime·sigprocmask(SB),NOSPLIT,$-4 + MOVL $293, AX // sys_sigprocmask + INT $0x80 + JAE 2(PC) + MOVL $0xf1, 0xf1 // crash + RET + +TEXT runtime·sigreturn_tramp(SB),NOSPLIT,$0 + LEAL 140(SP), AX // Load address of ucontext + MOVL AX, 4(SP) + MOVL $308, AX // sys_setcontext + INT $0x80 + MOVL $-1, 4(SP) // Something failed... + MOVL $1, AX // sys_exit + INT $0x80 + +TEXT runtime·sigaction(SB),NOSPLIT,$24 + LEAL sig+0(FP), SI + LEAL 4(SP), DI + CLD + MOVSL // arg 1 - sig + MOVSL // arg 2 - act + MOVSL // arg 3 - oact + LEAL runtime·sigreturn_tramp(SB), AX + STOSL // arg 4 - tramp + MOVL $2, AX + STOSL // arg 5 - vers + MOVL $340, AX // sys___sigaction_sigtramp + INT $0x80 + JAE 2(PC) + MOVL $0xf1, 0xf1 // crash + RET + +TEXT runtime·sigtramp(SB),NOSPLIT,$44 + get_tls(CX) + + // check that g exists + MOVL g(CX), DI + CMPL DI, $0 + JNE 6(PC) + MOVL signo+0(FP), BX + MOVL BX, 0(SP) + MOVL $runtime·badsignal(SB), AX + CALL AX + RET + + // save g + MOVL DI, 20(SP) + + // g = m->gsignal + MOVL g_m(DI), BX + MOVL m_gsignal(BX), BX + MOVL BX, g(CX) + + // copy arguments for call to sighandler + MOVL signo+0(FP), BX + MOVL BX, 0(SP) + MOVL info+4(FP), BX + MOVL BX, 4(SP) + MOVL context+8(FP), BX + MOVL BX, 8(SP) + MOVL DI, 12(SP) + + CALL runtime·sighandler(SB) + + // restore g + get_tls(CX) + MOVL 20(SP), BX + MOVL BX, g(CX) + RET + +// int32 lwp_create(void *context, uintptr flags, void *lwpid); +TEXT runtime·lwp_create(SB),NOSPLIT,$16 + MOVL $0, 0(SP) + MOVL ctxt+0(FP), AX + MOVL AX, 4(SP) // arg 1 - context + MOVL flags+4(FP), AX + MOVL AX, 8(SP) // arg 2 - flags + MOVL lwpid+8(FP), AX + MOVL AX, 12(SP) // arg 3 - lwpid + MOVL $309, AX // sys__lwp_create + INT $0x80 + JCC 2(PC) + NEGL AX + MOVL AX, ret+12(FP) + RET + +TEXT runtime·lwp_tramp(SB),NOSPLIT,$0 + + // Set FS to point at m->tls + LEAL m_tls(BX), BP + PUSHAL // save registers + PUSHL BP + CALL runtime·settls(SB) + POPL AX + POPAL + + // Now segment is established. Initialize m, g. 
+ get_tls(AX) + MOVL DX, g(AX) + MOVL BX, g_m(DX) + + CALL runtime·stackcheck(SB) // smashes AX, CX + MOVL 0(DX), DX // paranoia; check they are not nil + MOVL 0(BX), BX + + // more paranoia; check that stack splitting code works + PUSHAL + CALL runtime·emptyfunc(SB) + POPAL + + // Call fn + CALL SI + + CALL runtime·exit1(SB) + MOVL $0x1234, 0x1005 + RET + +TEXT runtime·sigaltstack(SB),NOSPLIT,$-8 + MOVL $281, AX // sys___sigaltstack14 + MOVL new+4(SP), BX + MOVL old+8(SP), CX + INT $0x80 + CMPL AX, $0xfffff001 + JLS 2(PC) + INT $3 + RET + +TEXT runtime·setldt(SB),NOSPLIT,$8 + // Under NetBSD we set the GS base instead of messing with the LDT. + MOVL 16(SP), AX // tls0 + MOVL AX, 0(SP) + CALL runtime·settls(SB) + RET + +TEXT runtime·settls(SB),NOSPLIT,$16 + // adjust for ELF: wants to use -8(GS) and -4(GS) for g and m + MOVL base+0(FP), CX + ADDL $8, CX + MOVL $0, 0(SP) // syscall gap + MOVL CX, 4(SP) // arg 1 - ptr + MOVL $317, AX // sys__lwp_setprivate + INT $0x80 + JCC 2(PC) + MOVL $0xf1, 0xf1 // crash + RET + +TEXT runtime·osyield(SB),NOSPLIT,$-4 + MOVL $350, AX // sys_sched_yield + INT $0x80 + RET + +TEXT runtime·lwp_park(SB),NOSPLIT,$-4 + MOVL $434, AX // sys__lwp_park + INT $0x80 + MOVL AX, ret+16(FP) + RET + +TEXT runtime·lwp_unpark(SB),NOSPLIT,$-4 + MOVL $321, AX // sys__lwp_unpark + INT $0x80 + MOVL AX, ret+8(FP) + RET + +TEXT runtime·lwp_self(SB),NOSPLIT,$-4 + MOVL $311, AX // sys__lwp_self + INT $0x80 + MOVL AX, ret+0(FP) + RET + +TEXT runtime·sysctl(SB),NOSPLIT,$28 + LEAL mib+0(FP), SI + LEAL 4(SP), DI + CLD + MOVSL // arg 1 - name + MOVSL // arg 2 - namelen + MOVSL // arg 3 - oldp + MOVSL // arg 4 - oldlenp + MOVSL // arg 5 - newp + MOVSL // arg 6 - newlen + MOVL $202, AX // sys___sysctl + INT $0x80 + JCC 3(PC) + NEGL AX + RET + MOVL $0, AX + RET + +GLOBL runtime·tlsoffset(SB),NOPTR,$4 + +// int32 runtime·kqueue(void) +TEXT runtime·kqueue(SB),NOSPLIT,$0 + MOVL $344, AX + INT $0x80 + JAE 2(PC) + NEGL AX + MOVL AX, ret+0(FP) + RET + +// int32 runtime·kevent(int kq, Kevent *changelist, int nchanges, Kevent *eventlist, int nevents, Timespec *timeout) +TEXT runtime·kevent(SB),NOSPLIT,$0 + MOVL $435, AX + INT $0x80 + JAE 2(PC) + NEGL AX + MOVL AX, ret+24(FP) + RET + +// int32 runtime·closeonexec(int32 fd) +TEXT runtime·closeonexec(SB),NOSPLIT,$32 + MOVL $92, AX // fcntl + // 0(SP) is where the caller PC would be; kernel skips it + MOVL fd+0(FP), BX + MOVL BX, 4(SP) // fd + MOVL $2, 8(SP) // F_SETFD + MOVL $1, 12(SP) // FD_CLOEXEC + INT $0x80 + JAE 2(PC) + NEGL AX + RET diff --git a/src/runtime/sys_netbsd_amd64.s b/src/runtime/sys_netbsd_amd64.s new file mode 100644 index 000000000..eb9766d3f --- /dev/null +++ b/src/runtime/sys_netbsd_amd64.s @@ -0,0 +1,358 @@ +// Copyright 2009 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. +// +// System calls and other sys.stuff for AMD64, NetBSD +// /usr/src/sys/kern/syscalls.master for syscall numbers. +// + +#include "zasm_GOOS_GOARCH.h" +#include "textflag.h" + +// int32 lwp_create(void *context, uintptr flags, void *lwpid) +TEXT runtime·lwp_create(SB),NOSPLIT,$0 + MOVQ ctxt+0(FP), DI + MOVQ flags+8(FP), SI + MOVQ lwpid+16(FP), DX + MOVL $309, AX // sys__lwp_create + SYSCALL + JCC 2(PC) + NEGQ AX + MOVL AX, ret+24(FP) + RET + +TEXT runtime·lwp_tramp(SB),NOSPLIT,$0 + + // Set FS to point at m->tls. + LEAQ m_tls(R8), DI + CALL runtime·settls(SB) + + // Set up new stack. 
+ get_tls(CX) + MOVQ R8, g_m(R9) + MOVQ R9, g(CX) + CALL runtime·stackcheck(SB) + + // Call fn + CALL R12 + + // It shouldn't return. If it does, exit. + MOVL $310, AX // sys__lwp_exit + SYSCALL + JMP -3(PC) // keep exiting + +TEXT runtime·osyield(SB),NOSPLIT,$0 + MOVL $350, AX // sys_sched_yield + SYSCALL + RET + +TEXT runtime·lwp_park(SB),NOSPLIT,$0 + MOVQ abstime+0(FP), DI // arg 1 - abstime + MOVL unpark+8(FP), SI // arg 2 - unpark + MOVQ hint+16(FP), DX // arg 3 - hint + MOVQ unparkhint+24(FP), R10 // arg 4 - unparkhint + MOVL $434, AX // sys__lwp_park + SYSCALL + MOVL AX, ret+32(FP) + RET + +TEXT runtime·lwp_unpark(SB),NOSPLIT,$0 + MOVL lwp+0(FP), DI // arg 1 - lwp + MOVQ hint+8(FP), SI // arg 2 - hint + MOVL $321, AX // sys__lwp_unpark + SYSCALL + MOVL AX, ret+16(FP) + RET + +TEXT runtime·lwp_self(SB),NOSPLIT,$0 + MOVL $311, AX // sys__lwp_self + SYSCALL + MOVL AX, ret+0(FP) + RET + +// Exit the entire program (like C exit) +TEXT runtime·exit(SB),NOSPLIT,$-8 + MOVL code+0(FP), DI // arg 1 - exit status + MOVL $1, AX // sys_exit + SYSCALL + MOVL $0xf1, 0xf1 // crash + RET + +TEXT runtime·exit1(SB),NOSPLIT,$-8 + MOVL $310, AX // sys__lwp_exit + SYSCALL + MOVL $0xf1, 0xf1 // crash + RET + +TEXT runtime·open(SB),NOSPLIT,$-8 + MOVQ name+0(FP), DI // arg 1 pathname + MOVL mode+8(FP), SI // arg 2 flags + MOVL perm+12(FP), DX // arg 3 mode + MOVL $5, AX + SYSCALL + MOVL AX, ret+16(FP) + RET + +TEXT runtime·close(SB),NOSPLIT,$-8 + MOVL fd+0(FP), DI // arg 1 fd + MOVL $6, AX + SYSCALL + MOVL AX, ret+8(FP) + RET + +TEXT runtime·read(SB),NOSPLIT,$-8 + MOVL fd+0(FP), DI // arg 1 fd + MOVQ p+8(FP), SI // arg 2 buf + MOVL n+16(FP), DX // arg 3 count + MOVL $3, AX + SYSCALL + MOVL AX, ret+24(FP) + RET + +TEXT runtime·write(SB),NOSPLIT,$-8 + MOVQ fd+0(FP), DI // arg 1 - fd + MOVQ p+8(FP), SI // arg 2 - buf + MOVL n+16(FP), DX // arg 3 - nbyte + MOVL $4, AX // sys_write + SYSCALL + MOVL AX, ret+24(FP) + RET + +TEXT runtime·usleep(SB),NOSPLIT,$16 + MOVL $0, DX + MOVL usec+0(FP), AX + MOVL $1000000, CX + DIVL CX + MOVQ AX, 0(SP) // tv_sec + MOVL $1000, AX + MULL DX + MOVQ AX, 8(SP) // tv_nsec + + MOVQ SP, DI // arg 1 - rqtp + MOVQ $0, SI // arg 2 - rmtp + MOVL $430, AX // sys_nanosleep + SYSCALL + RET + +TEXT runtime·raise(SB),NOSPLIT,$16 + MOVL $311, AX // sys__lwp_self + SYSCALL + MOVQ AX, DI // arg 1 - target + MOVL sig+0(FP), SI // arg 2 - signo + MOVL $318, AX // sys__lwp_kill + SYSCALL + RET + +TEXT runtime·setitimer(SB),NOSPLIT,$-8 + MOVL mode+0(FP), DI // arg 1 - which + MOVQ new+8(FP), SI // arg 2 - itv + MOVQ old+16(FP), DX // arg 3 - oitv + MOVL $425, AX // sys_setitimer + SYSCALL + RET + +// func now() (sec int64, nsec int32) +TEXT time·now(SB), NOSPLIT, $32 + MOVQ $0, DI // arg 1 - clock_id + LEAQ 8(SP), SI // arg 2 - tp + MOVL $427, AX // sys_clock_gettime + SYSCALL + MOVQ 8(SP), AX // sec + MOVL 16(SP), DX // nsec + + // sec is in AX, nsec in DX + MOVQ AX, sec+0(FP) + MOVL DX, nsec+8(FP) + RET + +TEXT runtime·nanotime(SB),NOSPLIT,$32 + MOVQ $0, DI // arg 1 - clock_id + LEAQ 8(SP), SI // arg 2 - tp + MOVL $427, AX // sys_clock_gettime + SYSCALL + MOVQ 8(SP), AX // sec + MOVL 16(SP), DX // nsec + + // sec is in AX, nsec in DX + // return nsec in AX + IMULQ $1000000000, AX + ADDQ DX, AX + MOVQ AX, ret+0(FP) + RET + +TEXT runtime·getcontext(SB),NOSPLIT,$-8 + MOVQ ctxt+0(FP), DI // arg 1 - context + MOVL $307, AX // sys_getcontext + SYSCALL + JCC 2(PC) + MOVL $0xf1, 0xf1 // crash + RET + +TEXT runtime·sigprocmask(SB),NOSPLIT,$0 + MOVL mode+0(FP), DI // arg 1 - how + MOVQ new+8(FP), SI // arg 
2 - set + MOVQ old+16(FP), DX // arg 3 - oset + MOVL $293, AX // sys_sigprocmask + SYSCALL + JCC 2(PC) + MOVL $0xf1, 0xf1 // crash + RET + +TEXT runtime·sigreturn_tramp(SB),NOSPLIT,$-8 + MOVQ R15, DI // Load address of ucontext + MOVQ $308, AX // sys_setcontext + SYSCALL + MOVQ $-1, DI // Something failed... + MOVL $1, AX // sys_exit + SYSCALL + +TEXT runtime·sigaction(SB),NOSPLIT,$-8 + MOVL sig+0(FP), DI // arg 1 - signum + MOVQ new+8(FP), SI // arg 2 - nsa + MOVQ old+16(FP), DX // arg 3 - osa + // arg 4 - tramp + LEAQ runtime·sigreturn_tramp(SB), R10 + MOVQ $2, R8 // arg 5 - vers + MOVL $340, AX // sys___sigaction_sigtramp + SYSCALL + JCC 2(PC) + MOVL $0xf1, 0xf1 // crash + RET + +TEXT runtime·sigtramp(SB),NOSPLIT,$64 + get_tls(BX) + + // check that g exists + MOVQ g(BX), R10 + CMPQ R10, $0 + JNE 5(PC) + MOVQ DI, 0(SP) + MOVQ $runtime·badsignal(SB), AX + CALL AX + RET + + // save g + MOVQ R10, 40(SP) + + // g = m->signal + MOVQ g_m(R10), BP + MOVQ m_gsignal(BP), BP + MOVQ BP, g(BX) + + MOVQ DI, 0(SP) + MOVQ SI, 8(SP) + MOVQ DX, 16(SP) + MOVQ R10, 24(SP) + + CALL runtime·sighandler(SB) + + // restore g + get_tls(BX) + MOVQ 40(SP), R10 + MOVQ R10, g(BX) + RET + +TEXT runtime·mmap(SB),NOSPLIT,$0 + MOVQ addr+0(FP), DI // arg 1 - addr + MOVQ n+8(FP), SI // arg 2 - len + MOVL prot+16(FP), DX // arg 3 - prot + MOVL flags+20(FP), R10 // arg 4 - flags + MOVL fd+24(FP), R8 // arg 5 - fd + MOVL off+28(FP), R9 + SUBQ $16, SP + MOVQ R9, 8(SP) // arg 7 - offset (passed on stack) + MOVQ $0, R9 // arg 6 - pad + MOVL $197, AX // sys_mmap + SYSCALL + ADDQ $16, SP + MOVQ AX, ret+32(FP) + RET + +TEXT runtime·munmap(SB),NOSPLIT,$0 + MOVQ addr+0(FP), DI // arg 1 - addr + MOVQ n+8(FP), SI // arg 2 - len + MOVL $73, AX // sys_munmap + SYSCALL + JCC 2(PC) + MOVL $0xf1, 0xf1 // crash + RET + + +TEXT runtime·madvise(SB),NOSPLIT,$0 + MOVQ addr+0(FP), DI // arg 1 - addr + MOVQ n+8(FP), SI // arg 2 - len + MOVL flags+16(FP), DX // arg 3 - behav + MOVQ $75, AX // sys_madvise + SYSCALL + // ignore failure - maybe pages are locked + RET + +TEXT runtime·sigaltstack(SB),NOSPLIT,$-8 + MOVQ new+8(SP), DI // arg 1 - nss + MOVQ old+16(SP), SI // arg 2 - oss + MOVQ $281, AX // sys___sigaltstack14 + SYSCALL + JCC 2(PC) + MOVL $0xf1, 0xf1 // crash + RET + +// set tls base to DI +TEXT runtime·settls(SB),NOSPLIT,$8 + // adjust for ELF: wants to use -16(FS) and -8(FS) for g and m + ADDQ $16, DI // arg 1 - ptr + MOVQ $317, AX // sys__lwp_setprivate + SYSCALL + JCC 2(PC) + MOVL $0xf1, 0xf1 // crash + RET + +TEXT runtime·sysctl(SB),NOSPLIT,$0 + MOVQ mib+0(FP), DI // arg 1 - name + MOVL miblen+8(FP), SI // arg 2 - namelen + MOVQ out+16(FP), DX // arg 3 - oldp + MOVQ size+24(FP), R10 // arg 4 - oldlenp + MOVQ dst+32(FP), R8 // arg 5 - newp + MOVQ ndst+40(FP), R9 // arg 6 - newlen + MOVQ $202, AX // sys___sysctl + SYSCALL + JCC 4(PC) + NEGQ AX + MOVL AX, ret+48(FP) + RET + MOVL $0, AX + MOVL AX, ret+48(FP) + RET + +// int32 runtime·kqueue(void) +TEXT runtime·kqueue(SB),NOSPLIT,$0 + MOVQ $0, DI + MOVL $344, AX + SYSCALL + JCC 2(PC) + NEGQ AX + MOVL AX, ret+0(FP) + RET + +// int32 runtime·kevent(int kq, Kevent *changelist, int nchanges, Kevent *eventlist, int nevents, Timespec *timeout) +TEXT runtime·kevent(SB),NOSPLIT,$0 + MOVL fd+0(FP), DI + MOVQ ev1+8(FP), SI + MOVL nev1+16(FP), DX + MOVQ ev2+24(FP), R10 + MOVL nev2+32(FP), R8 + MOVQ ts+40(FP), R9 + MOVL $435, AX + SYSCALL + JCC 2(PC) + NEGQ AX + MOVL AX, ret+48(FP) + RET + +// void runtime·closeonexec(int32 fd) +TEXT runtime·closeonexec(SB),NOSPLIT,$0 + MOVL fd+0(FP), DI // fd + MOVQ 
$2, SI // F_SETFD + MOVQ $1, DX // FD_CLOEXEC + MOVL $92, AX // fcntl + SYSCALL + RET diff --git a/src/runtime/sys_netbsd_arm.s b/src/runtime/sys_netbsd_arm.s new file mode 100644 index 000000000..039a0832e --- /dev/null +++ b/src/runtime/sys_netbsd_arm.s @@ -0,0 +1,351 @@ +// Copyright 2013 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. +// +// System calls and other sys.stuff for ARM, NetBSD +// /usr/src/sys/kern/syscalls.master for syscall numbers. +// + +#include "zasm_GOOS_GOARCH.h" +#include "textflag.h" + +// Exit the entire program (like C exit) +TEXT runtime·exit(SB),NOSPLIT,$-4 + MOVW 0(FP), R0 // arg 1 exit status + SWI $0xa00001 + MOVW.CS $0, R8 // crash on syscall failure + MOVW.CS R8, (R8) + RET + +TEXT runtime·exit1(SB),NOSPLIT,$-4 + SWI $0xa00136 // sys__lwp_exit + MOVW $1, R8 // crash + MOVW R8, (R8) + RET + +TEXT runtime·open(SB),NOSPLIT,$-8 + MOVW 0(FP), R0 + MOVW 4(FP), R1 + MOVW 8(FP), R2 + SWI $0xa00005 + MOVW R0, ret+12(FP) + RET + +TEXT runtime·close(SB),NOSPLIT,$-8 + MOVW 0(FP), R0 + SWI $0xa00006 + MOVW R0, ret+4(FP) + RET + +TEXT runtime·read(SB),NOSPLIT,$-8 + MOVW 0(FP), R0 + MOVW 4(FP), R1 + MOVW 8(FP), R2 + SWI $0xa00003 + MOVW R0, ret+12(FP) + RET + +TEXT runtime·write(SB),NOSPLIT,$-4 + MOVW 0(FP), R0 // arg 1 - fd + MOVW 4(FP), R1 // arg 2 - buf + MOVW 8(FP), R2 // arg 3 - nbyte + SWI $0xa00004 // sys_write + MOVW R0, ret+12(FP) + RET + +// int32 lwp_create(void *context, uintptr flags, void *lwpid) +TEXT runtime·lwp_create(SB),NOSPLIT,$0 + MOVW ctxt+0(FP), R0 + MOVW flags+4(FP), R1 + MOVW lwpid+8(FP), R2 + SWI $0xa00135 // sys__lwp_create + MOVW R0, ret+12(FP) + RET + +TEXT runtime·osyield(SB),NOSPLIT,$0 + SWI $0xa0015e // sys_sched_yield + RET + +TEXT runtime·lwp_park(SB),NOSPLIT,$0 + MOVW 0(FP), R0 // arg 1 - abstime + MOVW 4(FP), R1 // arg 2 - unpark + MOVW 8(FP), R2 // arg 3 - hint + MOVW 12(FP), R3 // arg 4 - unparkhint + SWI $0xa001b2 // sys__lwp_park + MOVW R0, ret+16(FP) + RET + +TEXT runtime·lwp_unpark(SB),NOSPLIT,$0 + MOVW 0(FP), R0 // arg 1 - lwp + MOVW 4(FP), R1 // arg 2 - hint + SWI $0xa00141 // sys__lwp_unpark + MOVW R0, ret+8(FP) + RET + +TEXT runtime·lwp_self(SB),NOSPLIT,$0 + SWI $0xa00137 // sys__lwp_self + MOVW R0, ret+0(FP) + RET + +TEXT runtime·lwp_tramp(SB),NOSPLIT,$0 + MOVW R0, g_m(R1) + MOVW R1, g + + BL runtime·emptyfunc(SB) // fault if stack check is wrong + BL (R2) + MOVW $2, R8 // crash (not reached) + MOVW R8, (R8) + RET + +TEXT runtime·usleep(SB),NOSPLIT,$16 + MOVW usec+0(FP), R0 + MOVW R0, R2 + MOVW $1000000, R1 + DIV R1, R0 + // 0(R13) is the saved LR, don't use it + MOVW R0, 4(R13) // tv_sec.low + MOVW $0, R0 + MOVW R0, 8(R13) // tv_sec.high + MOD R1, R2 + MOVW $1000, R1 + MUL R1, R2 + MOVW R2, 12(R13) // tv_nsec + + MOVW $4(R13), R0 // arg 1 - rqtp + MOVW $0, R1 // arg 2 - rmtp + SWI $0xa001ae // sys_nanosleep + RET + +TEXT runtime·raise(SB),NOSPLIT,$16 + SWI $0xa00137 // sys__lwp_self, the returned R0 is arg 1 + MOVW sig+0(FP), R1 // arg 2 - signal + SWI $0xa0013e // sys__lwp_kill + RET + +TEXT runtime·setitimer(SB),NOSPLIT,$-4 + MOVW 0(FP), R0 // arg 1 - which + MOVW 4(FP), R1 // arg 2 - itv + MOVW 8(FP), R2 // arg 3 - oitv + SWI $0xa001a9 // sys_setitimer + RET + +// func now() (sec int64, nsec int32) +TEXT time·now(SB), NOSPLIT, $32 + MOVW $0, R0 // CLOCK_REALTIME + MOVW $8(R13), R1 + SWI $0xa001ab // clock_gettime + + MOVW 8(R13), R0 // sec.low + MOVW 12(R13), R1 // sec.high + MOVW 16(R13), R2 // nsec + + MOVW R0, 0(FP) + 
MOVW R1, 4(FP) + MOVW R2, 8(FP) + RET + +// int64 nanotime(void) so really +// void nanotime(int64 *nsec) +TEXT runtime·nanotime(SB), NOSPLIT, $32 + MOVW $0, R0 // CLOCK_REALTIME + MOVW $8(R13), R1 + SWI $0xa001ab // clock_gettime + + MOVW 8(R13), R0 // sec.low + MOVW 12(R13), R4 // sec.high + MOVW 16(R13), R2 // nsec + + MOVW $1000000000, R3 + MULLU R0, R3, (R1, R0) + MUL R3, R4 + ADD.S R2, R0 + ADC R4, R1 + + MOVW R0, ret_lo+0(FP) + MOVW R1, ret_hi+4(FP) + RET + +TEXT runtime·getcontext(SB),NOSPLIT,$-4 + MOVW 0(FP), R0 // arg 1 - context + SWI $0xa00133 // sys_getcontext + MOVW.CS $0, R8 // crash on syscall failure + MOVW.CS R8, (R8) + RET + +TEXT runtime·sigprocmask(SB),NOSPLIT,$0 + MOVW 0(FP), R0 // arg 1 - how + MOVW 4(FP), R1 // arg 2 - set + MOVW 8(FP), R2 // arg 3 - oset + SWI $0xa00125 // sys_sigprocmask + MOVW.CS $0, R8 // crash on syscall failure + MOVW.CS R8, (R8) + RET + +TEXT runtime·sigreturn_tramp(SB),NOSPLIT,$-4 + // on entry, SP points to siginfo, we add sizeof(ucontext) + // to SP to get a pointer to ucontext. + ADD $0x80, R13, R0 // 0x80 == sizeof(UcontextT) + SWI $0xa00134 // sys_setcontext + // something failed, we have to exit + MOVW $0x4242, R0 // magic return number + SWI $0xa00001 // sys_exit + B -2(PC) // continue exit + +TEXT runtime·sigaction(SB),NOSPLIT,$4 + MOVW 0(FP), R0 // arg 1 - signum + MOVW 4(FP), R1 // arg 2 - nsa + MOVW 8(FP), R2 // arg 3 - osa + MOVW $runtime·sigreturn_tramp(SB), R3 // arg 4 - tramp + MOVW $2, R4 // arg 5 - vers + MOVW R4, 4(R13) + ADD $4, R13 // pass arg 5 on stack + SWI $0xa00154 // sys___sigaction_sigtramp + SUB $4, R13 + MOVW.CS $3, R8 // crash on syscall failure + MOVW.CS R8, (R8) + RET + +TEXT runtime·sigtramp(SB),NOSPLIT,$24 + // this might be called in external code context, + // where g is not set. 
+ // first save R0, because runtime·load_g will clobber it + MOVW R0, 4(R13) // signum + MOVB runtime·iscgo(SB), R0 + CMP $0, R0 + BL.NE runtime·load_g(SB) + + CMP $0, g + BNE 4(PC) + // signal number is already prepared in 4(R13) + MOVW $runtime·badsignal(SB), R11 + BL (R11) + RET + + // save g + MOVW g, R4 + MOVW g, 20(R13) + + // g = m->signal + MOVW g_m(g), R8 + MOVW m_gsignal(R8), g + + // R0 is already saved + MOVW R1, 8(R13) // info + MOVW R2, 12(R13) // context + MOVW R4, 16(R13) // gp + + BL runtime·sighandler(SB) + + // restore g + MOVW 20(R13), g + RET + +TEXT runtime·mmap(SB),NOSPLIT,$12 + MOVW 0(FP), R0 // arg 1 - addr + MOVW 4(FP), R1 // arg 2 - len + MOVW 8(FP), R2 // arg 3 - prot + MOVW 12(FP), R3 // arg 4 - flags + // arg 5 (fid) and arg6 (offset_lo, offset_hi) are passed on stack + // note the C runtime only passes the 32-bit offset_lo to us + MOVW 16(FP), R4 // arg 5 + MOVW R4, 4(R13) + MOVW 20(FP), R5 // arg 6 lower 32-bit + MOVW R5, 8(R13) + MOVW $0, R6 // higher 32-bit for arg 6 + MOVW R6, 12(R13) + ADD $4, R13 // pass arg 5 and arg 6 on stack + SWI $0xa000c5 // sys_mmap + SUB $4, R13 + MOVW R0, ret+24(FP) + RET + +TEXT runtime·munmap(SB),NOSPLIT,$0 + MOVW 0(FP), R0 // arg 1 - addr + MOVW 4(FP), R1 // arg 2 - len + SWI $0xa00049 // sys_munmap + MOVW.CS $0, R8 // crash on syscall failure + MOVW.CS R8, (R8) + RET + +TEXT runtime·madvise(SB),NOSPLIT,$0 + MOVW 0(FP), R0 // arg 1 - addr + MOVW 4(FP), R1 // arg 2 - len + MOVW 8(FP), R2 // arg 3 - behav + SWI $0xa0004b // sys_madvise + // ignore failure - maybe pages are locked + RET + +TEXT runtime·sigaltstack(SB),NOSPLIT,$-4 + MOVW 0(FP), R0 // arg 1 - nss + MOVW 4(FP), R1 // arg 2 - oss + SWI $0xa00119 // sys___sigaltstack14 + MOVW.CS $0, R8 // crash on syscall failure + MOVW.CS R8, (R8) + RET + +TEXT runtime·sysctl(SB),NOSPLIT,$8 + MOVW 0(FP), R0 // arg 1 - name + MOVW 4(FP), R1 // arg 2 - namelen + MOVW 8(FP), R2 // arg 3 - oldp + MOVW 12(FP), R3 // arg 4 - oldlenp + MOVW 16(FP), R4 // arg 5 - newp + MOVW R4, 4(R13) + MOVW 20(FP), R4 // arg 6 - newlen + MOVW R4, 8(R13) + ADD $4, R13 // pass arg 5 and 6 on stack + SWI $0xa000ca // sys___sysctl + SUB $4, R13 + MOVW R0, ret+24(FP) + RET + +// int32 runtime·kqueue(void) +TEXT runtime·kqueue(SB),NOSPLIT,$0 + SWI $0xa00158 // sys_kqueue + RSB.CS $0, R0 + MOVW R0, ret+0(FP) + RET + +// int32 runtime·kevent(int kq, Kevent *changelist, int nchanges, Kevent *eventlist, int nevents, Timespec *timeout) +TEXT runtime·kevent(SB),NOSPLIT,$8 + MOVW 0(FP), R0 // kq + MOVW 4(FP), R1 // changelist + MOVW 8(FP), R2 // nchanges + MOVW 12(FP), R3 // eventlist + MOVW 16(FP), R4 // nevents + MOVW R4, 4(R13) + MOVW 20(FP), R4 // timeout + MOVW R4, 8(R13) + ADD $4, R13 // pass arg 5 and 6 on stack + SWI $0xa001b3 // sys___kevent50 + RSB.CS $0, R0 + SUB $4, R13 + MOVW R0, ret+24(FP) + RET + +// void runtime·closeonexec(int32 fd) +TEXT runtime·closeonexec(SB),NOSPLIT,$0 + MOVW 0(FP), R0 // fd + MOVW $2, R1 // F_SETFD + MOVW $1, R2 // FD_CLOEXEC + SWI $0xa0005c // sys_fcntl + RET + +TEXT runtime·casp(SB),NOSPLIT,$0 + B runtime·cas(SB) + +// TODO(minux): this is only valid for ARMv6+ +// bool armcas(int32 *val, int32 old, int32 new) +// Atomically: +// if(*val == old){ +// *val = new; +// return 1; +// }else +// return 0; +TEXT runtime·cas(SB),NOSPLIT,$0 + B runtime·armcas(SB) + +TEXT runtime·read_tls_fallback(SB),NOSPLIT,$-4 + MOVM.WP [R1, R2, R3, R12], (R13) + SWI $0x00a0013c // _lwp_getprivate + MOVM.IAW (R13), [R1, R2, R3, R12] + RET diff --git a/src/runtime/sys_openbsd_386.s 
b/src/runtime/sys_openbsd_386.s new file mode 100644 index 000000000..5cda7768a --- /dev/null +++ b/src/runtime/sys_openbsd_386.s @@ -0,0 +1,398 @@ +// Copyright 2009 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. +// +// System calls and other sys.stuff for 386, OpenBSD +// /usr/src/sys/kern/syscalls.master for syscall numbers. +// + +#include "zasm_GOOS_GOARCH.h" +#include "textflag.h" + +#define CLOCK_MONOTONIC $3 + +// Exit the entire program (like C exit) +TEXT runtime·exit(SB),NOSPLIT,$-4 + MOVL $1, AX + INT $0x80 + MOVL $0xf1, 0xf1 // crash + RET + +TEXT runtime·exit1(SB),NOSPLIT,$8 + MOVL $0, 0(SP) + MOVL $0, 4(SP) // arg 1 - notdead + MOVL $302, AX // sys___threxit + INT $0x80 + JAE 2(PC) + MOVL $0xf1, 0xf1 // crash + RET + +TEXT runtime·open(SB),NOSPLIT,$-4 + MOVL $5, AX + INT $0x80 + MOVL AX, ret+12(FP) + RET + +TEXT runtime·close(SB),NOSPLIT,$-4 + MOVL $6, AX + INT $0x80 + MOVL AX, ret+4(FP) + RET + +TEXT runtime·read(SB),NOSPLIT,$-4 + MOVL $3, AX + INT $0x80 + MOVL AX, ret+12(FP) + RET + +TEXT runtime·write(SB),NOSPLIT,$-4 + MOVL $4, AX // sys_write + INT $0x80 + MOVL AX, ret+12(FP) + RET + +TEXT runtime·usleep(SB),NOSPLIT,$24 + MOVL $0, DX + MOVL usec+0(FP), AX + MOVL $1000000, CX + DIVL CX + MOVL AX, 12(SP) // tv_sec - l32 + MOVL $0, 16(SP) // tv_sec - h32 + MOVL $1000, AX + MULL DX + MOVL AX, 20(SP) // tv_nsec + + MOVL $0, 0(SP) + LEAL 12(SP), AX + MOVL AX, 4(SP) // arg 1 - rqtp + MOVL $0, 8(SP) // arg 2 - rmtp + MOVL $91, AX // sys_nanosleep + INT $0x80 + RET + +TEXT runtime·raise(SB),NOSPLIT,$12 + MOVL $299, AX // sys_getthrid + INT $0x80 + MOVL $0, 0(SP) + MOVL AX, 4(SP) // arg 1 - pid + MOVL sig+0(FP), AX + MOVL AX, 8(SP) // arg 2 - signum + MOVL $37, AX // sys_kill + INT $0x80 + RET + +TEXT runtime·mmap(SB),NOSPLIT,$36 + LEAL addr+0(FP), SI + LEAL 4(SP), DI + CLD + MOVSL // arg 1 - addr + MOVSL // arg 2 - len + MOVSL // arg 3 - prot + MOVSL // arg 4 - flags + MOVSL // arg 5 - fd + MOVL $0, AX + STOSL // arg 6 - pad + MOVSL // arg 7 - offset + MOVL $0, AX // top 32 bits of file offset + STOSL + MOVL $197, AX // sys_mmap + INT $0x80 + MOVL AX, ret+24(FP) + RET + +TEXT runtime·munmap(SB),NOSPLIT,$-4 + MOVL $73, AX // sys_munmap + INT $0x80 + JAE 2(PC) + MOVL $0xf1, 0xf1 // crash + RET + +TEXT runtime·madvise(SB),NOSPLIT,$-4 + MOVL $75, AX // sys_madvise + INT $0x80 + JAE 2(PC) + MOVL $0xf1, 0xf1 // crash + RET + +TEXT runtime·setitimer(SB),NOSPLIT,$-4 + MOVL $69, AX + INT $0x80 + RET + +// func now() (sec int64, nsec int32) +TEXT time·now(SB), NOSPLIT, $32 + LEAL 12(SP), BX + MOVL $0, 4(SP) // arg 1 - clock_id + MOVL BX, 8(SP) // arg 2 - tp + MOVL $87, AX // sys_clock_gettime + INT $0x80 + + MOVL 12(SP), AX // sec - l32 + MOVL AX, sec+0(FP) + MOVL 16(SP), AX // sec - h32 + MOVL AX, sec+4(FP) + + MOVL 20(SP), BX // nsec + MOVL BX, nsec+8(FP) + RET + +// int64 nanotime(void) so really +// void nanotime(int64 *nsec) +TEXT runtime·nanotime(SB),NOSPLIT,$32 + LEAL 12(SP), BX + MOVL CLOCK_MONOTONIC, 4(SP) // arg 1 - clock_id + MOVL BX, 8(SP) // arg 2 - tp + MOVL $87, AX // sys_clock_gettime + INT $0x80 + + MOVL 16(SP), CX // sec - h32 + IMULL $1000000000, CX + + MOVL 12(SP), AX // sec - l32 + MOVL $1000000000, BX + MULL BX // result in dx:ax + + MOVL 20(SP), BX // nsec + ADDL BX, AX + ADCL CX, DX // add high bits with carry + + MOVL AX, ret_lo+0(FP) + MOVL DX, ret_hi+4(FP) + RET + +TEXT runtime·sigaction(SB),NOSPLIT,$-4 + MOVL $46, AX // sys_sigaction + INT $0x80 + JAE 2(PC) + MOVL 
$0xf1, 0xf1 // crash + RET + +TEXT runtime·sigprocmask(SB),NOSPLIT,$-4 + MOVL $48, AX // sys_sigprocmask + INT $0x80 + JAE 2(PC) + MOVL $0xf1, 0xf1 // crash + MOVL AX, ret+8(FP) + RET + +TEXT runtime·sigtramp(SB),NOSPLIT,$44 + get_tls(CX) + + // check that g exists + MOVL g(CX), DI + CMPL DI, $0 + JNE 6(PC) + MOVL signo+0(FP), BX + MOVL BX, 0(SP) + MOVL $runtime·badsignal(SB), AX + CALL AX + JMP sigtramp_ret + + // save g + MOVL DI, 20(SP) + + // g = m->gsignal + MOVL g_m(DI), BX + MOVL m_gsignal(BX), BX + MOVL BX, g(CX) + + // copy arguments for call to sighandler + MOVL signo+0(FP), BX + MOVL BX, 0(SP) + MOVL info+4(FP), BX + MOVL BX, 4(SP) + MOVL context+8(FP), BX + MOVL BX, 8(SP) + MOVL DI, 12(SP) + + CALL runtime·sighandler(SB) + + // restore g + get_tls(CX) + MOVL 20(SP), BX + MOVL BX, g(CX) + +sigtramp_ret: + // call sigreturn + MOVL context+8(FP), AX + MOVL $0, 0(SP) // syscall gap + MOVL AX, 4(SP) // arg 1 - sigcontext + MOVL $103, AX // sys_sigreturn + INT $0x80 + MOVL $0xf1, 0xf1 // crash + RET + +// int32 tfork(void *param, uintptr psize, M *mp, G *gp, void (*fn)(void)); +TEXT runtime·tfork(SB),NOSPLIT,$12 + + // Copy mp, gp and fn from the parent stack onto the child stack. + MOVL param+0(FP), AX + MOVL 8(AX), CX // tf_stack + SUBL $16, CX + MOVL CX, 8(AX) + MOVL mm+8(FP), SI + MOVL SI, 0(CX) + MOVL gg+12(FP), SI + MOVL SI, 4(CX) + MOVL fn+16(FP), SI + MOVL SI, 8(CX) + MOVL $1234, 12(CX) + + MOVL $0, 0(SP) // syscall gap + MOVL param+0(FP), AX + MOVL AX, 4(SP) // arg 1 - param + MOVL psize+4(FP), AX + MOVL AX, 8(SP) // arg 2 - psize + MOVL $8, AX // sys___tfork + INT $0x80 + + // Return if tfork syscall failed. + JCC 4(PC) + NEGL AX + MOVL AX, ret+20(FP) + RET + + // In parent, return. + CMPL AX, $0 + JEQ 3(PC) + MOVL AX, ret+20(FP) + RET + + // Paranoia: check that SP is as we expect. + MOVL 12(SP), BP + CMPL BP, $1234 + JEQ 2(PC) + INT $3 + + // Reload registers. + MOVL 0(SP), BX // m + MOVL 4(SP), DX // g + MOVL 8(SP), SI // fn + + // Set FS to point at m->tls. + LEAL m_tls(BX), BP + PUSHAL // save registers + PUSHL BP + CALL runtime·settls(SB) + POPL AX + POPAL + + // Now segment is established. Initialize m, g. + get_tls(AX) + MOVL DX, g(AX) + MOVL BX, g_m(DX) + + CALL runtime·stackcheck(SB) // smashes AX, CX + MOVL 0(DX), DX // paranoia; check they are not nil + MOVL 0(BX), BX + + // More paranoia; check that stack splitting code works. + PUSHAL + CALL runtime·emptyfunc(SB) + POPAL + + // Call fn. + CALL SI + + CALL runtime·exit1(SB) + MOVL $0x1234, 0x1005 + RET + +TEXT runtime·sigaltstack(SB),NOSPLIT,$-8 + MOVL $288, AX // sys_sigaltstack + MOVL new+4(SP), BX + MOVL old+8(SP), CX + INT $0x80 + CMPL AX, $0xfffff001 + JLS 2(PC) + INT $3 + RET + +TEXT runtime·setldt(SB),NOSPLIT,$4 + // Under OpenBSD we set the GS base instead of messing with the LDT. 
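+	// setldt just forwards its TLS pointer to settls below, which issues the
+	// __set_tcb system call so the segment base points at this thread's TLS block.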
+ MOVL tls0+4(FP), AX + MOVL AX, 0(SP) + CALL runtime·settls(SB) + RET + +TEXT runtime·settls(SB),NOSPLIT,$8 + // adjust for ELF: wants to use -8(GS) and -4(GS) for g and m + MOVL tlsbase+0(FP), CX + ADDL $8, CX + MOVL $0, 0(SP) // syscall gap + MOVL CX, 4(SP) // arg 1 - tcb + MOVL $329, AX // sys___set_tcb + INT $0x80 + JCC 2(PC) + MOVL $0xf1, 0xf1 // crash + RET + +TEXT runtime·osyield(SB),NOSPLIT,$-4 + MOVL $298, AX // sys_sched_yield + INT $0x80 + RET + +TEXT runtime·thrsleep(SB),NOSPLIT,$-4 + MOVL $94, AX // sys___thrsleep + INT $0x80 + MOVL AX, ret+20(FP) + RET + +TEXT runtime·thrwakeup(SB),NOSPLIT,$-4 + MOVL $301, AX // sys___thrwakeup + INT $0x80 + MOVL AX, ret+8(FP) + RET + +TEXT runtime·sysctl(SB),NOSPLIT,$28 + LEAL mib+0(FP), SI + LEAL 4(SP), DI + CLD + MOVSL // arg 1 - name + MOVSL // arg 2 - namelen + MOVSL // arg 3 - oldp + MOVSL // arg 4 - oldlenp + MOVSL // arg 5 - newp + MOVSL // arg 6 - newlen + MOVL $202, AX // sys___sysctl + INT $0x80 + JCC 4(PC) + NEGL AX + MOVL AX, ret+24(FP) + RET + MOVL $0, AX + MOVL AX, ret+24(FP) + RET + +// int32 runtime·kqueue(void); +TEXT runtime·kqueue(SB),NOSPLIT,$0 + MOVL $269, AX + INT $0x80 + JAE 2(PC) + NEGL AX + MOVL AX, ret+0(FP) + RET + +// int32 runtime·kevent(int kq, Kevent *changelist, int nchanges, Kevent *eventlist, int nevents, Timespec *timeout); +TEXT runtime·kevent(SB),NOSPLIT,$0 + MOVL $72, AX // sys_kevent + INT $0x80 + JAE 2(PC) + NEGL AX + MOVL AX, ret+24(FP) + RET + +// int32 runtime·closeonexec(int32 fd); +TEXT runtime·closeonexec(SB),NOSPLIT,$32 + MOVL $92, AX // sys_fcntl + // 0(SP) is where the caller PC would be; kernel skips it + MOVL fd+0(FP), BX + MOVL BX, 4(SP) // fd + MOVL $2, 8(SP) // F_SETFD + MOVL $1, 12(SP) // FD_CLOEXEC + INT $0x80 + JAE 2(PC) + NEGL AX + RET + +GLOBL runtime·tlsoffset(SB),NOPTR,$4 diff --git a/src/runtime/sys_openbsd_amd64.s b/src/runtime/sys_openbsd_amd64.s new file mode 100644 index 000000000..4e9db2390 --- /dev/null +++ b/src/runtime/sys_openbsd_amd64.s @@ -0,0 +1,350 @@ +// Copyright 2009 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. +// +// System calls and other sys.stuff for AMD64, OpenBSD +// /usr/src/sys/kern/syscalls.master for syscall numbers. +// + +#include "zasm_GOOS_GOARCH.h" +#include "textflag.h" + +#define CLOCK_MONOTONIC $3 + +// int32 tfork(void *param, uintptr psize, M *mp, G *gp, void (*fn)(void)); +TEXT runtime·tfork(SB),NOSPLIT,$32 + + // Copy mp, gp and fn off parent stack for use by child. + MOVQ mm+16(FP), R8 + MOVQ gg+24(FP), R9 + MOVQ fn+32(FP), R12 + + MOVQ param+0(FP), DI + MOVQ psize+8(FP), SI + MOVL $8, AX // sys___tfork + SYSCALL + + // Return if tfork syscall failed. + JCC 4(PC) + NEGQ AX + MOVL AX, ret+40(FP) + RET + + // In parent, return. + CMPL AX, $0 + JEQ 3(PC) + MOVL AX, ret+40(FP) + RET + + // Set FS to point at m->tls. + LEAQ m_tls(R8), DI + CALL runtime·settls(SB) + + // In child, set up new stack. + get_tls(CX) + MOVQ R8, g_m(R9) + MOVQ R9, g(CX) + CALL runtime·stackcheck(SB) + + // Call fn + CALL R12 + + // It shouldn't return. 
If it does, exit + MOVQ $0, DI // arg 1 - notdead + MOVL $302, AX // sys___threxit + SYSCALL + JMP -3(PC) // keep exiting + +TEXT runtime·osyield(SB),NOSPLIT,$0 + MOVL $298, AX // sys_sched_yield + SYSCALL + RET + +TEXT runtime·thrsleep(SB),NOSPLIT,$0 + MOVQ ident+0(FP), DI // arg 1 - ident + MOVL clock_id+8(FP), SI // arg 2 - clock_id + MOVQ tsp+16(FP), DX // arg 3 - tp + MOVQ lock+24(FP), R10 // arg 4 - lock + MOVQ abort+32(FP), R8 // arg 5 - abort + MOVL $94, AX // sys___thrsleep + SYSCALL + MOVL AX, ret+40(FP) + RET + +TEXT runtime·thrwakeup(SB),NOSPLIT,$0 + MOVQ ident+0(FP), DI // arg 1 - ident + MOVL n+8(FP), SI // arg 2 - n + MOVL $301, AX // sys___thrwakeup + SYSCALL + MOVL AX, ret+16(FP) + RET + +// Exit the entire program (like C exit) +TEXT runtime·exit(SB),NOSPLIT,$-8 + MOVL code+0(FP), DI // arg 1 - exit status + MOVL $1, AX // sys_exit + SYSCALL + MOVL $0xf1, 0xf1 // crash + RET + +TEXT runtime·exit1(SB),NOSPLIT,$-8 + MOVQ $0, DI // arg 1 - notdead + MOVL $302, AX // sys___threxit + SYSCALL + MOVL $0xf1, 0xf1 // crash + RET + +TEXT runtime·open(SB),NOSPLIT,$-8 + MOVQ name+0(FP), DI // arg 1 pathname + MOVL mode+8(FP), SI // arg 2 flags + MOVL perm+12(FP), DX // arg 3 mode + MOVL $5, AX + SYSCALL + MOVL AX, ret+16(FP) + RET + +TEXT runtime·close(SB),NOSPLIT,$-8 + MOVL fd+0(FP), DI // arg 1 fd + MOVL $6, AX + SYSCALL + MOVL AX, ret+8(FP) + RET + +TEXT runtime·read(SB),NOSPLIT,$-8 + MOVL fd+0(FP), DI // arg 1 fd + MOVQ p+8(FP), SI // arg 2 buf + MOVL n+16(FP), DX // arg 3 count + MOVL $3, AX + SYSCALL + MOVL AX, ret+24(FP) + RET + +TEXT runtime·write(SB),NOSPLIT,$-8 + MOVQ fd+0(FP), DI // arg 1 - fd + MOVQ p+8(FP), SI // arg 2 - buf + MOVL n+16(FP), DX // arg 3 - nbyte + MOVL $4, AX // sys_write + SYSCALL + MOVL AX, ret+24(FP) + RET + +TEXT runtime·usleep(SB),NOSPLIT,$16 + MOVL $0, DX + MOVL usec+0(FP), AX + MOVL $1000000, CX + DIVL CX + MOVQ AX, 0(SP) // tv_sec + MOVL $1000, AX + MULL DX + MOVQ AX, 8(SP) // tv_nsec + + MOVQ SP, DI // arg 1 - rqtp + MOVQ $0, SI // arg 2 - rmtp + MOVL $91, AX // sys_nanosleep + SYSCALL + RET + +TEXT runtime·raise(SB),NOSPLIT,$16 + MOVL $299, AX // sys_getthrid + SYSCALL + MOVQ AX, DI // arg 1 - pid + MOVL sig+0(FP), SI // arg 2 - signum + MOVL $37, AX // sys_kill + SYSCALL + RET + +TEXT runtime·setitimer(SB),NOSPLIT,$-8 + MOVL mode+0(FP), DI // arg 1 - which + MOVQ new+8(FP), SI // arg 2 - itv + MOVQ old+16(FP), DX // arg 3 - oitv + MOVL $69, AX // sys_setitimer + SYSCALL + RET + +// func now() (sec int64, nsec int32) +TEXT time·now(SB), NOSPLIT, $32 + MOVQ $0, DI // arg 1 - clock_id + LEAQ 8(SP), SI // arg 2 - tp + MOVL $87, AX // sys_clock_gettime + SYSCALL + MOVQ 8(SP), AX // sec + MOVQ 16(SP), DX // nsec + + // sec is in AX, nsec in DX + MOVQ AX, sec+0(FP) + MOVL DX, nsec+8(FP) + RET + +TEXT runtime·nanotime(SB),NOSPLIT,$24 + MOVQ CLOCK_MONOTONIC, DI // arg 1 - clock_id + LEAQ 8(SP), SI // arg 2 - tp + MOVL $87, AX // sys_clock_gettime + SYSCALL + MOVQ 8(SP), AX // sec + MOVQ 16(SP), DX // nsec + + // sec is in AX, nsec in DX + // return nsec in AX + IMULQ $1000000000, AX + ADDQ DX, AX + MOVQ AX, ret+0(FP) + RET + +TEXT runtime·sigaction(SB),NOSPLIT,$-8 + MOVL sig+0(FP), DI // arg 1 - signum + MOVQ new+8(FP), SI // arg 2 - nsa + MOVQ old+16(FP), DX // arg 3 - osa + MOVL $46, AX + SYSCALL + JCC 2(PC) + MOVL $0xf1, 0xf1 // crash + RET + +TEXT runtime·sigprocmask(SB),NOSPLIT,$0 + MOVL mode+0(FP), DI // arg 1 - how + MOVL new+4(FP), SI // arg 2 - set + MOVL $48, AX // sys_sigprocmask + SYSCALL + JCC 2(PC) + MOVL $0xf1, 0xf1 // crash + MOVL AX, 
ret+8(FP) + RET + +TEXT runtime·sigtramp(SB),NOSPLIT,$64 + get_tls(BX) + + // check that g exists + MOVQ g(BX), R10 + CMPQ R10, $0 + JNE 5(PC) + MOVQ DI, 0(SP) + MOVQ $runtime·badsignal(SB), AX + CALL AX + RET + + // save g + MOVQ R10, 40(SP) + + // g = m->signal + MOVQ g_m(R10), BP + MOVQ m_gsignal(BP), BP + MOVQ BP, g(BX) + + MOVQ DI, 0(SP) + MOVQ SI, 8(SP) + MOVQ DX, 16(SP) + MOVQ R10, 24(SP) + + CALL runtime·sighandler(SB) + + // restore g + get_tls(BX) + MOVQ 40(SP), R10 + MOVQ R10, g(BX) + RET + +TEXT runtime·mmap(SB),NOSPLIT,$0 + MOVQ addr+0(FP), DI // arg 1 - addr + MOVQ n+8(FP), SI // arg 2 - len + MOVL prot+16(FP), DX // arg 3 - prot + MOVL flags+20(FP), R10 // arg 4 - flags + MOVL fd+24(FP), R8 // arg 5 - fd + MOVL off+28(FP), R9 + SUBQ $16, SP + MOVQ R9, 8(SP) // arg 7 - offset (passed on stack) + MOVQ $0, R9 // arg 6 - pad + MOVL $197, AX + SYSCALL + ADDQ $16, SP + MOVQ AX, ret+32(FP) + RET + +TEXT runtime·munmap(SB),NOSPLIT,$0 + MOVQ addr+0(FP), DI // arg 1 - addr + MOVQ n+8(FP), SI // arg 2 - len + MOVL $73, AX // sys_munmap + SYSCALL + JCC 2(PC) + MOVL $0xf1, 0xf1 // crash + RET + +TEXT runtime·madvise(SB),NOSPLIT,$0 + MOVQ addr+0(FP), DI // arg 1 - addr + MOVQ n+8(FP), SI // arg 2 - len + MOVL flags+16(FP), DX // arg 3 - behav + MOVQ $75, AX // sys_madvise + SYSCALL + // ignore failure - maybe pages are locked + RET + +TEXT runtime·sigaltstack(SB),NOSPLIT,$-8 + MOVQ new+8(SP), DI // arg 1 - nss + MOVQ old+16(SP), SI // arg 2 - oss + MOVQ $288, AX // sys_sigaltstack + SYSCALL + JCC 2(PC) + MOVL $0xf1, 0xf1 // crash + RET + +// set tls base to DI +TEXT runtime·settls(SB),NOSPLIT,$0 + // adjust for ELF: wants to use -16(FS) and -8(FS) for g and m + ADDQ $16, DI + MOVQ $329, AX // sys___settcb + SYSCALL + JCC 2(PC) + MOVL $0xf1, 0xf1 // crash + RET + +TEXT runtime·sysctl(SB),NOSPLIT,$0 + MOVQ mib+0(FP), DI // arg 1 - name + MOVL miblen+8(FP), SI // arg 2 - namelen + MOVQ out+16(FP), DX // arg 3 - oldp + MOVQ size+24(FP), R10 // arg 4 - oldlenp + MOVQ dst+32(FP), R8 // arg 5 - newp + MOVQ ndst+40(FP), R9 // arg 6 - newlen + MOVQ $202, AX // sys___sysctl + SYSCALL + JCC 4(PC) + NEGQ AX + MOVL AX, ret+48(FP) + RET + MOVL $0, AX + MOVL AX, ret+48(FP) + RET + +// int32 runtime·kqueue(void); +TEXT runtime·kqueue(SB),NOSPLIT,$0 + MOVQ $0, DI + MOVQ $0, SI + MOVQ $0, DX + MOVL $269, AX + SYSCALL + JCC 2(PC) + NEGQ AX + MOVL AX, ret+0(FP) + RET + +// int32 runtime·kevent(int kq, Kevent *changelist, int nchanges, Kevent *eventlist, int nevents, Timespec *timeout); +TEXT runtime·kevent(SB),NOSPLIT,$0 + MOVL fd+0(FP), DI + MOVQ ev1+8(FP), SI + MOVL nev1+16(FP), DX + MOVQ ev2+24(FP), R10 + MOVL nev2+32(FP), R8 + MOVQ ts+40(FP), R9 + MOVL $72, AX + SYSCALL + JCC 2(PC) + NEGQ AX + MOVL AX, ret+48(FP) + RET + +// void runtime·closeonexec(int32 fd); +TEXT runtime·closeonexec(SB),NOSPLIT,$0 + MOVL fd+0(FP), DI // fd + MOVQ $2, SI // F_SETFD + MOVQ $1, DX // FD_CLOEXEC + MOVL $92, AX // fcntl + SYSCALL + RET diff --git a/src/runtime/sys_plan9_386.s b/src/runtime/sys_plan9_386.s new file mode 100644 index 000000000..a41b56258 --- /dev/null +++ b/src/runtime/sys_plan9_386.s @@ -0,0 +1,249 @@ +// Copyright 2010 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. 
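The Plan 9 files starting here, and the Solaris and Windows files further below, all implement time·now by taking one nanosecond count and splitting it into the (sec int64, nsec int32) pair that package time expects. A minimal Go sketch of that arithmetic, assuming the underlying counter (the nsec call on Plan 9, unixnano on Windows, CLOCK_REALTIME on Solaris) reports nanoseconds since the Unix epoch; split is a hypothetical helper name, not part of the runtime:

	// split mirrors the arithmetic done by the time·now assembly below.
	// ns is assumed to be nanoseconds since the Unix epoch.
	func split(ns int64) (sec int64, nsec int32) {
		sec = ns / 1000000000
		nsec = int32(ns - sec*1000000000)
		return
	}

The 386 versions do the divide with DIVL; the amd64 versions use the compiler-generated multiply-by-reciprocal sequence (the $1360296554856532783 constant followed by RCRQ/SHRQ) to avoid a slow 64-bit division.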
+ +#include "zasm_GOOS_GOARCH.h" +#include "textflag.h" + +// setldt(int entry, int address, int limit) +TEXT runtime·setldt(SB),NOSPLIT,$0 + RET + +TEXT runtime·open(SB),NOSPLIT,$0 + MOVL $14, AX + INT $64 + MOVL AX, ret+12(FP) + RET + +TEXT runtime·pread(SB),NOSPLIT,$0 + MOVL $50, AX + INT $64 + MOVL AX, ret+20(FP) + RET + +TEXT runtime·pwrite(SB),NOSPLIT,$0 + MOVL $51, AX + INT $64 + MOVL AX, ret+20(FP) + RET + +// int32 _seek(int64*, int32, int64, int32) +TEXT _seek<>(SB),NOSPLIT,$0 + MOVL $39, AX + INT $64 + RET + +TEXT runtime·seek(SB),NOSPLIT,$24 + LEAL ret+16(FP), AX + MOVL fd+0(FP), BX + MOVL offset_lo+4(FP), CX + MOVL offset_hi+8(FP), DX + MOVL whence+12(FP), SI + MOVL AX, 0(SP) + MOVL BX, 4(SP) + MOVL CX, 8(SP) + MOVL DX, 12(SP) + MOVL SI, 16(SP) + CALL _seek<>(SB) + CMPL AX, $0 + JGE 3(PC) + MOVL $-1, ret_lo+16(FP) + MOVL $-1, ret_hi+20(FP) + RET + +TEXT runtime·close(SB),NOSPLIT,$0 + MOVL $4, AX + INT $64 + MOVL AX, ret+4(FP) + RET + +TEXT runtime·exits(SB),NOSPLIT,$0 + MOVL $8, AX + INT $64 + RET + +TEXT runtime·brk_(SB),NOSPLIT,$0 + MOVL $24, AX + INT $64 + MOVL AX, ret+4(FP) + RET + +TEXT runtime·sleep(SB),NOSPLIT,$0 + MOVL $17, AX + INT $64 + MOVL AX, ret+4(FP) + RET + +TEXT runtime·plan9_semacquire(SB),NOSPLIT,$0 + MOVL $37, AX + INT $64 + MOVL AX, ret+8(FP) + RET + +TEXT runtime·plan9_tsemacquire(SB),NOSPLIT,$0 + MOVL $52, AX + INT $64 + MOVL AX, ret+8(FP) + RET + +TEXT nsec<>(SB),NOSPLIT,$0 + MOVL $53, AX + INT $64 + RET + +TEXT runtime·nsec(SB),NOSPLIT,$8 + LEAL ret+4(FP), AX + MOVL AX, 0(SP) + CALL nsec<>(SB) + CMPL AX, $0 + JGE 3(PC) + MOVL $-1, ret_lo+4(FP) + MOVL $-1, ret_hi+8(FP) + RET + +// func now() (sec int64, nsec int32) +TEXT time·now(SB),NOSPLIT,$8-12 + CALL runtime·nanotime(SB) + MOVL 0(SP), AX + MOVL 4(SP), DX + + MOVL $1000000000, CX + DIVL CX + MOVL AX, sec+0(FP) + MOVL $0, sec+4(FP) + MOVL DX, nsec+8(FP) + RET + +TEXT runtime·notify(SB),NOSPLIT,$0 + MOVL $28, AX + INT $64 + MOVL AX, ret+4(FP) + RET + +TEXT runtime·noted(SB),NOSPLIT,$0 + MOVL $29, AX + INT $64 + MOVL AX, ret+4(FP) + RET + +TEXT runtime·plan9_semrelease(SB),NOSPLIT,$0 + MOVL $38, AX + INT $64 + MOVL AX, ret+8(FP) + RET + +TEXT runtime·rfork(SB),NOSPLIT,$0 + MOVL $19, AX + INT $64 + MOVL AX, ret+4(FP) + RET + +TEXT runtime·tstart_plan9(SB),NOSPLIT,$0 + MOVL newm+0(FP), CX + MOVL m_g0(CX), DX + + // Layout new m scheduler stack on os stack. + MOVL SP, AX + MOVL AX, (g_stack+stack_hi)(DX) + SUBL $(64*1024), AX // stack size + MOVL AX, (g_stack+stack_lo)(DX) + MOVL AX, g_stackguard0(DX) + MOVL AX, g_stackguard1(DX) + + // Initialize procid from TOS struct. + MOVL _tos(SB), AX + MOVL 48(AX), AX + MOVL AX, m_procid(CX) // save pid as m->procid + + // Finally, initialize g. 
+ get_tls(BX) + MOVL DX, g(BX) + + CALL runtime·stackcheck(SB) // smashes AX, CX + CALL runtime·mstart(SB) + + MOVL $0x1234, 0x1234 // not reached + RET + +// void sigtramp(void *ureg, int8 *note) +TEXT runtime·sigtramp(SB),NOSPLIT,$0 + get_tls(AX) + + // check that g exists + MOVL g(AX), BX + CMPL BX, $0 + JNE 3(PC) + CALL runtime·badsignal2(SB) // will exit + RET + + // save args + MOVL ureg+4(SP), CX + MOVL note+8(SP), DX + + // change stack + MOVL g_m(BX), BX + MOVL m_gsignal(BX), BP + MOVL (g_stack+stack_hi)(BP), BP + MOVL BP, SP + + // make room for args and g + SUBL $24, SP + + // save g + MOVL g(AX), BP + MOVL BP, 20(SP) + + // g = m->gsignal + MOVL m_gsignal(BX), DI + MOVL DI, g(AX) + + // load args and call sighandler + MOVL CX, 0(SP) + MOVL DX, 4(SP) + MOVL BP, 8(SP) + + CALL runtime·sighandler(SB) + MOVL 12(SP), AX + + // restore g + get_tls(BX) + MOVL 20(SP), BP + MOVL BP, g(BX) + + // call noted(AX) + MOVL AX, 0(SP) + CALL runtime·noted(SB) + RET + +// Only used by the 64-bit runtime. +TEXT runtime·setfpmasks(SB),NOSPLIT,$0 + RET + +#define ERRMAX 128 /* from os_plan9.h */ + +// void errstr(int8 *buf, int32 len) +TEXT errstr<>(SB),NOSPLIT,$0 + MOVL $41, AX + INT $64 + RET + +// func errstr() string +// Only used by package syscall. +// Grab error string due to a syscall made +// in entersyscall mode, without going +// through the allocator (issue 4994). +// See ../syscall/asm_plan9_386.s:/·Syscall/ +TEXT runtime·errstr(SB),NOSPLIT,$8-8 + get_tls(AX) + MOVL g(AX), BX + MOVL g_m(BX), BX + MOVL m_errstr(BX), CX + MOVL CX, 0(SP) + MOVL $ERRMAX, 4(SP) + CALL errstr<>(SB) + CALL runtime·findnull(SB) + MOVL 4(SP), AX + MOVL AX, ret_len+4(FP) + MOVL 0(SP), AX + MOVL AX, ret_base+0(FP) + RET diff --git a/src/runtime/sys_plan9_amd64.s b/src/runtime/sys_plan9_amd64.s new file mode 100644 index 000000000..3a96c2bf9 --- /dev/null +++ b/src/runtime/sys_plan9_amd64.s @@ -0,0 +1,254 @@ +// Copyright 2010 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#include "zasm_GOOS_GOARCH.h" +#include "textflag.h" + +// setldt(int entry, int address, int limit) +TEXT runtime·setldt(SB),NOSPLIT,$0 + RET + +TEXT runtime·open(SB),NOSPLIT,$0 + MOVQ $14, BP + SYSCALL + MOVL AX, ret+16(FP) + RET + +TEXT runtime·pread(SB),NOSPLIT,$0 + MOVQ $50, BP + SYSCALL + MOVL AX, ret+32(FP) + RET + +TEXT runtime·pwrite(SB),NOSPLIT,$0 + MOVQ $51, BP + SYSCALL + MOVL AX, ret+32(FP) + RET + +// int32 _seek(int64*, int32, int64, int32) +TEXT _seek<>(SB),NOSPLIT,$0 + MOVQ $39, BP + SYSCALL + RET + +// int64 seek(int32, int64, int32) +// Convenience wrapper around _seek, the actual system call. 
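+// Plan 9's seek reports the new 64-bit offset through a pointer out-parameter,
+// so the wrapper below passes the address of its own return slot as the first
+// argument and stores -1 there when _seek returns a negative value.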
+TEXT runtime·seek(SB),NOSPLIT,$32 + LEAQ ret+24(FP), AX + MOVL fd+0(FP), BX + MOVQ offset+8(FP), CX + MOVL whence+16(FP), DX + MOVQ AX, 0(SP) + MOVL BX, 8(SP) + MOVQ CX, 16(SP) + MOVL DX, 24(SP) + CALL _seek<>(SB) + CMPL AX, $0 + JGE 2(PC) + MOVQ $-1, ret+24(FP) + RET + +TEXT runtime·close(SB),NOSPLIT,$0 + MOVQ $4, BP + SYSCALL + MOVL AX, ret+8(FP) + RET + +TEXT runtime·exits(SB),NOSPLIT,$0 + MOVQ $8, BP + SYSCALL + RET + +TEXT runtime·brk_(SB),NOSPLIT,$0 + MOVQ $24, BP + SYSCALL + MOVQ AX, ret+8(FP) + RET + +TEXT runtime·sleep(SB),NOSPLIT,$0 + MOVQ $17, BP + SYSCALL + MOVL AX, ret+8(FP) + RET + +TEXT runtime·plan9_semacquire(SB),NOSPLIT,$0 + MOVQ $37, BP + SYSCALL + MOVL AX, ret+16(FP) + RET + +TEXT runtime·plan9_tsemacquire(SB),NOSPLIT,$0 + MOVQ $52, BP + SYSCALL + MOVL AX, ret+16(FP) + RET + +TEXT runtime·nsec(SB),NOSPLIT,$0 + MOVQ $53, BP + SYSCALL + MOVQ AX, ret+8(FP) + RET + +// func now() (sec int64, nsec int32) +TEXT time·now(SB),NOSPLIT,$8-12 + CALL runtime·nanotime(SB) + MOVQ 0(SP), AX + + // generated code for + // func f(x uint64) (uint64, uint64) { return x/1000000000, x%100000000 } + // adapted to reduce duplication + MOVQ AX, CX + MOVQ $1360296554856532783, AX + MULQ CX + ADDQ CX, DX + RCRQ $1, DX + SHRQ $29, DX + MOVQ DX, sec+0(FP) + IMULQ $1000000000, DX + SUBQ DX, CX + MOVL CX, nsec+8(FP) + RET + +TEXT runtime·notify(SB),NOSPLIT,$0 + MOVQ $28, BP + SYSCALL + MOVL AX, ret+8(FP) + RET + +TEXT runtime·noted(SB),NOSPLIT,$0 + MOVQ $29, BP + SYSCALL + MOVL AX, ret+8(FP) + RET + +TEXT runtime·plan9_semrelease(SB),NOSPLIT,$0 + MOVQ $38, BP + SYSCALL + MOVL AX, ret+16(FP) + RET + +TEXT runtime·rfork(SB),NOSPLIT,$0 + MOVQ $19, BP + SYSCALL + MOVL AX, ret+8(FP) + RET + +TEXT runtime·tstart_plan9(SB),NOSPLIT,$0 + MOVQ newm+0(FP), CX + MOVQ m_g0(CX), DX + + // Layout new m scheduler stack on os stack. + MOVQ SP, AX + MOVQ AX, (g_stack+stack_hi)(DX) + SUBQ $(64*1024), AX // stack size + MOVQ AX, (g_stack+stack_lo)(DX) + MOVQ AX, g_stackguard0(DX) + MOVQ AX, g_stackguard1(DX) + + // Initialize procid from TOS struct. + MOVQ _tos(SB), AX + MOVL 64(AX), AX + MOVQ AX, m_procid(CX) // save pid as m->procid + + // Finally, initialize g. 
+ get_tls(BX) + MOVQ DX, g(BX) + + CALL runtime·stackcheck(SB) // smashes AX, CX + CALL runtime·mstart(SB) + + MOVQ $0x1234, 0x1234 // not reached + RET + +// This is needed by asm_amd64.s +TEXT runtime·settls(SB),NOSPLIT,$0 + RET + +// void sigtramp(void *ureg, int8 *note) +TEXT runtime·sigtramp(SB),NOSPLIT,$0 + get_tls(AX) + + // check that g exists + MOVQ g(AX), BX + CMPQ BX, $0 + JNE 3(PC) + CALL runtime·badsignal2(SB) // will exit + RET + + // save args + MOVQ ureg+8(SP), CX + MOVQ note+16(SP), DX + + // change stack + MOVQ g_m(BX), BX + MOVQ m_gsignal(BX), R10 + MOVQ (g_stack+stack_hi)(R10), BP + MOVQ BP, SP + + // make room for args and g + SUBQ $128, SP + + // save g + MOVQ g(AX), BP + MOVQ BP, 32(SP) + + // g = m->gsignal + MOVQ R10, g(AX) + + // load args and call sighandler + MOVQ CX, 0(SP) + MOVQ DX, 8(SP) + MOVQ BP, 16(SP) + + CALL runtime·sighandler(SB) + MOVL 24(SP), AX + + // restore g + get_tls(BX) + MOVQ 32(SP), R10 + MOVQ R10, g(BX) + + // call noted(AX) + MOVQ AX, 0(SP) + CALL runtime·noted(SB) + RET + +TEXT runtime·setfpmasks(SB),NOSPLIT,$8 + STMXCSR 0(SP) + MOVL 0(SP), AX + ANDL $~0x3F, AX + ORL $(0x3F<<7), AX + MOVL AX, 0(SP) + LDMXCSR 0(SP) + RET + +#define ERRMAX 128 /* from os_plan9.h */ + +// void errstr(int8 *buf, int32 len) +TEXT errstr<>(SB),NOSPLIT,$0 + MOVQ $41, BP + SYSCALL + RET + +// func errstr() string +// Only used by package syscall. +// Grab error string due to a syscall made +// in entersyscall mode, without going +// through the allocator (issue 4994). +// See ../syscall/asm_plan9_amd64.s:/·Syscall/ +TEXT runtime·errstr(SB),NOSPLIT,$16-16 + get_tls(AX) + MOVQ g(AX), BX + MOVQ g_m(BX), BX + MOVQ m_errstr(BX), CX + MOVQ CX, 0(SP) + MOVQ $ERRMAX, 8(SP) + CALL errstr<>(SB) + CALL runtime·findnull(SB) + MOVQ 8(SP), AX + MOVQ AX, ret_len+8(FP) + MOVQ 0(SP), AX + MOVQ AX, ret_base+0(FP) + RET diff --git a/src/runtime/sys_solaris_amd64.s b/src/runtime/sys_solaris_amd64.s new file mode 100644 index 000000000..0ebdab6ee --- /dev/null +++ b/src/runtime/sys_solaris_amd64.s @@ -0,0 +1,351 @@ +// Copyright 2014 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. +// +// System calls and other sys.stuff for AMD64, SunOS +// /usr/include/sys/syscall.h for syscall numbers. +// + +#include "zasm_GOOS_GOARCH.h" +#include "textflag.h" + +// This is needed by asm_amd64.s +TEXT runtime·settls(SB),NOSPLIT,$8 + RET + +// void libc·miniterrno(void *(*___errno)(void)); +// +// Set the TLS errno pointer in M. +// +// Called using runtime·asmcgocall from os_solaris.c:/minit. +// NOT USING GO CALLING CONVENTION. +TEXT runtime·miniterrno(SB),NOSPLIT,$0 + // asmcgocall will put first argument into DI. + CALL DI // SysV ABI so returns in AX + get_tls(CX) + MOVQ g(CX), BX + MOVQ g_m(BX), BX + MOVQ AX, m_perrno(BX) + RET + +// int64 runtime·nanotime1(void); +// +// clock_gettime(3c) wrapper because Timespec is too large for +// runtime·nanotime stack. +// +// Called using runtime·sysvicall6 from os_solaris.c:/nanotime. +// NOT USING GO CALLING CONVENTION. +TEXT runtime·nanotime1(SB),NOSPLIT,$0 + // need space for the timespec argument. + SUBQ $64, SP // 16 bytes will do, but who knows in the future? + MOVQ $3, DI // CLOCK_REALTIME from <sys/time_impl.h> + MOVQ SP, SI + MOVQ libc·clock_gettime(SB), AX + CALL AX + MOVQ (SP), AX // tv_sec from struct timespec + IMULQ $1000000000, AX // multiply into nanoseconds + ADDQ 8(SP), AX // tv_nsec, offset should be stable. 
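+	// AX now holds tv_sec*1e9 + tv_nsec; per the calling-convention note above,
+	// this result is handed back in AX rather than through the Go stack.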
+ ADDQ $64, SP + RET + +// pipe(3c) wrapper that returns fds in AX, DX. +// NOT USING GO CALLING CONVENTION. +TEXT runtime·pipe1(SB),NOSPLIT,$0 + SUBQ $16, SP // 8 bytes will do, but stack has to be 16-byte alligned + MOVQ SP, DI + MOVQ libc·pipe(SB), AX + CALL AX + MOVL 0(SP), AX + MOVL 4(SP), DX + ADDQ $16, SP + RET + +// Call a library function with SysV calling conventions. +// The called function can take a maximum of 6 INTEGER class arguments, +// see +// Michael Matz, Jan Hubicka, Andreas Jaeger, and Mark Mitchell +// System V Application Binary Interface +// AMD64 Architecture Processor Supplement +// section 3.2.3. +// +// Called by runtime·asmcgocall or runtime·cgocall. +// NOT USING GO CALLING CONVENTION. +TEXT runtime·asmsysvicall6(SB),NOSPLIT,$0 + // asmcgocall will put first argument into DI. + PUSHQ DI // save for later + MOVQ libcall_fn(DI), AX + MOVQ libcall_args(DI), R11 + MOVQ libcall_n(DI), R10 + + get_tls(CX) + MOVQ g(CX), BX + MOVQ g_m(BX), BX + MOVQ m_perrno(BX), DX + CMPQ DX, $0 + JEQ skiperrno1 + MOVL $0, 0(DX) + +skiperrno1: + CMPQ R11, $0 + JEQ skipargs + // Load 6 args into correspondent registers. + MOVQ 0(R11), DI + MOVQ 8(R11), SI + MOVQ 16(R11), DX + MOVQ 24(R11), CX + MOVQ 32(R11), R8 + MOVQ 40(R11), R9 +skipargs: + + // Call SysV function + CALL AX + + // Return result + POPQ DI + MOVQ AX, libcall_r1(DI) + MOVQ DX, libcall_r2(DI) + + get_tls(CX) + MOVQ g(CX), BX + MOVQ g_m(BX), BX + MOVQ m_perrno(BX), AX + CMPQ AX, $0 + JEQ skiperrno2 + MOVL 0(AX), AX + MOVQ AX, libcall_err(DI) + +skiperrno2: + RET + +// uint32 tstart_sysvicall(M *newm); +TEXT runtime·tstart_sysvicall(SB),NOSPLIT,$0 + // DI contains first arg newm + MOVQ m_g0(DI), DX // g + + // Make TLS entries point at g and m. + get_tls(BX) + MOVQ DX, g(BX) + MOVQ DI, g_m(DX) + + // Layout new m scheduler stack on os stack. + MOVQ SP, AX + MOVQ AX, (g_stack+stack_hi)(DX) + SUBQ $(0x100000), AX // stack size + MOVQ AX, (g_stack+stack_lo)(DX) + ADDQ $const_StackGuard, AX + MOVQ AX, g_stackguard0(DX) + MOVQ AX, g_stackguard1(DX) + + // Someday the convention will be D is always cleared. + CLD + + CALL runtime·stackcheck(SB) // clobbers AX,CX + CALL runtime·mstart(SB) + + XORL AX, AX // return 0 == success + MOVL AX, ret+8(FP) + RET + +// Careful, this is called by __sighndlr, a libc function. We must preserve +// registers as per AMD 64 ABI. +TEXT runtime·sigtramp(SB),NOSPLIT,$0 + // Note that we are executing on altsigstack here, so we have + // more stack available than NOSPLIT would have us believe. + // To defeat the linker, we make our own stack frame with + // more space: + SUBQ $184, SP + + // save registers + MOVQ BX, 32(SP) + MOVQ BP, 40(SP) + MOVQ R12, 48(SP) + MOVQ R13, 56(SP) + MOVQ R14, 64(SP) + MOVQ R15, 72(SP) + + get_tls(BX) + // check that g exists + MOVQ g(BX), R10 + CMPQ R10, $0 + JNE allgood + MOVQ DI, 0(SP) + MOVQ $runtime·badsignal(SB), AX + CALL AX + JMP exit + +allgood: + // save g + MOVQ R10, 80(SP) + + // Save m->libcall and m->scratch. We need to do this because we + // might get interrupted by a signal in runtime·asmcgocall. 
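+	// Without this, a libc call made from sighandler through sysvicall would
+	// overwrite the interrupted call's in-flight state in m->libcall and m->scratch.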
+ + // save m->libcall + MOVQ g_m(R10), BP + LEAQ m_libcall(BP), R11 + MOVQ libcall_fn(R11), R10 + MOVQ R10, 88(SP) + MOVQ libcall_args(R11), R10 + MOVQ R10, 96(SP) + MOVQ libcall_n(R11), R10 + MOVQ R10, 104(SP) + MOVQ libcall_r1(R11), R10 + MOVQ R10, 168(SP) + MOVQ libcall_r2(R11), R10 + MOVQ R10, 176(SP) + + // save m->scratch + LEAQ m_scratch(BP), R11 + MOVQ 0(R11), R10 + MOVQ R10, 112(SP) + MOVQ 8(R11), R10 + MOVQ R10, 120(SP) + MOVQ 16(R11), R10 + MOVQ R10, 128(SP) + MOVQ 24(R11), R10 + MOVQ R10, 136(SP) + MOVQ 32(R11), R10 + MOVQ R10, 144(SP) + MOVQ 40(R11), R10 + MOVQ R10, 152(SP) + + // save errno, it might be EINTR; stuff we do here might reset it. + MOVQ m_perrno(BP), R10 + MOVL 0(R10), R10 + MOVQ R10, 160(SP) + + MOVQ g(BX), R10 + // g = m->gsignal + MOVQ m_gsignal(BP), BP + MOVQ BP, g(BX) + + // prepare call + MOVQ DI, 0(SP) + MOVQ SI, 8(SP) + MOVQ DX, 16(SP) + MOVQ R10, 24(SP) + CALL runtime·sighandler(SB) + + get_tls(BX) + MOVQ g(BX), BP + MOVQ g_m(BP), BP + // restore libcall + LEAQ m_libcall(BP), R11 + MOVQ 88(SP), R10 + MOVQ R10, libcall_fn(R11) + MOVQ 96(SP), R10 + MOVQ R10, libcall_args(R11) + MOVQ 104(SP), R10 + MOVQ R10, libcall_n(R11) + MOVQ 168(SP), R10 + MOVQ R10, libcall_r1(R11) + MOVQ 176(SP), R10 + MOVQ R10, libcall_r2(R11) + + // restore scratch + LEAQ m_scratch(BP), R11 + MOVQ 112(SP), R10 + MOVQ R10, 0(R11) + MOVQ 120(SP), R10 + MOVQ R10, 8(R11) + MOVQ 128(SP), R10 + MOVQ R10, 16(R11) + MOVQ 136(SP), R10 + MOVQ R10, 24(R11) + MOVQ 144(SP), R10 + MOVQ R10, 32(R11) + MOVQ 152(SP), R10 + MOVQ R10, 40(R11) + + // restore errno + MOVQ m_perrno(BP), R11 + MOVQ 160(SP), R10 + MOVL R10, 0(R11) + + // restore g + MOVQ 80(SP), R10 + MOVQ R10, g(BX) + +exit: + // restore registers + MOVQ 32(SP), BX + MOVQ 40(SP), BP + MOVQ 48(SP), R12 + MOVQ 56(SP), R13 + MOVQ 64(SP), R14 + MOVQ 72(SP), R15 + + ADDQ $184, SP + RET + +// Called from runtime·usleep (Go). Can be called on Go stack, on OS stack, +// can also be called in cgo callback path without a g->m. +TEXT runtime·usleep1(SB),NOSPLIT,$0 + MOVL usec+0(FP), DI + MOVQ $runtime·usleep2(SB), AX // to hide from 6l + + // Execute call on m->g0. + get_tls(R15) + CMPQ R15, $0 + JE usleep1_noswitch + + MOVQ g(R15), R13 + CMPQ R13, $0 + JE usleep1_noswitch + MOVQ g_m(R13), R13 + CMPQ R13, $0 + JE usleep1_noswitch + // TODO(aram): do something about the cpu profiler here. + + MOVQ m_g0(R13), R14 + CMPQ g(R15), R14 + JNE usleep1_switch + // executing on m->g0 already + CALL AX + RET + +usleep1_switch: + // Switch to m->g0 stack and back. + MOVQ (g_sched+gobuf_sp)(R14), R14 + MOVQ SP, -8(R14) + LEAQ -8(R14), SP + CALL AX + MOVQ 0(SP), SP + RET + +usleep1_noswitch: + // Not a Go-managed thread. Do not switch stack. + CALL AX + RET + +// Runs on OS stack. duration (in µs units) is in DI. +TEXT runtime·usleep2(SB),NOSPLIT,$0 + MOVQ libc·usleep(SB), AX + CALL AX + RET + +// Runs on OS stack, called from runtime·osyield. 
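+// Like usleep2 just above, it loads the function pointer stored at the
+// libc·sched_yield symbol and calls it; the Solaris port makes its system
+// calls through libc wrappers rather than issuing SYSCALL directly.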
+TEXT runtime·osyield1(SB),NOSPLIT,$0 + MOVQ libc·sched_yield(SB), AX + CALL AX + RET + +// func now() (sec int64, nsec int32) +TEXT time·now(SB),NOSPLIT,$8-12 + CALL runtime·nanotime(SB) + MOVQ 0(SP), AX + + // generated code for + // func f(x uint64) (uint64, uint64) { return x/1000000000, x%100000000 } + // adapted to reduce duplication + MOVQ AX, CX + MOVQ $1360296554856532783, AX + MULQ CX + ADDQ CX, DX + RCRQ $1, DX + SHRQ $29, DX + MOVQ DX, sec+0(FP) + IMULQ $1000000000, DX + SUBQ DX, CX + MOVL CX, nsec+8(FP) + RET diff --git a/src/runtime/sys_windows_386.s b/src/runtime/sys_windows_386.s new file mode 100644 index 000000000..932fe9dd2 --- /dev/null +++ b/src/runtime/sys_windows_386.s @@ -0,0 +1,433 @@ +// Copyright 2009 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#include "zasm_GOOS_GOARCH.h" +#include "textflag.h" + +// void runtime·asmstdcall(void *c); +TEXT runtime·asmstdcall(SB),NOSPLIT,$0 + MOVL fn+0(FP), BX + + // SetLastError(0). + MOVL $0, 0x34(FS) + + // Copy args to the stack. + MOVL SP, BP + MOVL libcall_n(BX), CX // words + MOVL CX, AX + SALL $2, AX + SUBL AX, SP // room for args + MOVL SP, DI + MOVL libcall_args(BX), SI + CLD + REP; MOVSL + + // Call stdcall or cdecl function. + // DI SI BP BX are preserved, SP is not + CALL libcall_fn(BX) + MOVL BP, SP + + // Return result. + MOVL fn+0(FP), BX + MOVL AX, libcall_r1(BX) + MOVL DX, libcall_r2(BX) + + // GetLastError(). + MOVL 0x34(FS), AX + MOVL AX, libcall_err(BX) + + RET + +TEXT runtime·badsignal2(SB),NOSPLIT,$24 + // stderr + MOVL $-12, 0(SP) + MOVL SP, BP + CALL *runtime·GetStdHandle(SB) + MOVL BP, SP + + MOVL AX, 0(SP) // handle + MOVL $runtime·badsignalmsg(SB), DX // pointer + MOVL DX, 4(SP) + MOVL runtime·badsignallen(SB), DX // count + MOVL DX, 8(SP) + LEAL 20(SP), DX // written count + MOVL $0, 0(DX) + MOVL DX, 12(SP) + MOVL $0, 16(SP) // overlapped + CALL *runtime·WriteFile(SB) + MOVL BP, SI + RET + +// faster get/set last error +TEXT runtime·getlasterror(SB),NOSPLIT,$0 + MOVL 0x34(FS), AX + MOVL AX, ret+0(FP) + RET + +TEXT runtime·setlasterror(SB),NOSPLIT,$0 + MOVL err+0(FP), AX + MOVL AX, 0x34(FS) + RET + +// Called by Windows as a Vectored Exception Handler (VEH). +// First argument is pointer to struct containing +// exception record and context pointers. +// Handler function is stored in AX. +// Return 0 for 'not handled', -1 for handled. +TEXT runtime·sigtramp(SB),NOSPLIT,$0-0 + MOVL ptrs+0(FP), CX + SUBL $40, SP + + // save callee-saved registers + MOVL BX, 28(SP) + MOVL BP, 16(SP) + MOVL SI, 20(SP) + MOVL DI, 24(SP) + + MOVL AX, SI // save handler address + + // find g + get_tls(DX) + CMPL DX, $0 + JNE 3(PC) + MOVL $0, AX // continue + JMP done + MOVL g(DX), DX + CMPL DX, $0 + JNE 2(PC) + CALL runtime·badsignal2(SB) + + // save g and SP in case of stack switch + MOVL DX, 32(SP) // g + MOVL SP, 36(SP) + + // do we need to switch to the g0 stack? + MOVL g_m(DX), BX + MOVL m_g0(BX), BX + CMPL DX, BX + JEQ sigtramp_g0 + + // switch to the g0 stack + get_tls(BP) + MOVL BX, g(BP) + MOVL (g_sched+gobuf_sp)(BX), DI + // make it look like mstart called us on g0, to stop traceback + SUBL $4, DI + MOVL $runtime·mstart(SB), 0(DI) + // traceback will think that we've done SUBL + // on this stack, so subtract them here to match. + // (we need room for sighandler arguments anyway). + // and re-save old SP for restoring later. 
+ SUBL $40, DI + MOVL SP, 36(DI) + MOVL DI, SP + +sigtramp_g0: + MOVL 0(CX), BX // ExceptionRecord* + MOVL 4(CX), CX // Context* + MOVL BX, 0(SP) + MOVL CX, 4(SP) + MOVL DX, 8(SP) + CALL SI // call handler + // AX is set to report result back to Windows + MOVL 12(SP), AX + + // switch back to original stack and g + // no-op if we never left. + MOVL 36(SP), SP + MOVL 32(SP), DX + get_tls(BP) + MOVL DX, g(BP) + +done: + // restore callee-saved registers + MOVL 24(SP), DI + MOVL 20(SP), SI + MOVL 16(SP), BP + MOVL 28(SP), BX + + ADDL $40, SP + // RET 4 (return and pop 4 bytes parameters) + BYTE $0xC2; WORD $4 + RET // unreached; make assembler happy + +TEXT runtime·exceptiontramp(SB),NOSPLIT,$0 + MOVL $runtime·exceptionhandler(SB), AX + JMP runtime·sigtramp(SB) + +TEXT runtime·firstcontinuetramp(SB),NOSPLIT,$0-0 + // is never called + INT $3 + +TEXT runtime·lastcontinuetramp(SB),NOSPLIT,$0-0 + MOVL $runtime·lastcontinuehandler(SB), AX + JMP runtime·sigtramp(SB) + +TEXT runtime·ctrlhandler(SB),NOSPLIT,$0 + PUSHL $runtime·ctrlhandler1(SB) + CALL runtime·externalthreadhandler(SB) + MOVL 4(SP), CX + ADDL $12, SP + JMP CX + +TEXT runtime·profileloop(SB),NOSPLIT,$0 + PUSHL $runtime·profileloop1(SB) + CALL runtime·externalthreadhandler(SB) + MOVL 4(SP), CX + ADDL $12, SP + JMP CX + +TEXT runtime·externalthreadhandler(SB),NOSPLIT,$0 + PUSHL BP + MOVL SP, BP + PUSHL BX + PUSHL SI + PUSHL DI + PUSHL 0x14(FS) + MOVL SP, DX + + // setup dummy m, g + SUBL $m_end, SP // space for M + MOVL SP, 0(SP) + MOVL $m_end, 4(SP) + CALL runtime·memclr(SB) // smashes AX,BX,CX + + LEAL m_tls(SP), CX + MOVL CX, 0x14(FS) + MOVL SP, BX + SUBL $g_end, SP // space for G + MOVL SP, g(CX) + MOVL SP, m_g0(BX) + + MOVL SP, 0(SP) + MOVL $g_end, 4(SP) + CALL runtime·memclr(SB) // smashes AX,BX,CX + LEAL g_end(SP), BX + MOVL BX, g_m(SP) + LEAL -8192(SP), CX + MOVL CX, (g_stack+stack_lo)(SP) + ADDL $const_StackGuard, CX + MOVL CX, g_stackguard0(SP) + MOVL CX, g_stackguard1(SP) + MOVL DX, (g_stack+stack_hi)(SP) + + PUSHL 16(BP) // arg for handler + CALL 8(BP) + POPL CX + + get_tls(CX) + MOVL g(CX), CX + MOVL (g_stack+stack_hi)(CX), SP + POPL 0x14(FS) + POPL DI + POPL SI + POPL BX + POPL BP + RET + +GLOBL runtime·cbctxts(SB), NOPTR, $4 + +TEXT runtime·callbackasm1+0(SB),NOSPLIT,$0 + MOVL 0(SP), AX // will use to find our callback context + + // remove return address from stack, we are not returning there + ADDL $4, SP + + // address to callback parameters into CX + LEAL 4(SP), CX + + // save registers as required for windows callback + PUSHL DI + PUSHL SI + PUSHL BP + PUSHL BX + + // determine index into runtime·cbctxts table + SUBL $runtime·callbackasm(SB), AX + MOVL $0, DX + MOVL $5, BX // divide by 5 because each call instruction in runtime·callbacks is 5 bytes long + DIVL BX, + + // find correspondent runtime·cbctxts table entry + MOVL runtime·cbctxts(SB), BX + MOVL -4(BX)(AX*4), BX + + // extract callback context + MOVL cbctxt_gobody(BX), AX + MOVL cbctxt_argsize(BX), DX + + // preserve whatever's at the memory location that + // the callback will use to store the return value + PUSHL 0(CX)(DX*1) + + // extend argsize by size of return value + ADDL $4, DX + + // remember how to restore stack on return + MOVL cbctxt_restorestack(BX), BX + PUSHL BX + + // call target Go function + PUSHL DX // argsize (including return value) + PUSHL CX // callback parameters + PUSHL AX // address of target Go function + CLD + CALL runtime·cgocallback_gofunc(SB) + POPL AX + POPL CX + POPL DX + + // how to restore stack on return + POPL BX + + // 
return value into AX (as per Windows spec) + // and restore previously preserved value + MOVL -4(CX)(DX*1), AX + POPL -4(CX)(DX*1) + + MOVL BX, CX // cannot use BX anymore + + // restore registers as required for windows callback + POPL BX + POPL BP + POPL SI + POPL DI + + // remove callback parameters before return (as per Windows spec) + POPL DX + ADDL CX, SP + PUSHL DX + + CLD + + RET + +// void tstart(M *newm); +TEXT runtime·tstart(SB),NOSPLIT,$0 + MOVL newm+4(SP), CX // m + MOVL m_g0(CX), DX // g + + // Layout new m scheduler stack on os stack. + MOVL SP, AX + MOVL AX, (g_stack+stack_hi)(DX) + SUBL $(64*1024), AX // stack size + MOVL AX, (g_stack+stack_lo)(DX) + ADDL $const_StackGuard, AX + MOVL AX, g_stackguard0(DX) + MOVL AX, g_stackguard1(DX) + + // Set up tls. + LEAL m_tls(CX), SI + MOVL SI, 0x14(FS) + MOVL CX, g_m(DX) + MOVL DX, g(SI) + + // Someday the convention will be D is always cleared. + CLD + + CALL runtime·stackcheck(SB) // clobbers AX,CX + CALL runtime·mstart(SB) + + RET + +// uint32 tstart_stdcall(M *newm); +TEXT runtime·tstart_stdcall(SB),NOSPLIT,$0 + MOVL newm+4(SP), BX + + PUSHL BX + CALL runtime·tstart(SB) + POPL BX + + // Adjust stack for stdcall to return properly. + MOVL (SP), AX // save return address + ADDL $4, SP // remove single parameter + MOVL AX, (SP) // restore return address + + XORL AX, AX // return 0 == success + + RET + +// setldt(int entry, int address, int limit) +TEXT runtime·setldt(SB),NOSPLIT,$0 + MOVL address+4(FP), CX + MOVL CX, 0x14(FS) + RET + +// Sleep duration is in 100ns units. +TEXT runtime·usleep1(SB),NOSPLIT,$0 + MOVL usec+0(FP), BX + MOVL $runtime·usleep2(SB), AX // to hide from 8l + + // Execute call on m->g0 stack, in case we are not actually + // calling a system call wrapper, like when running under WINE. + get_tls(CX) + CMPL CX, $0 + JNE 3(PC) + // Not a Go-managed thread. Do not switch stack. + CALL AX + RET + + MOVL g(CX), BP + MOVL g_m(BP), BP + + // leave pc/sp for cpu profiler + MOVL (SP), SI + MOVL SI, m_libcallpc(BP) + MOVL g(CX), SI + MOVL SI, m_libcallg(BP) + // sp must be the last, because once async cpu profiler finds + // all three values to be non-zero, it will use them + LEAL usec+0(FP), SI + MOVL SI, m_libcallsp(BP) + + MOVL m_g0(BP), SI + CMPL g(CX), SI + JNE usleep1_switch + // executing on m->g0 already + CALL AX + JMP usleep1_ret + +usleep1_switch: + // Switch to m->g0 stack and back. + MOVL (g_sched+gobuf_sp)(SI), SI + MOVL SP, -4(SI) + LEAL -4(SI), SP + CALL AX + MOVL 0(SP), SP + +usleep1_ret: + get_tls(CX) + MOVL g(CX), BP + MOVL g_m(BP), BP + MOVL $0, m_libcallsp(BP) + RET + +// Runs on OS stack. duration (in 100ns units) is in BX. +TEXT runtime·usleep2(SB),NOSPLIT,$20 + // Want negative 100ns units. + NEGL BX + MOVL $-1, hi-4(SP) + MOVL BX, lo-8(SP) + LEAL lo-8(SP), BX + MOVL BX, ptime-12(SP) + MOVL $0, alertable-16(SP) + MOVL $-1, handle-20(SP) + MOVL SP, BP + MOVL runtime·NtWaitForSingleObject(SB), AX + CALL AX + MOVL BP, SP + RET + +// func now() (sec int64, nsec int32) +TEXT time·now(SB),NOSPLIT,$8-12 + CALL runtime·unixnano(SB) + MOVL 0(SP), AX + MOVL 4(SP), DX + + MOVL $1000000000, CX + DIVL CX + MOVL AX, sec+0(FP) + MOVL $0, sec+4(FP) + MOVL DX, nsec+8(FP) + RET diff --git a/src/runtime/sys_windows_amd64.s b/src/runtime/sys_windows_amd64.s new file mode 100644 index 000000000..e6190ce68 --- /dev/null +++ b/src/runtime/sys_windows_amd64.s @@ -0,0 +1,462 @@ +// Copyright 2011 The Go Authors. All rights reserved. 
+// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#include "zasm_GOOS_GOARCH.h" +#include "textflag.h" + +// maxargs should be divisible by 2, as Windows stack +// must be kept 16-byte aligned on syscall entry. +#define maxargs 16 + +// void runtime·asmstdcall(void *c); +TEXT runtime·asmstdcall(SB),NOSPLIT,$0 + // asmcgocall will put first argument into CX. + PUSHQ CX // save for later + MOVQ libcall_fn(CX), AX + MOVQ libcall_args(CX), SI + MOVQ libcall_n(CX), CX + + // SetLastError(0). + MOVQ 0x30(GS), DI + MOVL $0, 0x68(DI) + + SUBQ $(maxargs*8), SP // room for args + + // Fast version, do not store args on the stack. + CMPL CX, $4 + JLE loadregs + + // Check we have enough room for args. + CMPL CX, $maxargs + JLE 2(PC) + INT $3 // not enough room -> crash + + // Copy args to the stack. + MOVQ SP, DI + CLD + REP; MOVSQ + MOVQ SP, SI + +loadregs: + // Load first 4 args into correspondent registers. + MOVQ 0(SI), CX + MOVQ 8(SI), DX + MOVQ 16(SI), R8 + MOVQ 24(SI), R9 + + // Call stdcall function. + CALL AX + + ADDQ $(maxargs*8), SP + + // Return result. + POPQ CX + MOVQ AX, libcall_r1(CX) + + // GetLastError(). + MOVQ 0x30(GS), DI + MOVL 0x68(DI), AX + MOVQ AX, libcall_err(CX) + + RET + +TEXT runtime·badsignal2(SB),NOSPLIT,$48 + // stderr + MOVQ $-12, CX // stderr + MOVQ CX, 0(SP) + MOVQ runtime·GetStdHandle(SB), AX + CALL AX + + MOVQ AX, CX // handle + MOVQ CX, 0(SP) + MOVQ $runtime·badsignalmsg(SB), DX // pointer + MOVQ DX, 8(SP) + MOVL $runtime·badsignallen(SB), R8 // count + MOVQ R8, 16(SP) + LEAQ 40(SP), R9 // written count + MOVQ $0, 0(R9) + MOVQ R9, 24(SP) + MOVQ $0, 32(SP) // overlapped + MOVQ runtime·WriteFile(SB), AX + CALL AX + + RET + +// faster get/set last error +TEXT runtime·getlasterror(SB),NOSPLIT,$0 + MOVQ 0x30(GS), AX + MOVL 0x68(AX), AX + MOVL AX, ret+0(FP) + RET + +TEXT runtime·setlasterror(SB),NOSPLIT,$0 + MOVL err+0(FP), AX + MOVQ 0x30(GS), CX + MOVL AX, 0x68(CX) + RET + +// Called by Windows as a Vectored Exception Handler (VEH). +// First argument is pointer to struct containing +// exception record and context pointers. +// Handler function is stored in AX. +// Return 0 for 'not handled', -1 for handled. +TEXT runtime·sigtramp(SB),NOSPLIT,$0-0 + // CX: PEXCEPTION_POINTERS ExceptionInfo + + // DI SI BP BX R12 R13 R14 R15 registers and DF flag are preserved + // as required by windows callback convention. + PUSHFQ + SUBQ $112, SP + MOVQ DI, 80(SP) + MOVQ SI, 72(SP) + MOVQ BP, 64(SP) + MOVQ BX, 56(SP) + MOVQ R12, 48(SP) + MOVQ R13, 40(SP) + MOVQ R14, 32(SP) + MOVQ R15, 88(SP) + + MOVQ AX, R15 // save handler address + + // find g + get_tls(DX) + CMPQ DX, $0 + JNE 3(PC) + MOVQ $0, AX // continue + JMP done + MOVQ g(DX), DX + CMPQ DX, $0 + JNE 2(PC) + CALL runtime·badsignal2(SB) + + // save g and SP in case of stack switch + MOVQ DX, 96(SP) // g + MOVQ SP, 104(SP) + + // do we need to switch to the g0 stack? + MOVQ g_m(DX), BX + MOVQ m_g0(BX), BX + CMPQ DX, BX + JEQ sigtramp_g0 + + // switch to g0 stack + get_tls(BP) + MOVQ BX, g(BP) + MOVQ (g_sched+gobuf_sp)(BX), DI + // make it look like mstart called us on g0, to stop traceback + SUBQ $8, DI + MOVQ $runtime·mstart(SB), SI + MOVQ SI, 0(DI) + // traceback will think that we've done PUSHFQ and SUBQ + // on this stack, so subtract them here to match. + // (we need room for sighandler arguments anyway). + // and re-save old SP for restoring later. + SUBQ $(112+8), DI + // save g, save old stack pointer. 
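+	// Spill the four register arguments into that shadow space so they sit
+	// next to any stack arguments, giving the Go callback one contiguous
+	// argument block to read from.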
+ MOVQ SP, 104(DI) + MOVQ DI, SP + +sigtramp_g0: + MOVQ 0(CX), BX // ExceptionRecord* + MOVQ 8(CX), CX // Context* + MOVQ BX, 0(SP) + MOVQ CX, 8(SP) + MOVQ DX, 16(SP) + CALL R15 // call handler + // AX is set to report result back to Windows + MOVL 24(SP), AX + + // switch back to original stack and g + // no-op if we never left. + MOVQ 104(SP), SP + MOVQ 96(SP), DX + get_tls(BP) + MOVQ DX, g(BP) + +done: + // restore registers as required for windows callback + MOVQ 88(SP), R15 + MOVQ 32(SP), R14 + MOVQ 40(SP), R13 + MOVQ 48(SP), R12 + MOVQ 56(SP), BX + MOVQ 64(SP), BP + MOVQ 72(SP), SI + MOVQ 80(SP), DI + ADDQ $112, SP + POPFQ + + RET + +TEXT runtime·exceptiontramp(SB),NOSPLIT,$0 + MOVQ $runtime·exceptionhandler(SB), AX + JMP runtime·sigtramp(SB) + +TEXT runtime·firstcontinuetramp(SB),NOSPLIT,$0-0 + MOVQ $runtime·firstcontinuehandler(SB), AX + JMP runtime·sigtramp(SB) + +TEXT runtime·lastcontinuetramp(SB),NOSPLIT,$0-0 + MOVQ $runtime·lastcontinuehandler(SB), AX + JMP runtime·sigtramp(SB) + +TEXT runtime·ctrlhandler(SB),NOSPLIT,$8 + MOVQ CX, 16(SP) // spill + MOVQ $runtime·ctrlhandler1(SB), CX + MOVQ CX, 0(SP) + CALL runtime·externalthreadhandler(SB) + RET + +TEXT runtime·profileloop(SB),NOSPLIT,$8 + MOVQ $runtime·profileloop1(SB), CX + MOVQ CX, 0(SP) + CALL runtime·externalthreadhandler(SB) + RET + +TEXT runtime·externalthreadhandler(SB),NOSPLIT,$0 + PUSHQ BP + MOVQ SP, BP + PUSHQ BX + PUSHQ SI + PUSHQ DI + PUSHQ 0x28(GS) + MOVQ SP, DX + + // setup dummy m, g + SUBQ $m_end, SP // space for M + MOVQ SP, 0(SP) + MOVQ $m_end, 8(SP) + CALL runtime·memclr(SB) // smashes AX,BX,CX + + LEAQ m_tls(SP), CX + MOVQ CX, 0x28(GS) + MOVQ SP, BX + SUBQ $g_end, SP // space for G + MOVQ SP, g(CX) + MOVQ SP, m_g0(BX) + + MOVQ SP, 0(SP) + MOVQ $g_end, 8(SP) + CALL runtime·memclr(SB) // smashes AX,BX,CX + LEAQ g_end(SP), BX + MOVQ BX, g_m(SP) + + LEAQ -8192(SP), CX + MOVQ CX, (g_stack+stack_lo)(SP) + ADDQ $const_StackGuard, CX + MOVQ CX, g_stackguard0(SP) + MOVQ CX, g_stackguard1(SP) + MOVQ DX, (g_stack+stack_hi)(SP) + + PUSHQ 32(BP) // arg for handler + CALL 16(BP) + POPQ CX + + get_tls(CX) + MOVQ g(CX), CX + MOVQ (g_stack+stack_hi)(CX), SP + POPQ 0x28(GS) + POPQ DI + POPQ SI + POPQ BX + POPQ BP + RET + +GLOBL runtime·cbctxts(SB), NOPTR, $8 + +TEXT runtime·callbackasm1(SB),NOSPLIT,$0 + // Construct args vector for cgocallback(). + // By windows/amd64 calling convention first 4 args are in CX, DX, R8, R9 + // args from the 5th on are on the stack. + // In any case, even if function has 0,1,2,3,4 args, there is reserved + // but uninitialized "shadow space" for the first 4 args. + // The values are in registers. 
+ MOVQ CX, (16+0)(SP) + MOVQ DX, (16+8)(SP) + MOVQ R8, (16+16)(SP) + MOVQ R9, (16+24)(SP) + + // remove return address from stack, we are not returning there + MOVQ 0(SP), AX + ADDQ $8, SP + + // determine index into runtime·cbctxts table + MOVQ $runtime·callbackasm(SB), DX + SUBQ DX, AX + MOVQ $0, DX + MOVQ $5, CX // divide by 5 because each call instruction in runtime·callbacks is 5 bytes long + DIVL CX, + + // find correspondent runtime·cbctxts table entry + MOVQ runtime·cbctxts(SB), CX + MOVQ -8(CX)(AX*8), AX + + // extract callback context + MOVQ cbctxt_argsize(AX), DX + MOVQ cbctxt_gobody(AX), AX + + // preserve whatever's at the memory location that + // the callback will use to store the return value + LEAQ 8(SP), CX // args vector, skip return address + PUSHQ 0(CX)(DX*1) // store 8 bytes from just after the args array + ADDQ $8, DX // extend argsize by size of return value + + // DI SI BP BX R12 R13 R14 R15 registers and DF flag are preserved + // as required by windows callback convention. + PUSHFQ + SUBQ $64, SP + MOVQ DI, 56(SP) + MOVQ SI, 48(SP) + MOVQ BP, 40(SP) + MOVQ BX, 32(SP) + MOVQ R12, 24(SP) + MOVQ R13, 16(SP) + MOVQ R14, 8(SP) + MOVQ R15, 0(SP) + + // prepare call stack. use SUBQ to hide from stack frame checks + // cgocallback(Go func, void *frame, uintptr framesize) + SUBQ $24, SP + MOVQ DX, 16(SP) // argsize (including return value) + MOVQ CX, 8(SP) // callback parameters + MOVQ AX, 0(SP) // address of target Go function + CLD + CALL runtime·cgocallback_gofunc(SB) + MOVQ 0(SP), AX + MOVQ 8(SP), CX + MOVQ 16(SP), DX + ADDQ $24, SP + + // restore registers as required for windows callback + MOVQ 0(SP), R15 + MOVQ 8(SP), R14 + MOVQ 16(SP), R13 + MOVQ 24(SP), R12 + MOVQ 32(SP), BX + MOVQ 40(SP), BP + MOVQ 48(SP), SI + MOVQ 56(SP), DI + ADDQ $64, SP + POPFQ + + MOVL -8(CX)(DX*1), AX // return value + POPQ -8(CX)(DX*1) // restore bytes just after the args + RET + +// uint32 tstart_stdcall(M *newm); +TEXT runtime·tstart_stdcall(SB),NOSPLIT,$0 + // CX contains first arg newm + MOVQ m_g0(CX), DX // g + + // Layout new m scheduler stack on os stack. + MOVQ SP, AX + MOVQ AX, (g_stack+stack_hi)(DX) + SUBQ $(64*1024), AX // stack size + MOVQ AX, (g_stack+stack_lo)(DX) + ADDQ $const_StackGuard, AX + MOVQ AX, g_stackguard0(DX) + MOVQ AX, g_stackguard1(DX) + + // Set up tls. + LEAQ m_tls(CX), SI + MOVQ SI, 0x28(GS) + MOVQ CX, g_m(DX) + MOVQ DX, g(SI) + + // Someday the convention will be D is always cleared. + CLD + + CALL runtime·stackcheck(SB) // clobbers AX,CX + CALL runtime·mstart(SB) + + XORL AX, AX // return 0 == success + RET + +// set tls base to DI +TEXT runtime·settls(SB),NOSPLIT,$0 + MOVQ DI, 0x28(GS) + RET + +// Sleep duration is in 100ns units. +TEXT runtime·usleep1(SB),NOSPLIT,$0 + MOVL usec+0(FP), BX + MOVQ $runtime·usleep2(SB), AX // to hide from 6l + + // Execute call on m->g0 stack, in case we are not actually + // calling a system call wrapper, like when running under WINE. + get_tls(R15) + CMPQ R15, $0 + JNE 3(PC) + // Not a Go-managed thread. Do not switch stack. 
+ CALL AX
+ RET
+
+ MOVQ g(R15), R13
+ MOVQ g_m(R13), R13
+
+ // leave pc/sp for cpu profiler
+ MOVQ (SP), R12
+ MOVQ R12, m_libcallpc(R13)
+ MOVQ g(R15), R12
+ MOVQ R12, m_libcallg(R13)
+ // sp must be the last, because once async cpu profiler finds
+ // all three values to be non-zero, it will use them
+ LEAQ usec+0(FP), R12
+ MOVQ R12, m_libcallsp(R13)
+
+ MOVQ m_g0(R13), R14
+ CMPQ g(R15), R14
+ JNE usleep1_switch
+ // executing on m->g0 already
+ CALL AX
+ JMP usleep1_ret
+
+usleep1_switch:
+ // Switch to m->g0 stack and back.
+ MOVQ (g_sched+gobuf_sp)(R14), R14
+ MOVQ SP, -8(R14)
+ LEAQ -8(R14), SP
+ CALL AX
+ MOVQ 0(SP), SP
+
+usleep1_ret:
+ MOVQ $0, m_libcallsp(R13)
+ RET
+
+// Runs on OS stack. duration (in 100ns units) is in BX.
+TEXT runtime·usleep2(SB),NOSPLIT,$16
+ MOVQ SP, AX
+ ANDQ $~15, SP // alignment as per Windows requirement
+ MOVQ AX, 8(SP)
+ // Want negative 100ns units.
+ NEGQ BX
+ MOVQ SP, R8 // ptime
+ MOVQ BX, (R8)
+ MOVQ $-1, CX // handle
+ MOVQ $0, DX // alertable
+ MOVQ runtime·NtWaitForSingleObject(SB), AX
+ CALL AX
+ MOVQ 8(SP), SP
+ RET
+
+// func now() (sec int64, nsec int32)
+TEXT time·now(SB),NOSPLIT,$8-12
+ CALL runtime·unixnano(SB)
+ MOVQ 0(SP), AX
+
+ // generated code for
+ // func f(x uint64) (uint64, uint64) { return x/1000000000, x%1000000000 }
+ // adapted to reduce duplication
+ MOVQ AX, CX
+ MOVQ $1360296554856532783, AX
+ MULQ CX
+ ADDQ CX, DX
+ RCRQ $1, DX
+ SHRQ $29, DX
+ MOVQ DX, sec+0(FP)
+ IMULQ $1000000000, DX
+ SUBQ DX, CX
+ MOVL CX, nsec+8(FP)
+ RET
+
diff --git a/src/runtime/sys_x86.c b/src/runtime/sys_x86.c
new file mode 100644
index 000000000..a450b3e58
--- /dev/null
+++ b/src/runtime/sys_x86.c
@@ -0,0 +1,57 @@
+// Copyright 2013 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// +build amd64 amd64p32 386
+
+#include "runtime.h"
+
+// adjust Gobuf as if it executed a call to fn with context ctxt
+// and then did an immediate gosave.
+void
+runtime·gostartcall(Gobuf *gobuf, void (*fn)(void), void *ctxt)
+{
+ uintptr *sp;
+
+ sp = (uintptr*)gobuf->sp;
+ if(sizeof(uintreg) > sizeof(uintptr))
+ *--sp = 0;
+ *--sp = (uintptr)gobuf->pc;
+ gobuf->sp = (uintptr)sp;
+ gobuf->pc = (uintptr)fn;
+ gobuf->ctxt = ctxt;
+}
+
+// Called to rewind context saved during morestack back to beginning of function.
+// To help us, the linker emits a jmp back to the beginning right after the
+// call to morestack. We just have to decode and apply that jump.
+void
+runtime·rewindmorestack(Gobuf *gobuf)
+{
+ byte *pc;
+
+ pc = (byte*)gobuf->pc;
+ if(pc[0] == 0xe9) { // jmp 4-byte offset
+ gobuf->pc = gobuf->pc + 5 + *(int32*)(pc+1);
+ return;
+ }
+ if(pc[0] == 0xeb) { // jmp 1-byte offset
+ gobuf->pc = gobuf->pc + 2 + *(int8*)(pc+1);
+ return;
+ }
+ if(pc[0] == 0xcc) {
+ // This is a breakpoint inserted by gdb. We could use
+ // runtime·findfunc to find the function. But if we
+ // do that, then we will continue execution at the
+ // function entry point, and we will not hit the gdb
+ // breakpoint. So for this case we don't change
+ // gobuf->pc, so that when we return we will execute
+ // the jump instruction and carry on. This means that
+ // stack unwinding may not work entirely correctly
+ // (http://golang.org/issue/5723) but the user is
+ // running under gdb anyhow.
+ return; + } + runtime·printf("runtime: pc=%p %x %x %x %x %x\n", pc, pc[0], pc[1], pc[2], pc[3], pc[4]); + runtime·throw("runtime: misuse of rewindmorestack"); +} diff --git a/src/runtime/syscall_nacl.h b/src/runtime/syscall_nacl.h new file mode 100644 index 000000000..b33852ec8 --- /dev/null +++ b/src/runtime/syscall_nacl.h @@ -0,0 +1,71 @@ +// generated by mknacl.sh - do not edit +#define SYS_null 1 +#define SYS_nameservice 2 +#define SYS_dup 8 +#define SYS_dup2 9 +#define SYS_open 10 +#define SYS_close 11 +#define SYS_read 12 +#define SYS_write 13 +#define SYS_lseek 14 +#define SYS_ioctl 15 +#define SYS_stat 16 +#define SYS_fstat 17 +#define SYS_chmod 18 +#define SYS_brk 20 +#define SYS_mmap 21 +#define SYS_munmap 22 +#define SYS_getdents 23 +#define SYS_mprotect 24 +#define SYS_list_mappings 25 +#define SYS_exit 30 +#define SYS_getpid 31 +#define SYS_sched_yield 32 +#define SYS_sysconf 33 +#define SYS_gettimeofday 40 +#define SYS_clock 41 +#define SYS_nanosleep 42 +#define SYS_clock_getres 43 +#define SYS_clock_gettime 44 +#define SYS_mkdir 45 +#define SYS_rmdir 46 +#define SYS_chdir 47 +#define SYS_getcwd 48 +#define SYS_unlink 49 +#define SYS_imc_makeboundsock 60 +#define SYS_imc_accept 61 +#define SYS_imc_connect 62 +#define SYS_imc_sendmsg 63 +#define SYS_imc_recvmsg 64 +#define SYS_imc_mem_obj_create 65 +#define SYS_imc_socketpair 66 +#define SYS_mutex_create 70 +#define SYS_mutex_lock 71 +#define SYS_mutex_trylock 72 +#define SYS_mutex_unlock 73 +#define SYS_cond_create 74 +#define SYS_cond_wait 75 +#define SYS_cond_signal 76 +#define SYS_cond_broadcast 77 +#define SYS_cond_timed_wait_abs 79 +#define SYS_thread_create 80 +#define SYS_thread_exit 81 +#define SYS_tls_init 82 +#define SYS_thread_nice 83 +#define SYS_tls_get 84 +#define SYS_second_tls_set 85 +#define SYS_second_tls_get 86 +#define SYS_exception_handler 87 +#define SYS_exception_stack 88 +#define SYS_exception_clear_flag 89 +#define SYS_sem_create 100 +#define SYS_sem_wait 101 +#define SYS_sem_post 102 +#define SYS_sem_get_value 103 +#define SYS_dyncode_create 104 +#define SYS_dyncode_modify 105 +#define SYS_dyncode_delete 106 +#define SYS_test_infoleak 109 +#define SYS_test_crash 110 +#define SYS_test_syscall_1 111 +#define SYS_test_syscall_2 112 diff --git a/src/runtime/syscall_solaris.c b/src/runtime/syscall_solaris.c new file mode 100644 index 000000000..13ac31bde --- /dev/null +++ b/src/runtime/syscall_solaris.c @@ -0,0 +1,23 @@ +// Copyright 2014 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. 
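+
+// Roughly, each dynimport pragma below asks the link step to resolve the
+// named function from libc.so at load time and make its address available
+// through the corresponding libc· symbol. The thunks in
+// thunk_solaris_amd64.s (later in this change) load the address stored
+// there and jump through it, for example:
+//
+//	TEXT runtime·libc_chdir(SB),NOSPLIT,$0
+//		MOVQ	libc·chdir(SB), AX
+//		JMP	AX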
+
+#pragma dynimport libc·chdir chdir "libc.so"
+#pragma dynimport libc·chroot chroot "libc.so"
+#pragma dynimport libc·close close "libc.so"
+#pragma dynimport libc·dlclose dlclose "libc.so"
+#pragma dynimport libc·dlopen dlopen "libc.so"
+#pragma dynimport libc·dlsym dlsym "libc.so"
+#pragma dynimport libc·execve execve "libc.so"
+#pragma dynimport libc·fcntl fcntl "libc.so"
+#pragma dynimport libc·gethostname gethostname "libc.so"
+#pragma dynimport libc·ioctl ioctl "libc.so"
+#pragma dynimport libc·pipe pipe "libc.so"
+#pragma dynimport libc·setgid setgid "libc.so"
+#pragma dynimport libc·setgroups setgroups "libc.so"
+#pragma dynimport libc·setsid setsid "libc.so"
+#pragma dynimport libc·setuid setuid "libc.so"
+#pragma dynimport libc·setpgid setpgid "libc.so"
+#pragma dynimport libc·syscall syscall "libc.so"
+#pragma dynimport libc·forkx forkx "libc.so"
+#pragma dynimport libc·wait4 wait4 "libc.so"
diff --git a/src/runtime/syscall_solaris.go b/src/runtime/syscall_solaris.go
new file mode 100644
index 000000000..50d3a1d36
--- /dev/null
+++ b/src/runtime/syscall_solaris.go
@@ -0,0 +1,322 @@
+// Copyright 2014 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package runtime
+
+import "unsafe"
+
+var (
+ libc_chdir,
+ libc_chroot,
+ libc_close,
+ libc_dlopen,
+ libc_dlclose,
+ libc_dlsym,
+ libc_execve,
+ libc_exit,
+ libc_fcntl,
+ libc_forkx,
+ libc_gethostname,
+ libc_ioctl,
+ libc_pipe,
+ libc_setgid,
+ libc_setgroups,
+ libc_setsid,
+ libc_setuid,
+ libc_setpgid,
+ libc_syscall,
+ libc_wait4,
+ libc_write,
+ pipe1 libcFunc
+)
+
+//go:nosplit
+func syscall_sysvicall6(fn, nargs, a1, a2, a3, a4, a5, a6 uintptr) (r1, r2, err uintptr) {
+ call := libcall{
+ fn: fn,
+ n: nargs,
+ args: uintptr(unsafe.Pointer(&a1)),
+ }
+ entersyscallblock()
+ asmcgocall(unsafe.Pointer(&asmsysvicall6), unsafe.Pointer(&call))
+ exitsyscall()
+ return call.r1, call.r2, call.err
+}
+
+//go:nosplit
+func syscall_rawsysvicall6(fn, nargs, a1, a2, a3, a4, a5, a6 uintptr) (r1, r2, err uintptr) {
+ call := libcall{
+ fn: fn,
+ n: nargs,
+ args: uintptr(unsafe.Pointer(&a1)),
+ }
+ asmcgocall(unsafe.Pointer(&asmsysvicall6), unsafe.Pointer(&call))
+ return call.r1, call.r2, call.err
+}
+
+// TODO(aram): Once we remove all instances of C calling sysvicallN, make
+// sysvicallN return errors and replace the body of the following functions
+// with calls to sysvicallN.
+
+//go:nosplit
+func syscall_chdir(path uintptr) (err uintptr) {
+ call := libcall{
+ fn: uintptr(unsafe.Pointer(&libc_chdir)),
+ n: 1,
+ args: uintptr(unsafe.Pointer(&path)),
+ }
+ asmcgocall(unsafe.Pointer(&asmsysvicall6), unsafe.Pointer(&call))
+ return call.err
+}
+
+//go:nosplit
+func syscall_chroot(path uintptr) (err uintptr) {
+ call := libcall{
+ fn: uintptr(unsafe.Pointer(&libc_chroot)),
+ n: 1,
+ args: uintptr(unsafe.Pointer(&path)),
+ }
+ asmcgocall(unsafe.Pointer(&asmsysvicall6), unsafe.Pointer(&call))
+ return call.err
+}
+
+// like close, but must not split stack, for forkx.
+//go:nosplit +func syscall_close(fd int32) int32 { + return int32(sysvicall1(&libc_close, uintptr(fd))) +} + +func syscall_dlopen(name *byte, mode uintptr) (handle uintptr, err uintptr) { + call := libcall{ + fn: uintptr(unsafe.Pointer(&libc_dlopen)), + n: 2, + args: uintptr(unsafe.Pointer(&name)), + } + entersyscallblock() + asmcgocall(unsafe.Pointer(&asmsysvicall6), unsafe.Pointer(&call)) + exitsyscall() + if call.r1 == 0 { + return call.r1, call.err + } + return call.r1, 0 +} + +func syscall_dlclose(handle uintptr) (err uintptr) { + call := libcall{ + fn: uintptr(unsafe.Pointer(&libc_dlclose)), + n: 1, + args: uintptr(unsafe.Pointer(&handle)), + } + entersyscallblock() + asmcgocall(unsafe.Pointer(&asmsysvicall6), unsafe.Pointer(&call)) + exitsyscall() + return call.r1 +} + +func syscall_dlsym(handle uintptr, name *byte) (proc uintptr, err uintptr) { + call := libcall{ + fn: uintptr(unsafe.Pointer(&libc_dlsym)), + n: 2, + args: uintptr(unsafe.Pointer(&handle)), + } + entersyscallblock() + asmcgocall(unsafe.Pointer(&asmsysvicall6), unsafe.Pointer(&call)) + exitsyscall() + if call.r1 == 0 { + return call.r1, call.err + } + return call.r1, 0 +} + +//go:nosplit +func syscall_execve(path, argv, envp uintptr) (err uintptr) { + call := libcall{ + fn: uintptr(unsafe.Pointer(&libc_execve)), + n: 3, + args: uintptr(unsafe.Pointer(&path)), + } + asmcgocall(unsafe.Pointer(&asmsysvicall6), unsafe.Pointer(&call)) + return call.err +} + +// like exit, but must not split stack, for forkx. +//go:nosplit +func syscall_exit(code uintptr) { + sysvicall1(&libc_exit, code) +} + +//go:nosplit +func syscall_fcntl(fd, cmd, arg uintptr) (val, err uintptr) { + call := libcall{ + fn: uintptr(unsafe.Pointer(&libc_fcntl)), + n: 3, + args: uintptr(unsafe.Pointer(&fd)), + } + asmcgocall(unsafe.Pointer(&asmsysvicall6), unsafe.Pointer(&call)) + return call.r1, call.err +} + +//go:nosplit +func syscall_forkx(flags uintptr) (pid uintptr, err uintptr) { + call := libcall{ + fn: uintptr(unsafe.Pointer(&libc_forkx)), + n: 1, + args: uintptr(unsafe.Pointer(&flags)), + } + asmcgocall(unsafe.Pointer(&asmsysvicall6), unsafe.Pointer(&call)) + return call.r1, call.err +} + +func syscall_gethostname() (name string, err uintptr) { + cname := new([_MAXHOSTNAMELEN]byte) + var args = [2]uintptr{uintptr(unsafe.Pointer(&cname[0])), _MAXHOSTNAMELEN} + call := libcall{ + fn: uintptr(unsafe.Pointer(&libc_gethostname)), + n: 2, + args: uintptr(unsafe.Pointer(&args[0])), + } + entersyscallblock() + asmcgocall(unsafe.Pointer(&asmsysvicall6), unsafe.Pointer(&call)) + exitsyscall() + if call.r1 != 0 { + return "", call.err + } + cname[_MAXHOSTNAMELEN-1] = 0 + return gostringnocopy(&cname[0]), 0 +} + +//go:nosplit +func syscall_ioctl(fd, req, arg uintptr) (err uintptr) { + call := libcall{ + fn: uintptr(unsafe.Pointer(&libc_ioctl)), + n: 3, + args: uintptr(unsafe.Pointer(&fd)), + } + asmcgocall(unsafe.Pointer(&asmsysvicall6), unsafe.Pointer(&call)) + return call.err +} + +func syscall_pipe() (r, w, err uintptr) { + call := libcall{ + fn: uintptr(unsafe.Pointer(&pipe1)), + n: 0, + args: uintptr(unsafe.Pointer(&pipe1)), // it's unused but must be non-nil, otherwise crashes + } + entersyscallblock() + asmcgocall(unsafe.Pointer(&asmsysvicall6), unsafe.Pointer(&call)) + exitsyscall() + return call.r1, call.r2, call.err +} + +// This is syscall.RawSyscall, it exists to satisfy some build dependency, +// but it doesn't work correctly. +// +// DO NOT USE! +// +// TODO(aram): make this panic once we stop calling fcntl(2) in net using it. 
+func syscall_rawsyscall(trap, a1, a2, a3 uintptr) (r1, r2, err uintptr) { + call := libcall{ + fn: uintptr(unsafe.Pointer(&libc_syscall)), + n: 4, + args: uintptr(unsafe.Pointer(&trap)), + } + asmcgocall(unsafe.Pointer(&asmsysvicall6), unsafe.Pointer(&call)) + return call.r1, call.r2, call.err +} + +//go:nosplit +func syscall_setgid(gid uintptr) (err uintptr) { + call := libcall{ + fn: uintptr(unsafe.Pointer(&libc_setgid)), + n: 1, + args: uintptr(unsafe.Pointer(&gid)), + } + asmcgocall(unsafe.Pointer(&asmsysvicall6), unsafe.Pointer(&call)) + return call.err +} + +//go:nosplit +func syscall_setgroups(ngid, gid uintptr) (err uintptr) { + call := libcall{ + fn: uintptr(unsafe.Pointer(&libc_setgroups)), + n: 2, + args: uintptr(unsafe.Pointer(&ngid)), + } + asmcgocall(unsafe.Pointer(&asmsysvicall6), unsafe.Pointer(&call)) + return call.err +} + +//go:nosplit +func syscall_setsid() (pid, err uintptr) { + call := libcall{ + fn: uintptr(unsafe.Pointer(&libc_setsid)), + n: 0, + args: uintptr(unsafe.Pointer(&libc_setsid)), // it's unused but must be non-nil, otherwise crashes + } + asmcgocall(unsafe.Pointer(&asmsysvicall6), unsafe.Pointer(&call)) + return call.r1, call.err +} + +//go:nosplit +func syscall_setuid(uid uintptr) (err uintptr) { + call := libcall{ + fn: uintptr(unsafe.Pointer(&libc_setuid)), + n: 1, + args: uintptr(unsafe.Pointer(&uid)), + } + asmcgocall(unsafe.Pointer(&asmsysvicall6), unsafe.Pointer(&call)) + return call.err +} + +//go:nosplit +func syscall_setpgid(pid, pgid uintptr) (err uintptr) { + call := libcall{ + fn: uintptr(unsafe.Pointer(&libc_setpgid)), + n: 2, + args: uintptr(unsafe.Pointer(&pid)), + } + asmcgocall(unsafe.Pointer(&asmsysvicall6), unsafe.Pointer(&call)) + return call.err +} + +// This is syscall.Syscall, it exists to satisfy some build dependency, +// but it doesn't work correctly. +// +// DO NOT USE! +// +// TODO(aram): make this panic once we stop calling fcntl(2) in net using it. +func syscall_syscall(trap, a1, a2, a3 uintptr) (r1, r2, err uintptr) { + call := libcall{ + fn: uintptr(unsafe.Pointer(&libc_syscall)), + n: 4, + args: uintptr(unsafe.Pointer(&trap)), + } + entersyscallblock() + asmcgocall(unsafe.Pointer(&asmsysvicall6), unsafe.Pointer(&call)) + exitsyscall() + return call.r1, call.r2, call.err +} + +func syscall_wait4(pid uintptr, wstatus *uint32, options uintptr, rusage unsafe.Pointer) (wpid int, err uintptr) { + call := libcall{ + fn: uintptr(unsafe.Pointer(&libc_wait4)), + n: 4, + args: uintptr(unsafe.Pointer(&pid)), + } + entersyscallblock() + asmcgocall(unsafe.Pointer(&asmsysvicall6), unsafe.Pointer(&call)) + exitsyscall() + return int(call.r1), call.err +} + +//go:nosplit +func syscall_write(fd, buf, nbyte uintptr) (n, err uintptr) { + call := libcall{ + fn: uintptr(unsafe.Pointer(&libc_write)), + n: 3, + args: uintptr(unsafe.Pointer(&fd)), + } + asmcgocall(unsafe.Pointer(&asmsysvicall6), unsafe.Pointer(&call)) + return call.r1, call.err +} diff --git a/src/runtime/syscall_windows.go b/src/runtime/syscall_windows.go new file mode 100644 index 000000000..efbcab510 --- /dev/null +++ b/src/runtime/syscall_windows.go @@ -0,0 +1,170 @@ +// Copyright 2014 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. 
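+
+// Rough orientation for this file: package syscall's Windows entry points
+// reach the helpers below through the thunks in thunk_windows.s (also in
+// this change), e.g. syscall·Syscall jumps to runtime·syscall_Syscall.
+// An illustrative user-level call chain (kernel32 and GetTickCount are only
+// examples) is:
+//
+//	d, _ := syscall.LoadDLL("kernel32.dll") // -> syscall_loadlibrary
+//	p, _ := d.FindProc("GetTickCount")      // -> syscall_getprocaddress
+//	r, _, _ := p.Call()                     // -> syscall_Syscall, via asmstdcall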
+ +package runtime + +import ( + "unsafe" +) + +type callbacks struct { + lock mutex + ctxt [cb_max]*wincallbackcontext + n int +} + +func (c *wincallbackcontext) isCleanstack() bool { + return c.cleanstack +} + +func (c *wincallbackcontext) setCleanstack(cleanstack bool) { + c.cleanstack = cleanstack +} + +var ( + cbs callbacks + cbctxts **wincallbackcontext = &cbs.ctxt[0] // to simplify access to cbs.ctxt in sys_windows_*.s + + callbackasm byte // type isn't really byte, it's code in runtime +) + +// callbackasmAddr returns address of runtime.callbackasm +// function adjusted by i. +// runtime.callbackasm is just a series of CALL instructions +// (each is 5 bytes long), and we want callback to arrive at +// correspondent call instruction instead of start of +// runtime.callbackasm. +func callbackasmAddr(i int) uintptr { + return uintptr(add(unsafe.Pointer(&callbackasm), uintptr(i*5))) +} + +func compileCallback(fn eface, cleanstack bool) (code uintptr) { + if fn._type == nil || (fn._type.kind&kindMask) != kindFunc { + panic("compilecallback: not a function") + } + ft := (*functype)(unsafe.Pointer(fn._type)) + if len(ft.out) != 1 { + panic("compilecallback: function must have one output parameter") + } + uintptrSize := unsafe.Sizeof(uintptr(0)) + if t := (**_type)(unsafe.Pointer(&ft.out[0])); (*t).size != uintptrSize { + panic("compilecallback: output parameter size is wrong") + } + argsize := uintptr(0) + for _, t := range (*[1024](*_type))(unsafe.Pointer(&ft.in[0]))[:len(ft.in)] { + if (*t).size > uintptrSize { + panic("compilecallback: input parameter size is wrong") + } + argsize += uintptrSize + } + + lock(&cbs.lock) + defer unlock(&cbs.lock) + + n := cbs.n + for i := 0; i < n; i++ { + if cbs.ctxt[i].gobody == fn.data && cbs.ctxt[i].isCleanstack() == cleanstack { + return callbackasmAddr(i) + } + } + if n >= cb_max { + gothrow("too many callback functions") + } + + c := new(wincallbackcontext) + c.gobody = fn.data + c.argsize = argsize + c.setCleanstack(cleanstack) + if cleanstack && argsize != 0 { + c.restorestack = argsize + } else { + c.restorestack = 0 + } + cbs.ctxt[n] = c + cbs.n++ + + return callbackasmAddr(n) +} + +func getLoadLibrary() uintptr + +//go:nosplit +func syscall_loadlibrary(filename *uint16) (handle, err uintptr) { + var c libcall + c.fn = getLoadLibrary() + c.n = 1 + c.args = uintptr(unsafe.Pointer(&filename)) + cgocall_errno(unsafe.Pointer(funcPC(asmstdcall)), unsafe.Pointer(&c)) + handle = c.r1 + if handle == 0 { + err = c.err + } + return +} + +func getGetProcAddress() uintptr + +//go:nosplit +func syscall_getprocaddress(handle uintptr, procname *byte) (outhandle, err uintptr) { + var c libcall + c.fn = getGetProcAddress() + c.n = 2 + c.args = uintptr(unsafe.Pointer(&handle)) + cgocall_errno(unsafe.Pointer(funcPC(asmstdcall)), unsafe.Pointer(&c)) + outhandle = c.r1 + if outhandle == 0 { + err = c.err + } + return +} + +//go:nosplit +func syscall_Syscall(fn, nargs, a1, a2, a3 uintptr) (r1, r2, err uintptr) { + var c libcall + c.fn = fn + c.n = nargs + c.args = uintptr(unsafe.Pointer(&a1)) + cgocall_errno(unsafe.Pointer(funcPC(asmstdcall)), unsafe.Pointer(&c)) + return c.r1, c.r2, c.err +} + +//go:nosplit +func syscall_Syscall6(fn, nargs, a1, a2, a3, a4, a5, a6 uintptr) (r1, r2, err uintptr) { + var c libcall + c.fn = fn + c.n = nargs + c.args = uintptr(unsafe.Pointer(&a1)) + cgocall_errno(unsafe.Pointer(funcPC(asmstdcall)), unsafe.Pointer(&c)) + return c.r1, c.r2, c.err +} + +//go:nosplit +func syscall_Syscall9(fn, nargs, a1, a2, a3, a4, a5, a6, a7, a8, a9 
uintptr) (r1, r2, err uintptr) { + var c libcall + c.fn = fn + c.n = nargs + c.args = uintptr(unsafe.Pointer(&a1)) + cgocall_errno(unsafe.Pointer(funcPC(asmstdcall)), unsafe.Pointer(&c)) + return c.r1, c.r2, c.err +} + +//go:nosplit +func syscall_Syscall12(fn, nargs, a1, a2, a3, a4, a5, a6, a7, a8, a9, a10, a11, a12 uintptr) (r1, r2, err uintptr) { + var c libcall + c.fn = fn + c.n = nargs + c.args = uintptr(unsafe.Pointer(&a1)) + cgocall_errno(unsafe.Pointer(funcPC(asmstdcall)), unsafe.Pointer(&c)) + return c.r1, c.r2, c.err +} + +//go:nosplit +func syscall_Syscall15(fn, nargs, a1, a2, a3, a4, a5, a6, a7, a8, a9, a10, a11, a12, a13, a14, a15 uintptr) (r1, r2, err uintptr) { + var c libcall + c.fn = fn + c.n = nargs + c.args = uintptr(unsafe.Pointer(&a1)) + cgocall_errno(unsafe.Pointer(funcPC(asmstdcall)), unsafe.Pointer(&c)) + return c.r1, c.r2, c.err +} diff --git a/src/runtime/syscall_windows_test.go b/src/runtime/syscall_windows_test.go new file mode 100644 index 000000000..ce8a9ec1b --- /dev/null +++ b/src/runtime/syscall_windows_test.go @@ -0,0 +1,535 @@ +// Copyright 2010 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package runtime_test + +import ( + "fmt" + "io/ioutil" + "os" + "os/exec" + "path/filepath" + "runtime" + "strings" + "syscall" + "testing" + "unsafe" +) + +type DLL struct { + *syscall.DLL + t *testing.T +} + +func GetDLL(t *testing.T, name string) *DLL { + d, e := syscall.LoadDLL(name) + if e != nil { + t.Fatal(e) + } + return &DLL{DLL: d, t: t} +} + +func (d *DLL) Proc(name string) *syscall.Proc { + p, e := d.FindProc(name) + if e != nil { + d.t.Fatal(e) + } + return p +} + +func TestStdCall(t *testing.T) { + type Rect struct { + left, top, right, bottom int32 + } + res := Rect{} + expected := Rect{1, 1, 40, 60} + a, _, _ := GetDLL(t, "user32.dll").Proc("UnionRect").Call( + uintptr(unsafe.Pointer(&res)), + uintptr(unsafe.Pointer(&Rect{10, 1, 14, 60})), + uintptr(unsafe.Pointer(&Rect{1, 2, 40, 50}))) + if a != 1 || res.left != expected.left || + res.top != expected.top || + res.right != expected.right || + res.bottom != expected.bottom { + t.Error("stdcall USER32.UnionRect returns", a, "res=", res) + } +} + +func Test64BitReturnStdCall(t *testing.T) { + + const ( + VER_BUILDNUMBER = 0x0000004 + VER_MAJORVERSION = 0x0000002 + VER_MINORVERSION = 0x0000001 + VER_PLATFORMID = 0x0000008 + VER_PRODUCT_TYPE = 0x0000080 + VER_SERVICEPACKMAJOR = 0x0000020 + VER_SERVICEPACKMINOR = 0x0000010 + VER_SUITENAME = 0x0000040 + + VER_EQUAL = 1 + VER_GREATER = 2 + VER_GREATER_EQUAL = 3 + VER_LESS = 4 + VER_LESS_EQUAL = 5 + + ERROR_OLD_WIN_VERSION syscall.Errno = 1150 + ) + + type OSVersionInfoEx struct { + OSVersionInfoSize uint32 + MajorVersion uint32 + MinorVersion uint32 + BuildNumber uint32 + PlatformId uint32 + CSDVersion [128]uint16 + ServicePackMajor uint16 + ServicePackMinor uint16 + SuiteMask uint16 + ProductType byte + Reserve byte + } + + d := GetDLL(t, "kernel32.dll") + + var m1, m2 uintptr + VerSetConditionMask := d.Proc("VerSetConditionMask") + m1, m2, _ = VerSetConditionMask.Call(m1, m2, VER_MAJORVERSION, VER_GREATER_EQUAL) + m1, m2, _ = VerSetConditionMask.Call(m1, m2, VER_MINORVERSION, VER_GREATER_EQUAL) + m1, m2, _ = VerSetConditionMask.Call(m1, m2, VER_SERVICEPACKMAJOR, VER_GREATER_EQUAL) + m1, m2, _ = VerSetConditionMask.Call(m1, m2, VER_SERVICEPACKMINOR, VER_GREATER_EQUAL) + + vi := OSVersionInfoEx{ + MajorVersion: 5, + MinorVersion: 1, + ServicePackMajor: 2, + 
ServicePackMinor: 0, + } + vi.OSVersionInfoSize = uint32(unsafe.Sizeof(vi)) + r, _, e2 := d.Proc("VerifyVersionInfoW").Call( + uintptr(unsafe.Pointer(&vi)), + VER_MAJORVERSION|VER_MINORVERSION|VER_SERVICEPACKMAJOR|VER_SERVICEPACKMINOR, + m1, m2) + if r == 0 && e2 != ERROR_OLD_WIN_VERSION { + t.Errorf("VerifyVersionInfo failed: %s", e2) + } +} + +func TestCDecl(t *testing.T) { + var buf [50]byte + fmtp, _ := syscall.BytePtrFromString("%d %d %d") + a, _, _ := GetDLL(t, "user32.dll").Proc("wsprintfA").Call( + uintptr(unsafe.Pointer(&buf[0])), + uintptr(unsafe.Pointer(fmtp)), + 1000, 2000, 3000) + if string(buf[:a]) != "1000 2000 3000" { + t.Error("cdecl USER32.wsprintfA returns", a, "buf=", buf[:a]) + } +} + +func TestEnumWindows(t *testing.T) { + d := GetDLL(t, "user32.dll") + isWindows := d.Proc("IsWindow") + counter := 0 + cb := syscall.NewCallback(func(hwnd syscall.Handle, lparam uintptr) uintptr { + if lparam != 888 { + t.Error("lparam was not passed to callback") + } + b, _, _ := isWindows.Call(uintptr(hwnd)) + if b == 0 { + t.Error("USER32.IsWindow returns FALSE") + } + counter++ + return 1 // continue enumeration + }) + a, _, _ := d.Proc("EnumWindows").Call(cb, 888) + if a == 0 { + t.Error("USER32.EnumWindows returns FALSE") + } + if counter == 0 { + t.Error("Callback has been never called or your have no windows") + } +} + +func callback(hwnd syscall.Handle, lparam uintptr) uintptr { + (*(*func())(unsafe.Pointer(&lparam)))() + return 0 // stop enumeration +} + +// nestedCall calls into Windows, back into Go, and finally to f. +func nestedCall(t *testing.T, f func()) { + c := syscall.NewCallback(callback) + d := GetDLL(t, "user32.dll") + defer d.Release() + d.Proc("EnumWindows").Call(c, uintptr(*(*unsafe.Pointer)(unsafe.Pointer(&f)))) +} + +func TestCallback(t *testing.T) { + var x = false + nestedCall(t, func() { x = true }) + if !x { + t.Fatal("nestedCall did not call func") + } +} + +func TestCallbackGC(t *testing.T) { + nestedCall(t, runtime.GC) +} + +func TestCallbackPanicLocked(t *testing.T) { + runtime.LockOSThread() + defer runtime.UnlockOSThread() + + if !runtime.LockedOSThread() { + t.Fatal("runtime.LockOSThread didn't") + } + defer func() { + s := recover() + if s == nil { + t.Fatal("did not panic") + } + if s.(string) != "callback panic" { + t.Fatal("wrong panic:", s) + } + if !runtime.LockedOSThread() { + t.Fatal("lost lock on OS thread after panic") + } + }() + nestedCall(t, func() { panic("callback panic") }) + panic("nestedCall returned") +} + +func TestCallbackPanic(t *testing.T) { + // Make sure panic during callback unwinds properly. + if runtime.LockedOSThread() { + t.Fatal("locked OS thread on entry to TestCallbackPanic") + } + defer func() { + s := recover() + if s == nil { + t.Fatal("did not panic") + } + if s.(string) != "callback panic" { + t.Fatal("wrong panic:", s) + } + if runtime.LockedOSThread() { + t.Fatal("locked OS thread on exit from TestCallbackPanic") + } + }() + nestedCall(t, func() { panic("callback panic") }) + panic("nestedCall returned") +} + +func TestCallbackPanicLoop(t *testing.T) { + // Make sure we don't blow out m->g0 stack. 
+ for i := 0; i < 100000; i++ { + TestCallbackPanic(t) + } +} + +func TestBlockingCallback(t *testing.T) { + c := make(chan int) + go func() { + for i := 0; i < 10; i++ { + c <- <-c + } + }() + nestedCall(t, func() { + for i := 0; i < 10; i++ { + c <- i + if j := <-c; j != i { + t.Errorf("out of sync %d != %d", j, i) + } + } + }) +} + +func TestCallbackInAnotherThread(t *testing.T) { + // TODO: test a function which calls back in another thread: QueueUserAPC() or CreateThread() +} + +type cbDLLFunc int // int determines number of callback parameters + +func (f cbDLLFunc) stdcallName() string { + return fmt.Sprintf("stdcall%d", f) +} + +func (f cbDLLFunc) cdeclName() string { + return fmt.Sprintf("cdecl%d", f) +} + +func (f cbDLLFunc) buildOne(stdcall bool) string { + var funcname, attr string + if stdcall { + funcname = f.stdcallName() + attr = "__stdcall" + } else { + funcname = f.cdeclName() + attr = "__cdecl" + } + typename := "t" + funcname + p := make([]string, f) + for i := range p { + p[i] = "void*" + } + params := strings.Join(p, ",") + for i := range p { + p[i] = fmt.Sprintf("%d", i+1) + } + args := strings.Join(p, ",") + return fmt.Sprintf(` +typedef void %s (*%s)(%s); +void %s(%s f, void *n) { + int i; + for(i=0;i<(int)n;i++){ + f(%s); + } +} + `, attr, typename, params, funcname, typename, args) +} + +func (f cbDLLFunc) build() string { + return f.buildOne(false) + f.buildOne(true) +} + +var cbFuncs = [...]interface{}{ + 2: func(i1, i2 uintptr) uintptr { + if i1+i2 != 3 { + panic("bad input") + } + return 0 + }, + 3: func(i1, i2, i3 uintptr) uintptr { + if i1+i2+i3 != 6 { + panic("bad input") + } + return 0 + }, + 4: func(i1, i2, i3, i4 uintptr) uintptr { + if i1+i2+i3+i4 != 10 { + panic("bad input") + } + return 0 + }, + 5: func(i1, i2, i3, i4, i5 uintptr) uintptr { + if i1+i2+i3+i4+i5 != 15 { + panic("bad input") + } + return 0 + }, + 6: func(i1, i2, i3, i4, i5, i6 uintptr) uintptr { + if i1+i2+i3+i4+i5+i6 != 21 { + panic("bad input") + } + return 0 + }, + 7: func(i1, i2, i3, i4, i5, i6, i7 uintptr) uintptr { + if i1+i2+i3+i4+i5+i6+i7 != 28 { + panic("bad input") + } + return 0 + }, + 8: func(i1, i2, i3, i4, i5, i6, i7, i8 uintptr) uintptr { + if i1+i2+i3+i4+i5+i6+i7+i8 != 36 { + panic("bad input") + } + return 0 + }, + 9: func(i1, i2, i3, i4, i5, i6, i7, i8, i9 uintptr) uintptr { + if i1+i2+i3+i4+i5+i6+i7+i8+i9 != 45 { + panic("bad input") + } + return 0 + }, +} + +type cbDLL struct { + name string + buildArgs func(out, src string) []string +} + +func (d *cbDLL) buildSrc(t *testing.T, path string) { + f, err := os.Create(path) + if err != nil { + t.Fatalf("failed to create source file: %v", err) + } + defer f.Close() + + for i := 2; i < 10; i++ { + fmt.Fprint(f, cbDLLFunc(i).build()) + } +} + +func (d *cbDLL) build(t *testing.T, dir string) string { + srcname := d.name + ".c" + d.buildSrc(t, filepath.Join(dir, srcname)) + outname := d.name + ".dll" + args := d.buildArgs(outname, srcname) + cmd := exec.Command(args[0], args[1:]...) 
+ cmd.Dir = dir + out, err := cmd.CombinedOutput() + if err != nil { + t.Fatalf("failed to build dll: %v - %v", err, string(out)) + } + return filepath.Join(dir, outname) +} + +var cbDLLs = []cbDLL{ + { + "test", + func(out, src string) []string { + return []string{"gcc", "-shared", "-s", "-o", out, src} + }, + }, + { + "testO2", + func(out, src string) []string { + return []string{"gcc", "-shared", "-s", "-o", out, "-O2", src} + }, + }, +} + +type cbTest struct { + n int // number of callback parameters + param uintptr // dll function parameter +} + +func (test *cbTest) run(t *testing.T, dllpath string) { + dll := syscall.MustLoadDLL(dllpath) + defer dll.Release() + cb := cbFuncs[test.n] + stdcall := syscall.NewCallback(cb) + f := cbDLLFunc(test.n) + test.runOne(t, dll, f.stdcallName(), stdcall) + cdecl := syscall.NewCallbackCDecl(cb) + test.runOne(t, dll, f.cdeclName(), cdecl) +} + +func (test *cbTest) runOne(t *testing.T, dll *syscall.DLL, proc string, cb uintptr) { + defer func() { + if r := recover(); r != nil { + t.Errorf("dll call %v(..., %d) failed: %v", proc, test.param, r) + } + }() + dll.MustFindProc(proc).Call(cb, test.param) +} + +var cbTests = []cbTest{ + {2, 1}, + {2, 10000}, + {3, 3}, + {4, 5}, + {4, 6}, + {5, 2}, + {6, 7}, + {6, 8}, + {7, 6}, + {8, 1}, + {9, 8}, + {9, 10000}, + {3, 4}, + {5, 3}, + {7, 7}, + {8, 2}, + {9, 9}, +} + +func TestStdcallAndCDeclCallbacks(t *testing.T) { + tmp, err := ioutil.TempDir("", "TestCDeclCallback") + if err != nil { + t.Fatal("TempDir failed: ", err) + } + defer os.RemoveAll(tmp) + + for _, dll := range cbDLLs { + dllPath := dll.build(t, tmp) + for _, test := range cbTests { + test.run(t, dllPath) + } + } +} + +func TestRegisterClass(t *testing.T) { + kernel32 := GetDLL(t, "kernel32.dll") + user32 := GetDLL(t, "user32.dll") + mh, _, _ := kernel32.Proc("GetModuleHandleW").Call(0) + cb := syscall.NewCallback(func(hwnd syscall.Handle, msg uint32, wparam, lparam uintptr) (rc uintptr) { + t.Fatal("callback should never get called") + return 0 + }) + type Wndclassex struct { + Size uint32 + Style uint32 + WndProc uintptr + ClsExtra int32 + WndExtra int32 + Instance syscall.Handle + Icon syscall.Handle + Cursor syscall.Handle + Background syscall.Handle + MenuName *uint16 + ClassName *uint16 + IconSm syscall.Handle + } + name := syscall.StringToUTF16Ptr("test_window") + wc := Wndclassex{ + WndProc: cb, + Instance: syscall.Handle(mh), + ClassName: name, + } + wc.Size = uint32(unsafe.Sizeof(wc)) + a, _, err := user32.Proc("RegisterClassExW").Call(uintptr(unsafe.Pointer(&wc))) + if a == 0 { + t.Fatalf("RegisterClassEx failed: %v", err) + } + r, _, err := user32.Proc("UnregisterClassW").Call(uintptr(unsafe.Pointer(name)), 0) + if r == 0 { + t.Fatalf("UnregisterClass failed: %v", err) + } +} + +func TestOutputDebugString(t *testing.T) { + d := GetDLL(t, "kernel32.dll") + p := syscall.StringToUTF16Ptr("testing OutputDebugString") + d.Proc("OutputDebugStringW").Call(uintptr(unsafe.Pointer(p))) +} + +func TestRaiseException(t *testing.T) { + o := executeTest(t, raiseExceptionSource, nil) + if strings.Contains(o, "RaiseException should not return") { + t.Fatalf("RaiseException did not crash program: %v", o) + } + if !strings.Contains(o, "Exception 0xbad") { + t.Fatalf("No stack trace: %v", o) + } +} + +const raiseExceptionSource = ` +package main +import "syscall" +func main() { + const EXCEPTION_NONCONTINUABLE = 1 + mod := syscall.MustLoadDLL("kernel32.dll") + proc := mod.MustFindProc("RaiseException") + proc.Call(0xbad, EXCEPTION_NONCONTINUABLE, 0, 0) 
+ println("RaiseException should not return") +} +` + +func TestZeroDivisionException(t *testing.T) { + o := executeTest(t, zeroDivisionExceptionSource, nil) + if !strings.Contains(o, "panic: runtime error: integer divide by zero") { + t.Fatalf("No stack trace: %v", o) + } +} + +const zeroDivisionExceptionSource = ` +package main +func main() { + x := 1 + y := 0 + z := x / y + println(z) +} +` diff --git a/src/runtime/thunk.s b/src/runtime/thunk.s new file mode 100644 index 000000000..0a0f147c4 --- /dev/null +++ b/src/runtime/thunk.s @@ -0,0 +1,183 @@ +// Copyright 2014 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// This file exposes various internal runtime functions to other packages in std lib. + +#include "zasm_GOOS_GOARCH.h" +#include "textflag.h" + +#ifdef GOARCH_arm +#define JMP B +#endif + +TEXT net·runtimeNano(SB),NOSPLIT,$0-0 + JMP runtime·nanotime(SB) + +TEXT time·runtimeNano(SB),NOSPLIT,$0-0 + JMP runtime·nanotime(SB) + +TEXT time·Sleep(SB),NOSPLIT,$0-0 + JMP runtime·timeSleep(SB) + +TEXT time·startTimer(SB),NOSPLIT,$0-0 + JMP runtime·startTimer(SB) + +TEXT time·stopTimer(SB),NOSPLIT,$0-0 + JMP runtime·stopTimer(SB) + +TEXT sync·runtime_Syncsemacquire(SB),NOSPLIT,$0-0 + JMP runtime·syncsemacquire(SB) + +TEXT sync·runtime_Syncsemrelease(SB),NOSPLIT,$0-0 + JMP runtime·syncsemrelease(SB) + +TEXT sync·runtime_Syncsemcheck(SB),NOSPLIT,$0-0 + JMP runtime·syncsemcheck(SB) + +TEXT sync·runtime_Semacquire(SB),NOSPLIT,$0-0 + JMP runtime·asyncsemacquire(SB) + +TEXT sync·runtime_Semrelease(SB),NOSPLIT,$0-0 + JMP runtime·asyncsemrelease(SB) + +TEXT sync·runtime_registerPoolCleanup(SB),NOSPLIT,$0-0 + JMP runtime·registerPoolCleanup(SB) + +TEXT net·runtime_Semacquire(SB),NOSPLIT,$0-0 + JMP runtime·asyncsemacquire(SB) + +TEXT net·runtime_Semrelease(SB),NOSPLIT,$0-0 + JMP runtime·asyncsemrelease(SB) + +TEXT runtime∕pprof·runtime_cyclesPerSecond(SB),NOSPLIT,$0-0 + JMP runtime·tickspersecond(SB) + +TEXT bytes·Compare(SB),NOSPLIT,$0-0 + JMP runtime·cmpbytes(SB) + +TEXT reflect·call(SB), NOSPLIT, $0-0 + JMP runtime·reflectcall(SB) + +TEXT reflect·chanclose(SB), NOSPLIT, $0-0 + JMP runtime·closechan(SB) + +TEXT reflect·chanlen(SB), NOSPLIT, $0-0 + JMP runtime·reflect_chanlen(SB) + +TEXT reflect·chancap(SB), NOSPLIT, $0-0 + JMP runtime·reflect_chancap(SB) + +TEXT reflect·chansend(SB), NOSPLIT, $0-0 + JMP runtime·reflect_chansend(SB) + +TEXT reflect·chanrecv(SB), NOSPLIT, $0-0 + JMP runtime·reflect_chanrecv(SB) + +TEXT reflect·memmove(SB), NOSPLIT, $0-0 + JMP runtime·memmove(SB) + +TEXT runtime∕debug·freeOSMemory(SB), NOSPLIT, $0-0 + JMP runtime·freeOSMemory(SB) + +TEXT runtime∕debug·WriteHeapDump(SB), NOSPLIT, $0-0 + JMP runtime·writeHeapDump(SB) + +TEXT net·runtime_pollServerInit(SB),NOSPLIT,$0-0 + JMP runtime·netpollServerInit(SB) + +TEXT net·runtime_pollOpen(SB),NOSPLIT,$0-0 + JMP runtime·netpollOpen(SB) + +TEXT net·runtime_pollClose(SB),NOSPLIT,$0-0 + JMP runtime·netpollClose(SB) + +TEXT net·runtime_pollReset(SB),NOSPLIT,$0-0 + JMP runtime·netpollReset(SB) + +TEXT net·runtime_pollWait(SB),NOSPLIT,$0-0 + JMP runtime·netpollWait(SB) + +TEXT net·runtime_pollWaitCanceled(SB),NOSPLIT,$0-0 + JMP runtime·netpollWaitCanceled(SB) + +TEXT net·runtime_pollSetDeadline(SB),NOSPLIT,$0-0 + JMP runtime·netpollSetDeadline(SB) + +TEXT net·runtime_pollUnblock(SB),NOSPLIT,$0-0 + JMP runtime·netpollUnblock(SB) + +TEXT syscall·setenv_c(SB), NOSPLIT, $0-0 + JMP runtime·syscall_setenv_c(SB) + +TEXT 
syscall·unsetenv_c(SB), NOSPLIT, $0-0 + JMP runtime·syscall_unsetenv_c(SB) + +TEXT reflect·makemap(SB),NOSPLIT,$0-0 + JMP runtime·reflect_makemap(SB) + +TEXT reflect·mapaccess(SB),NOSPLIT,$0-0 + JMP runtime·reflect_mapaccess(SB) + +TEXT reflect·mapassign(SB),NOSPLIT,$0-0 + JMP runtime·reflect_mapassign(SB) + +TEXT reflect·mapdelete(SB),NOSPLIT,$0-0 + JMP runtime·reflect_mapdelete(SB) + +TEXT reflect·mapiterinit(SB),NOSPLIT,$0-0 + JMP runtime·reflect_mapiterinit(SB) + +TEXT reflect·mapiterkey(SB),NOSPLIT,$0-0 + JMP runtime·reflect_mapiterkey(SB) + +TEXT reflect·mapiternext(SB),NOSPLIT,$0-0 + JMP runtime·reflect_mapiternext(SB) + +TEXT reflect·maplen(SB),NOSPLIT,$0-0 + JMP runtime·reflect_maplen(SB) + +TEXT reflect·ismapkey(SB),NOSPLIT,$0-0 + JMP runtime·reflect_ismapkey(SB) + +TEXT reflect·ifaceE2I(SB),NOSPLIT,$0-0 + JMP runtime·reflect_ifaceE2I(SB) + +TEXT reflect·unsafe_New(SB),NOSPLIT,$0-0 + JMP runtime·newobject(SB) + +TEXT reflect·unsafe_NewArray(SB),NOSPLIT,$0-0 + JMP runtime·newarray(SB) + +TEXT reflect·makechan(SB),NOSPLIT,$0-0 + JMP runtime·makechan(SB) + +TEXT reflect·rselect(SB),NOSPLIT,$0-0 + JMP runtime·reflect_rselect(SB) + +TEXT os·sigpipe(SB),NOSPLIT,$0-0 + JMP runtime·os_sigpipe(SB) + +TEXT runtime·runtime_init(SB),NOSPLIT,$0-0 + JMP runtime·init(SB) + +TEXT runtime·main_init(SB),NOSPLIT,$0-0 + JMP main·init(SB) + +TEXT runtime·main_main(SB),NOSPLIT,$0-0 + JMP main·main(SB) + +TEXT runtime·timenow(SB),NOSPLIT,$0-0 + JMP time·now(SB) + +TEXT sync∕atomic·runtime_procPin(SB),NOSPLIT,$0-0 + JMP sync·runtime_procPin(SB) + +TEXT sync∕atomic·runtime_procUnpin(SB),NOSPLIT,$0-0 + JMP sync·runtime_procUnpin(SB) + +TEXT syscall·runtime_envs(SB),NOSPLIT,$0-0 + JMP runtime·runtime_envs(SB) + +TEXT os·runtime_args(SB),NOSPLIT,$0-0 + JMP runtime·runtime_args(SB) diff --git a/src/runtime/thunk_solaris_amd64.s b/src/runtime/thunk_solaris_amd64.s new file mode 100644 index 000000000..f61188c14 --- /dev/null +++ b/src/runtime/thunk_solaris_amd64.s @@ -0,0 +1,88 @@ +// Copyright 2014 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// This file exposes various external library functions to Go code in the runtime. 
+ +#include "zasm_GOOS_GOARCH.h" +#include "textflag.h" + +TEXT runtime·libc_chdir(SB),NOSPLIT,$0 + MOVQ libc·chdir(SB), AX + JMP AX + +TEXT runtime·libc_chroot(SB),NOSPLIT,$0 + MOVQ libc·chroot(SB), AX + JMP AX + +TEXT runtime·libc_close(SB),NOSPLIT,$0 + MOVQ libc·close(SB), AX + JMP AX + +TEXT runtime·libc_dlopen(SB),NOSPLIT,$0 + MOVQ libc·dlopen(SB), AX + JMP AX + +TEXT runtime·libc_dlclose(SB),NOSPLIT,$0 + MOVQ libc·dlclose(SB), AX + JMP AX + +TEXT runtime·libc_dlsym(SB),NOSPLIT,$0 + MOVQ libc·dlsym(SB), AX + JMP AX + +TEXT runtime·libc_execve(SB),NOSPLIT,$0 + MOVQ libc·execve(SB), AX + JMP AX + +TEXT runtime·libc_exit(SB),NOSPLIT,$0 + MOVQ libc·exit(SB), AX + JMP AX + +TEXT runtime·libc_fcntl(SB),NOSPLIT,$0 + MOVQ libc·fcntl(SB), AX + JMP AX + +TEXT runtime·libc_forkx(SB),NOSPLIT,$0 + MOVQ libc·forkx(SB), AX + JMP AX + +TEXT runtime·libc_gethostname(SB),NOSPLIT,$0 + MOVQ libc·gethostname(SB), AX + JMP AX + +TEXT runtime·libc_ioctl(SB),NOSPLIT,$0 + MOVQ libc·ioctl(SB), AX + JMP AX + +TEXT runtime·libc_setgid(SB),NOSPLIT,$0 + MOVQ libc·setgid(SB), AX + JMP AX + +TEXT runtime·libc_setgroups(SB),NOSPLIT,$0 + MOVQ libc·setgroups(SB), AX + JMP AX + +TEXT runtime·libc_setsid(SB),NOSPLIT,$0 + MOVQ libc·setsid(SB), AX + JMP AX + +TEXT runtime·libc_setuid(SB),NOSPLIT,$0 + MOVQ libc·setuid(SB), AX + JMP AX + +TEXT runtime·libc_setpgid(SB),NOSPLIT,$0 + MOVQ libc·setpgid(SB), AX + JMP AX + +TEXT runtime·libc_syscall(SB),NOSPLIT,$0 + MOVQ libc·syscall(SB), AX + JMP AX + +TEXT runtime·libc_wait4(SB),NOSPLIT,$0 + MOVQ libc·wait4(SB), AX + JMP AX + +TEXT runtime·libc_write(SB),NOSPLIT,$0 + MOVQ libc·write(SB), AX + JMP AX diff --git a/src/runtime/thunk_windows.s b/src/runtime/thunk_windows.s new file mode 100644 index 000000000..7ccb98fd4 --- /dev/null +++ b/src/runtime/thunk_windows.s @@ -0,0 +1,30 @@ +// Copyright 2014 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#include "zasm_GOOS_GOARCH.h" +#include "textflag.h" + +TEXT syscall·Syscall(SB),NOSPLIT,$0-0 + JMP runtime·syscall_Syscall(SB) + +TEXT syscall·Syscall6(SB),NOSPLIT,$0-0 + JMP runtime·syscall_Syscall6(SB) + +TEXT syscall·Syscall9(SB),NOSPLIT,$0-0 + JMP runtime·syscall_Syscall9(SB) + +TEXT syscall·Syscall12(SB),NOSPLIT,$0-0 + JMP runtime·syscall_Syscall12(SB) + +TEXT syscall·Syscall15(SB),NOSPLIT,$0-0 + JMP runtime·syscall_Syscall15(SB) + +TEXT syscall·loadlibrary(SB),NOSPLIT,$0-0 + JMP runtime·syscall_loadlibrary(SB) + +TEXT syscall·getprocaddress(SB),NOSPLIT,$0-0 + JMP runtime·syscall_getprocaddress(SB) + +TEXT syscall·compileCallback(SB),NOSPLIT,$0 + JMP runtime·compileCallback(SB) diff --git a/src/runtime/time.go b/src/runtime/time.go new file mode 100644 index 000000000..11862c7e2 --- /dev/null +++ b/src/runtime/time.go @@ -0,0 +1,289 @@ +// Copyright 2009 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// Time-related runtime and pieces of package time. + +package runtime + +import "unsafe" + +// Package time knows the layout of this structure. +// If this struct changes, adjust ../time/sleep.go:/runtimeTimer. +// For GOOS=nacl, package syscall knows the layout of this structure. +// If this struct changes, adjust ../syscall/net_nacl.go:/runtimeTimer. +type timer struct { + i int // heap index + + // Timer wakes up at when, and then at when+period, ... 
(period > 0 only) + // each time calling f(now, arg) in the timer goroutine, so f must be + // a well-behaved function and not block. + when int64 + period int64 + f func(interface{}, uintptr) + arg interface{} + seq uintptr +} + +var timers struct { + lock mutex + gp *g + created bool + sleeping bool + rescheduling bool + waitnote note + t []*timer +} + +// nacl fake time support - time in nanoseconds since 1970 +var faketime int64 + +// Package time APIs. +// Godoc uses the comments in package time, not these. + +// time.now is implemented in assembly. + +// Sleep puts the current goroutine to sleep for at least ns nanoseconds. +func timeSleep(ns int64) { + if ns <= 0 { + return + } + + t := new(timer) + t.when = nanotime() + ns + t.f = goroutineReady + t.arg = getg() + lock(&timers.lock) + addtimerLocked(t) + goparkunlock(&timers.lock, "sleep") +} + +// startTimer adds t to the timer heap. +func startTimer(t *timer) { + if raceenabled { + racerelease(unsafe.Pointer(t)) + } + addtimer(t) +} + +// stopTimer removes t from the timer heap if it is there. +// It returns true if t was removed, false if t wasn't even there. +func stopTimer(t *timer) bool { + return deltimer(t) +} + +// Go runtime. + +// Ready the goroutine arg. +func goroutineReady(arg interface{}, seq uintptr) { + goready(arg.(*g)) +} + +func addtimer(t *timer) { + lock(&timers.lock) + addtimerLocked(t) + unlock(&timers.lock) +} + +// Add a timer to the heap and start or kick the timer proc. +// If the new timer is earlier than any of the others. +// Timers are locked. +func addtimerLocked(t *timer) { + // when must never be negative; otherwise timerproc will overflow + // during its delta calculation and never expire other runtime·timers. + if t.when < 0 { + t.when = 1<<63 - 1 + } + t.i = len(timers.t) + timers.t = append(timers.t, t) + siftupTimer(t.i) + if t.i == 0 { + // siftup moved to top: new earliest deadline. + if timers.sleeping { + timers.sleeping = false + notewakeup(&timers.waitnote) + } + if timers.rescheduling { + timers.rescheduling = false + goready(timers.gp) + } + } + if !timers.created { + timers.created = true + go timerproc() + } +} + +// Delete timer t from the heap. +// Do not need to update the timerproc: if it wakes up early, no big deal. +func deltimer(t *timer) bool { + // Dereference t so that any panic happens before the lock is held. + // Discard result, because t might be moving in the heap. + _ = t.i + + lock(&timers.lock) + // t may not be registered anymore and may have + // a bogus i (typically 0, if generated by Go). + // Verify it before proceeding. + i := t.i + last := len(timers.t) - 1 + if i < 0 || i > last || timers.t[i] != t { + unlock(&timers.lock) + return false + } + if i != last { + timers.t[i] = timers.t[last] + timers.t[i].i = i + } + timers.t[last] = nil + timers.t = timers.t[:last] + if i != last { + siftupTimer(i) + siftdownTimer(i) + } + unlock(&timers.lock) + return true +} + +// Timerproc runs the time-driven events. +// It sleeps until the next event in the timers heap. +// If addtimer inserts a new earlier event, addtimer1 wakes timerproc early. 
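+// For reference: timers.t above is a quaternary (4-ary) heap ordered by when.
+// siftupTimer and siftdownTimer at the end of this file use the index
+// relations
+//
+//	parent(i) = (i - 1) / 4
+//	children(i) = i*4 + 1, ..., i*4 + 4
+//
+// timerproc below repeatedly looks at timers.t[0], the earliest timer,
+// runs or re-queues it, and then sleeps until the next deadline.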
+func timerproc() { + timers.gp = getg() + timers.gp.issystem = true + for { + lock(&timers.lock) + timers.sleeping = false + now := nanotime() + delta := int64(-1) + for { + if len(timers.t) == 0 { + delta = -1 + break + } + t := timers.t[0] + delta = t.when - now + if delta > 0 { + break + } + if t.period > 0 { + // leave in heap but adjust next time to fire + t.when += t.period * (1 + -delta/t.period) + siftdownTimer(0) + } else { + // remove from heap + last := len(timers.t) - 1 + if last > 0 { + timers.t[0] = timers.t[last] + timers.t[0].i = 0 + } + timers.t[last] = nil + timers.t = timers.t[:last] + if last > 0 { + siftdownTimer(0) + } + t.i = -1 // mark as removed + } + f := t.f + arg := t.arg + seq := t.seq + unlock(&timers.lock) + if raceenabled { + raceacquire(unsafe.Pointer(t)) + } + f(arg, seq) + lock(&timers.lock) + } + if delta < 0 || faketime > 0 { + // No timers left - put goroutine to sleep. + timers.rescheduling = true + goparkunlock(&timers.lock, "timer goroutine (idle)") + continue + } + // At least one timer pending. Sleep until then. + timers.sleeping = true + noteclear(&timers.waitnote) + unlock(&timers.lock) + notetsleepg(&timers.waitnote, delta) + } +} + +func timejump() *g { + if faketime == 0 { + return nil + } + + lock(&timers.lock) + if !timers.created || len(timers.t) == 0 { + unlock(&timers.lock) + return nil + } + + var gp *g + if faketime < timers.t[0].when { + faketime = timers.t[0].when + if timers.rescheduling { + timers.rescheduling = false + gp = timers.gp + } + } + unlock(&timers.lock) + return gp +} + +// Heap maintenance algorithms. + +func siftupTimer(i int) { + t := timers.t + when := t[i].when + tmp := t[i] + for i > 0 { + p := (i - 1) / 4 // parent + if when >= t[p].when { + break + } + t[i] = t[p] + t[i].i = i + t[p] = tmp + t[p].i = p + i = p + } +} + +func siftdownTimer(i int) { + t := timers.t + n := len(t) + when := t[i].when + tmp := t[i] + for { + c := i*4 + 1 // left child + c3 := c + 2 // mid child + if c >= n { + break + } + w := t[c].when + if c+1 < n && t[c+1].when < w { + w = t[c+1].when + c++ + } + if c3 < n { + w3 := t[c3].when + if c3+1 < n && t[c3+1].when < w3 { + w3 = t[c3+1].when + c3++ + } + if w3 < w { + w = w3 + c = c3 + } + } + if w >= when { + break + } + t[i] = t[c] + t[i].i = i + t[c] = tmp + t[c].i = c + i = c + } +} diff --git a/src/runtime/tls_arm.s b/src/runtime/tls_arm.s new file mode 100644 index 000000000..85c3940bf --- /dev/null +++ b/src/runtime/tls_arm.s @@ -0,0 +1,69 @@ +// Copyright 2014 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#include "zasm_GOOS_GOARCH.h" +#include "funcdata.h" +#include "textflag.h" + +// We have to resort to TLS variable to save g(R10). +// One reason is that external code might trigger +// SIGSEGV, and our runtime.sigtramp don't even know we +// are in external code, and will continue to use R10, +// this might as well result in another SIGSEGV. +// Note: both functions will clobber R0 and R11 and +// can be called from 5c ABI code. + +// On android, runtime.tlsg is a normal variable. +// TLS offset is computed in x_cgo_inittls. + +// save_g saves the g register into pthread-provided +// thread-local memory, so that we can call externally compiled +// ARM code that will overwrite those registers. +// NOTE: runtime.gogo assumes that R1 is preserved by this function. +// runtime.mcall assumes this function only clobbers R0 and R11. +// Returns with g in R0. 
+TEXT runtime·save_g(SB),NOSPLIT,$-4 +#ifdef GOOS_nacl + // nothing to do as nacl/arm does not use TLS at all. + MOVW g, R0 // preserve R0 across call to setg<> + RET +#endif + // If the host does not support MRC the linker will replace it with + // a call to runtime.read_tls_fallback which jumps to __kuser_get_tls. + // The replacement function saves LR in R11 over the call to read_tls_fallback. + MRC 15, 0, R0, C13, C0, 3 // fetch TLS base pointer + // $runtime.tlsg(SB) is a special linker symbol. + // It is the offset from the TLS base pointer to our + // thread-local storage for g. +#ifdef GOOS_android + MOVW runtime·tlsg(SB), R11 +#else + MOVW $runtime·tlsg(SB), R11 +#endif + ADD R11, R0 + MOVW g, 0(R0) + MOVW g, R0 // preserve R0 across call to setg<> + RET + +// load_g loads the g register from pthread-provided +// thread-local memory, for use after calling externally compiled +// ARM code that overwrote those registers. +TEXT runtime·load_g(SB),NOSPLIT,$0 +#ifdef GOOS_nacl + // nothing to do as nacl/arm does not use TLS at all. + RET +#endif + // See save_g + MRC 15, 0, R0, C13, C0, 3 // fetch TLS base pointer + // $runtime.tlsg(SB) is a special linker symbol. + // It is the offset from the TLS base pointer to our + // thread-local storage for g. +#ifdef GOOS_android + MOVW runtime·tlsg(SB), R11 +#else + MOVW $runtime·tlsg(SB), R11 +#endif + ADD R11, R0 + MOVW 0(R0), g + RET diff --git a/src/runtime/traceback.go b/src/runtime/traceback.go new file mode 100644 index 000000000..1c6ce6e64 --- /dev/null +++ b/src/runtime/traceback.go @@ -0,0 +1,659 @@ +// Copyright 2009 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package runtime + +import "unsafe" + +// The code in this file implements stack trace walking for all architectures. +// The most important fact about a given architecture is whether it uses a link register. +// On systems with link registers, the prologue for a non-leaf function stores the +// incoming value of LR at the bottom of the newly allocated stack frame. +// On systems without link registers, the architecture pushes a return PC during +// the call instruction, so the return PC ends up above the stack frame. +// In this file, the return PC is always called LR, no matter how it was found. +// +// To date, the opposite of a link register architecture is an x86 architecture. +// This code may need to change if some other kind of non-link-register +// architecture comes along. +// +// The other important fact is the size of a pointer: on 32-bit systems the LR +// takes up only 4 bytes on the stack, while on 64-bit systems it takes up 8 bytes. +// Typically this is ptrSize. +// +// As an exception, amd64p32 has ptrSize == 4 but the CALL instruction still +// stores an 8-byte return PC onto the stack. To accommodate this, we use regSize +// as the size of the architecture-pushed return PC. +// +// usesLR is defined below. ptrSize and regSize are defined in stubs.go. + +const usesLR = GOARCH != "amd64" && GOARCH != "amd64p32" && GOARCH != "386" + +var ( + // initialized in tracebackinit + deferprocPC uintptr + goexitPC uintptr + jmpdeferPC uintptr + mcallPC uintptr + morestackPC uintptr + mstartPC uintptr + newprocPC uintptr + rt0_goPC uintptr + sigpanicPC uintptr + + externalthreadhandlerp uintptr // initialized elsewhere +) + +func tracebackinit() { + // Go variable initialization happens late during runtime startup. 
+ // Instead of initializing the variables above in the declarations, + // schedinit calls this function so that the variables are + // initialized and available earlier in the startup sequence. + deferprocPC = funcPC(deferproc) + goexitPC = funcPC(goexit) + jmpdeferPC = funcPC(jmpdefer) + mcallPC = funcPC(mcall) + morestackPC = funcPC(morestack) + mstartPC = funcPC(mstart) + newprocPC = funcPC(newproc) + rt0_goPC = funcPC(rt0_go) + sigpanicPC = funcPC(sigpanic) +} + +// Traceback over the deferred function calls. +// Report them like calls that have been invoked but not started executing yet. +func tracebackdefers(gp *g, callback func(*stkframe, unsafe.Pointer) bool, v unsafe.Pointer) { + var frame stkframe + for d := gp._defer; d != nil; d = d.link { + fn := d.fn + if fn == nil { + // Defer of nil function. Args don't matter. + frame.pc = 0 + frame.fn = nil + frame.argp = 0 + frame.arglen = 0 + frame.argmap = nil + } else { + frame.pc = uintptr(fn.fn) + f := findfunc(frame.pc) + if f == nil { + print("runtime: unknown pc in defer ", hex(frame.pc), "\n") + gothrow("unknown pc") + } + frame.fn = f + frame.argp = uintptr(deferArgs(d)) + setArgInfo(&frame, f, true) + } + frame.continpc = frame.pc + if !callback((*stkframe)(noescape(unsafe.Pointer(&frame))), v) { + return + } + } +} + +// Generic traceback. Handles runtime stack prints (pcbuf == nil), +// the runtime.Callers function (pcbuf != nil), as well as the garbage +// collector (callback != nil). A little clunky to merge these, but avoids +// duplicating the code and all its subtlety. +func gentraceback(pc0 uintptr, sp0 uintptr, lr0 uintptr, gp *g, skip int, pcbuf *uintptr, max int, callback func(*stkframe, unsafe.Pointer) bool, v unsafe.Pointer, flags uint) int { + if goexitPC == 0 { + gothrow("gentraceback before goexitPC initialization") + } + g := getg() + if g == gp && g == g.m.curg { + // The starting sp has been passed in as a uintptr, and the caller may + // have other uintptr-typed stack references as well. + // If during one of the calls that got us here or during one of the + // callbacks below the stack must be grown, all these uintptr references + // to the stack will not be updated, and gentraceback will continue + // to inspect the old stack memory, which may no longer be valid. + // Even if all the variables were updated correctly, it is not clear that + // we want to expose a traceback that begins on one stack and ends + // on another stack. That could confuse callers quite a bit. + // Instead, we require that gentraceback and any other function that + // accepts an sp for the current goroutine (typically obtained by + // calling getcallersp) must not run on that goroutine's stack but + // instead on the g0 stack. + gothrow("gentraceback cannot trace user goroutine on its own stack") + } + gotraceback := gotraceback(nil) + if pc0 == ^uintptr(0) && sp0 == ^uintptr(0) { // Signal to fetch saved values from gp. + if gp.syscallsp != 0 { + pc0 = gp.syscallpc + sp0 = gp.syscallsp + if usesLR { + lr0 = 0 + } + } else { + pc0 = gp.sched.pc + sp0 = gp.sched.sp + if usesLR { + lr0 = gp.sched.lr + } + } + } + + nprint := 0 + var frame stkframe + frame.pc = pc0 + frame.sp = sp0 + if usesLR { + frame.lr = lr0 + } + waspanic := false + wasnewproc := false + printing := pcbuf == nil && callback == nil + _defer := gp._defer + + for _defer != nil && uintptr(_defer.argp) == _NoArgs { + _defer = _defer.link + } + + // If the PC is zero, it's likely a nil function call. + // Start in the caller's frame. 
+ if frame.pc == 0 { + if usesLR { + frame.pc = *(*uintptr)(unsafe.Pointer(frame.sp)) + frame.lr = 0 + } else { + frame.pc = uintptr(*(*uintreg)(unsafe.Pointer(frame.sp))) + frame.sp += regSize + } + } + + f := findfunc(frame.pc) + if f == nil { + if callback != nil { + print("runtime: unknown pc ", hex(frame.pc), "\n") + gothrow("unknown pc") + } + return 0 + } + frame.fn = f + + n := 0 + for n < max { + // Typically: + // pc is the PC of the running function. + // sp is the stack pointer at that program counter. + // fp is the frame pointer (caller's stack pointer) at that program counter, or nil if unknown. + // stk is the stack containing sp. + // The caller's program counter is lr, unless lr is zero, in which case it is *(uintptr*)sp. + f = frame.fn + + // Found an actual function. + // Derive frame pointer and link register. + if frame.fp == 0 { + frame.fp = frame.sp + uintptr(funcspdelta(f, frame.pc)) + if !usesLR { + // On x86, call instruction pushes return PC before entering new function. + frame.fp += regSize + } + } + var flr *_func + if topofstack(f) { + frame.lr = 0 + flr = nil + } else if usesLR && f.entry == jmpdeferPC { + // jmpdefer modifies SP/LR/PC non-atomically. + // If a profiling interrupt arrives during jmpdefer, + // the stack unwind may see a mismatched register set + // and get confused. Stop if we see PC within jmpdefer + // to avoid that confusion. + // See golang.org/issue/8153. + if callback != nil { + gothrow("traceback_arm: found jmpdefer when tracing with callback") + } + frame.lr = 0 + } else { + if usesLR { + if n == 0 && frame.sp < frame.fp || frame.lr == 0 { + frame.lr = *(*uintptr)(unsafe.Pointer(frame.sp)) + } + } else { + if frame.lr == 0 { + frame.lr = uintptr(*(*uintreg)(unsafe.Pointer(frame.fp - regSize))) + } + } + flr = findfunc(frame.lr) + if flr == nil { + // This happens if you get a profiling interrupt at just the wrong time. + // In that context it is okay to stop early. + // But if callback is set, we're doing a garbage collection and must + // get everything, so crash loudly. + if callback != nil { + print("runtime: unexpected return pc for ", gofuncname(f), " called from ", hex(frame.lr), "\n") + gothrow("unknown caller pc") + } + } + } + + frame.varp = frame.fp + if !usesLR { + // On x86, call instruction pushes return PC before entering new function. + frame.varp -= regSize + } + + // Derive size of arguments. + // Most functions have a fixed-size argument block, + // so we can use metadata about the function f. + // Not all, though: there are some variadic functions + // in package runtime and reflect, and for those we use call-specific + // metadata recorded by f's caller. + if callback != nil || printing { + frame.argp = frame.fp + if usesLR { + frame.argp += ptrSize + } + setArgInfo(&frame, f, callback != nil) + } + + // Determine function SP where deferproc would find its arguments. + var sparg uintptr + if usesLR { + // On link register architectures, that's the standard bottom-of-stack plus 1 word + // for the saved LR. If the previous frame was a direct call to newproc/deferproc, + // however, the SP is three words lower than normal. + // If the function has no frame at all - perhaps it just started, or perhaps + // it is a leaf with no local variables - then we cannot possibly find its + // SP in a defer, and we might confuse its SP for its caller's SP, so + // leave sparg=0 in that case. 
+ if frame.fp != frame.sp { + sparg = frame.sp + regSize + if wasnewproc { + sparg += 3 * regSize + } + } + } else { + // On x86 that's the standard bottom-of-stack, so SP exactly. + // If the previous frame was a direct call to newproc/deferproc, however, + // the SP is two words lower than normal. + sparg = frame.sp + if wasnewproc { + sparg += 2 * ptrSize + } + } + + // Determine frame's 'continuation PC', where it can continue. + // Normally this is the return address on the stack, but if sigpanic + // is immediately below this function on the stack, then the frame + // stopped executing due to a trap, and frame.pc is probably not + // a safe point for looking up liveness information. In this panicking case, + // the function either doesn't return at all (if it has no defers or if the + // defers do not recover) or it returns from one of the calls to + // deferproc a second time (if the corresponding deferred func recovers). + // It suffices to assume that the most recent deferproc is the one that + // returns; everything live at earlier deferprocs is still live at that one. + frame.continpc = frame.pc + if waspanic { + if _defer != nil && _defer.argp == sparg { + frame.continpc = _defer.pc + } else { + frame.continpc = 0 + } + } + + // Unwind our local defer stack past this frame. + for _defer != nil && (_defer.argp == sparg || _defer.argp == _NoArgs) { + _defer = _defer.link + } + + if skip > 0 { + skip-- + goto skipped + } + + if pcbuf != nil { + (*[1 << 20]uintptr)(unsafe.Pointer(pcbuf))[n] = frame.pc + } + if callback != nil { + if !callback((*stkframe)(noescape(unsafe.Pointer(&frame))), v) { + return n + } + } + if printing { + if (flags&_TraceRuntimeFrames) != 0 || showframe(f, gp) { + // Print during crash. + // main(0x1, 0x2, 0x3) + // /home/rsc/go/src/runtime/x.go:23 +0xf + // + tracepc := frame.pc // back up to CALL instruction for funcline. + if (n > 0 || flags&_TraceTrap == 0) && frame.pc > f.entry && !waspanic { + tracepc-- + } + print(gofuncname(f), "(") + argp := (*[100]uintptr)(unsafe.Pointer(frame.argp)) + for i := uintptr(0); i < frame.arglen/ptrSize; i++ { + if i >= 10 { + print(", ...") + break + } + if i != 0 { + print(", ") + } + print(hex(argp[i])) + } + print(")\n") + var file string + line := funcline(f, tracepc, &file) + print("\t", file, ":", line) + if frame.pc > f.entry { + print(" +", hex(frame.pc-f.entry)) + } + if g.m.throwing > 0 && gp == g.m.curg || gotraceback >= 2 { + print(" fp=", hex(frame.fp), " sp=", hex(frame.sp)) + } + print("\n") + nprint++ + } + } + n++ + + skipped: + waspanic = f.entry == sigpanicPC + wasnewproc = f.entry == newprocPC || f.entry == deferprocPC + + // Do not unwind past the bottom of the stack. + if flr == nil { + break + } + + // Unwind to next frame. + frame.fn = flr + frame.pc = frame.lr + frame.lr = 0 + frame.sp = frame.fp + frame.fp = 0 + frame.argmap = nil + + // On link register architectures, sighandler saves the LR on stack + // before faking a call to sigpanic. + if usesLR && waspanic { + x := *(*uintptr)(unsafe.Pointer(frame.sp)) + frame.sp += ptrSize + f = findfunc(frame.pc) + frame.fn = f + if f == nil { + frame.pc = x + } else if f.frame == 0 { + frame.lr = x + } + } + } + + if pcbuf == nil && callback == nil { + n = nprint + } + + // If callback != nil, we're being called to gather stack information during + // garbage collection or stack growth. In that context, require that we used + // up the entire defer stack. 
If not, then there is a bug somewhere and the + // garbage collection or stack growth may not have seen the correct picture + // of the stack. Crash now instead of silently executing the garbage collection + // or stack copy incorrectly and setting up for a mysterious crash later. + // + // Note that panic != nil is okay here: there can be leftover panics, + // because the defers on the panic stack do not nest in frame order as + // they do on the defer stack. If you have: + // + // frame 1 defers d1 + // frame 2 defers d2 + // frame 3 defers d3 + // frame 4 panics + // frame 4's panic starts running defers + // frame 5, running d3, defers d4 + // frame 5 panics + // frame 5's panic starts running defers + // frame 6, running d4, garbage collects + // frame 6, running d2, garbage collects + // + // During the execution of d4, the panic stack is d4 -> d3, which + // is nested properly, and we'll treat frame 3 as resumable, because we + // can find d3. (And in fact frame 3 is resumable. If d4 recovers + // and frame 5 continues running, d3, d3 can recover and we'll + // resume execution in (returning from) frame 3.) + // + // During the execution of d2, however, the panic stack is d2 -> d3, + // which is inverted. The scan will match d2 to frame 2 but having + // d2 on the stack until then means it will not match d3 to frame 3. + // This is okay: if we're running d2, then all the defers after d2 have + // completed and their corresponding frames are dead. Not finding d3 + // for frame 3 means we'll set frame 3's continpc == 0, which is correct + // (frame 3 is dead). At the end of the walk the panic stack can thus + // contain defers (d3 in this case) for dead frames. The inversion here + // always indicates a dead frame, and the effect of the inversion on the + // scan is to hide those dead frames, so the scan is still okay: + // what's left on the panic stack are exactly (and only) the dead frames. + // + // We require callback != nil here because only when callback != nil + // do we know that gentraceback is being called in a "must be correct" + // context as opposed to a "best effort" context. The tracebacks with + // callbacks only happen when everything is stopped nicely. + // At other times, such as when gathering a stack for a profiling signal + // or when printing a traceback during a crash, everything may not be + // stopped nicely, and the stack walk may not be able to complete. + // It's okay in those situations not to use up the entire defer stack: + // incomplete information then is still better than nothing. + if callback != nil && n < max && _defer != nil { + if _defer != nil { + print("runtime: g", gp.goid, ": leftover defer argp=", hex(_defer.argp), " pc=", hex(_defer.pc), "\n") + } + for _defer = gp._defer; _defer != nil; _defer = _defer.link { + print("\tdefer ", _defer, " argp=", hex(_defer.argp), " pc=", hex(_defer.pc), "\n") + } + gothrow("traceback has leftover defers") + } + + return n +} + +func setArgInfo(frame *stkframe, f *_func, needArgMap bool) { + frame.arglen = uintptr(f.args) + if needArgMap && f.args == _ArgsSizeUnknown { + // Extract argument bitmaps for reflect stubs from the calls they made to reflect. 
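The frame/defer ordering discussed above can be made concrete with a small, self-contained program; the frame names mirror the d1/d3 narration and are illustrative only, not taken from this file:

	package main

	import "fmt"

	func frame3() {
		defer fmt.Println("d3 runs while frame 4's panic unwinds")
		frame4()
	}

	func frame4() {
		panic("frame 4 panics")
	}

	func main() {
		// d1: recovers only after the deeper defers (here just d3) have run,
		// matching the unwind order described in the comment above.
		defer func() {
			if r := recover(); r != nil {
				fmt.Println("recovered:", r)
			}
		}()
		frame3()
	}
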
+ switch gofuncname(f) { + case "reflect.makeFuncStub", "reflect.methodValueCall": + arg0 := frame.sp + if usesLR { + arg0 += ptrSize + } + fn := *(**[2]uintptr)(unsafe.Pointer(arg0)) + if fn[0] != f.entry { + print("runtime: confused by ", gofuncname(f), "\n") + gothrow("reflect mismatch") + } + bv := (*bitvector)(unsafe.Pointer(fn[1])) + frame.arglen = uintptr(bv.n / 2 * ptrSize) + frame.argmap = bv + } + } +} + +func printcreatedby(gp *g) { + // Show what created goroutine, except main goroutine (goid 1). + pc := gp.gopc + f := findfunc(pc) + if f != nil && showframe(f, gp) && gp.goid != 1 { + print("created by ", gofuncname(f), "\n") + tracepc := pc // back up to CALL instruction for funcline. + if pc > f.entry { + tracepc -= _PCQuantum + } + var file string + line := funcline(f, tracepc, &file) + print("\t", file, ":", line) + if pc > f.entry { + print(" +", hex(pc-f.entry)) + } + print("\n") + } +} + +func traceback(pc uintptr, sp uintptr, lr uintptr, gp *g) { + traceback1(pc, sp, lr, gp, 0) +} + +// tracebacktrap is like traceback but expects that the PC and SP were obtained +// from a trap, not from gp->sched or gp->syscallpc/gp->syscallsp or getcallerpc/getcallersp. +// Because they are from a trap instead of from a saved pair, +// the initial PC must not be rewound to the previous instruction. +// (All the saved pairs record a PC that is a return address, so we +// rewind it into the CALL instruction.) +func tracebacktrap(pc uintptr, sp uintptr, lr uintptr, gp *g) { + traceback1(pc, sp, lr, gp, _TraceTrap) +} + +func traceback1(pc uintptr, sp uintptr, lr uintptr, gp *g, flags uint) { + var n int + if readgstatus(gp)&^_Gscan == _Gsyscall { + // Override registers if blocked in system call. + pc = gp.syscallpc + sp = gp.syscallsp + flags &^= _TraceTrap + } + // Print traceback. By default, omits runtime frames. + // If that means we print nothing at all, repeat forcing all frames printed. + n = gentraceback(pc, sp, lr, gp, 0, nil, _TracebackMaxFrames, nil, nil, flags) + if n == 0 && (flags&_TraceRuntimeFrames) == 0 { + n = gentraceback(pc, sp, lr, gp, 0, nil, _TracebackMaxFrames, nil, nil, flags|_TraceRuntimeFrames) + } + if n == _TracebackMaxFrames { + print("...additional frames elided...\n") + } + printcreatedby(gp) +} + +func callers(skip int, pcbuf *uintptr, m int) int { + sp := getcallersp(unsafe.Pointer(&skip)) + pc := uintptr(getcallerpc(unsafe.Pointer(&skip))) + var n int + onM(func() { + n = gentraceback(pc, sp, 0, getg(), skip, pcbuf, m, nil, nil, 0) + }) + return n +} + +func gcallers(gp *g, skip int, pcbuf *uintptr, m int) int { + return gentraceback(^uintptr(0), ^uintptr(0), 0, gp, skip, pcbuf, m, nil, nil, 0) +} + +func showframe(f *_func, gp *g) bool { + g := getg() + if g.m.throwing > 0 && gp != nil && (gp == g.m.curg || gp == g.m.caughtsig) { + return true + } + traceback := gotraceback(nil) + name := gostringnocopy(funcname(f)) + + // Special case: always show runtime.panic frame, so that we can + // see where a panic started in the middle of a stack trace. + // See golang.org/issue/5832. + if name == "runtime.panic" { + return true + } + + return traceback > 1 || f != nil && contains(name, ".") && (!hasprefix(name, "runtime.") || isExportedRuntime(name)) +} + +// isExportedRuntime reports whether name is an exported runtime function. +// It is only for runtime functions, so ASCII A-Z is fine. +func isExportedRuntime(name string) bool { + const n = len("runtime.") + return len(name) > n && name[:n] == "runtime." 
&& 'A' <= name[n] && name[n] <= 'Z' +} + +var gStatusStrings = [...]string{ + _Gidle: "idle", + _Grunnable: "runnable", + _Grunning: "running", + _Gsyscall: "syscall", + _Gwaiting: "waiting", + _Gdead: "dead", + _Genqueue: "enqueue", + _Gcopystack: "copystack", +} + +var gScanStatusStrings = [...]string{ + 0: "scan", + _Grunnable: "scanrunnable", + _Grunning: "scanrunning", + _Gsyscall: "scansyscall", + _Gwaiting: "scanwaiting", + _Gdead: "scandead", + _Genqueue: "scanenqueue", +} + +func goroutineheader(gp *g) { + gpstatus := readgstatus(gp) + + // Basic string status + var status string + if 0 <= gpstatus && gpstatus < uint32(len(gStatusStrings)) { + status = gStatusStrings[gpstatus] + } else if gpstatus&_Gscan != 0 && 0 <= gpstatus&^_Gscan && gpstatus&^_Gscan < uint32(len(gStatusStrings)) { + status = gStatusStrings[gpstatus&^_Gscan] + } else { + status = "???" + } + + // Override. + if (gpstatus == _Gwaiting || gpstatus == _Gscanwaiting) && gp.waitreason != "" { + status = gp.waitreason + } + + // approx time the G is blocked, in minutes + var waitfor int64 + gpstatus &^= _Gscan // drop the scan bit + if (gpstatus == _Gwaiting || gpstatus == _Gsyscall) && gp.waitsince != 0 { + waitfor = (nanotime() - gp.waitsince) / 60e9 + } + print("goroutine ", gp.goid, " [", status) + if waitfor >= 1 { + print(", ", waitfor, " minutes") + } + if gp.lockedm != nil { + print(", locked to thread") + } + print("]:\n") +} + +func tracebackothers(me *g) { + level := gotraceback(nil) + + // Show the current goroutine first, if we haven't already. + g := getg() + gp := g.m.curg + if gp != nil && gp != me { + print("\n") + goroutineheader(gp) + traceback(^uintptr(0), ^uintptr(0), 0, gp) + } + + lock(&allglock) + for _, gp := range allgs { + if gp == me || gp == g.m.curg || readgstatus(gp) == _Gdead || gp.issystem && level < 2 { + continue + } + print("\n") + goroutineheader(gp) + if readgstatus(gp)&^_Gscan == _Grunning { + print("\tgoroutine running on other thread; stack unavailable\n") + printcreatedby(gp) + } else { + traceback(^uintptr(0), ^uintptr(0), 0, gp) + } + } + unlock(&allglock) +} + +// Does f mark the top of a goroutine stack? +func topofstack(f *_func) bool { + pc := f.entry + return pc == goexitPC || + pc == mstartPC || + pc == mcallPC || + pc == morestackPC || + pc == rt0_goPC || + externalthreadhandlerp != 0 && pc == externalthreadhandlerp +} diff --git a/src/runtime/type.h b/src/runtime/type.h new file mode 100644 index 000000000..f5b4f9d13 --- /dev/null +++ b/src/runtime/type.h @@ -0,0 +1,113 @@ +// Copyright 2009 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// Runtime type representation. + +typedef struct Type Type; +typedef struct UncommonType UncommonType; +typedef struct InterfaceType InterfaceType; +typedef struct Method Method; +typedef struct IMethod IMethod; +typedef struct SliceType SliceType; +typedef struct FuncType FuncType; + +// Needs to be in sync with ../../cmd/ld/decodesym.c:/^commonsize and pkg/reflect/type.go:/type. +struct Type +{ + uintptr size; + uint32 hash; + uint8 _unused; + uint8 align; + uint8 fieldAlign; + uint8 kind; + void* alg; + // gc stores type info required for garbage collector. + // If (kind&KindGCProg)==0, then gc[0] points at sparse GC bitmap + // (no indirection), 4 bits per word. + // If (kind&KindGCProg)!=0, then gc[1] points to a compiler-generated + // read-only GC program; and gc[0] points to BSS space for sparse GC bitmap. 
+ // For huge types (>MaxGCMask), runtime unrolls the program directly into + // GC bitmap and gc[0] is not used. For moderately-sized types, runtime + // unrolls the program into gc[0] space on first use. The first byte of gc[0] + // (gc[0][0]) contains 'unroll' flag saying whether the program is already + // unrolled into gc[0] or not. + uintptr gc[2]; + String *string; + UncommonType *x; + Type *ptrto; + byte *zero; // ptr to the zero value for this type +}; + +struct Method +{ + String *name; + String *pkgPath; + Type *mtyp; + Type *typ; + void (*ifn)(void); + void (*tfn)(void); +}; + +struct UncommonType +{ + String *name; + String *pkgPath; + Slice mhdr; + Method m[]; +}; + +struct IMethod +{ + String *name; + String *pkgPath; + Type *type; +}; + +struct InterfaceType +{ + Type typ; + Slice mhdr; + IMethod m[]; +}; + +struct MapType +{ + Type typ; + Type *key; + Type *elem; + Type *bucket; // internal type representing a hash bucket + Type *hmap; // internal type representing a Hmap + uint8 keysize; // size of key slot + bool indirectkey; // store ptr to key instead of key itself + uint8 valuesize; // size of value slot + bool indirectvalue; // store ptr to value instead of value itself + uint16 bucketsize; // size of bucket +}; + +struct ChanType +{ + Type typ; + Type *elem; + uintptr dir; +}; + +struct SliceType +{ + Type typ; + Type *elem; +}; + +struct FuncType +{ + Type typ; + bool dotdotdot; + Slice in; + Slice out; +}; + +struct PtrType +{ + Type typ; + Type *elem; +}; diff --git a/src/runtime/typekind.go b/src/runtime/typekind.go new file mode 100644 index 000000000..b64ec44f9 --- /dev/null +++ b/src/runtime/typekind.go @@ -0,0 +1,44 @@ +// Copyright 2014 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package runtime + +const ( + kindBool = _KindBool + kindInt = _KindInt + kindInt8 = _KindInt8 + kindInt16 = _KindInt16 + kindInt32 = _KindInt32 + kindInt64 = _KindInt64 + kindUint = _KindUint + kindUint8 = _KindUint8 + kindUint16 = _KindUint16 + kindUint32 = _KindUint32 + kindUint64 = _KindUint64 + kindUintptr = _KindUintptr + kindFloat32 = _KindFloat32 + kindFloat64 = _KindFloat64 + kindComplex64 = _KindComplex64 + kindComplex128 = _KindComplex128 + kindArray = _KindArray + kindChan = _KindChan + kindFunc = _KindFunc + kindInterface = _KindInterface + kindMap = _KindMap + kindPtr = _KindPtr + kindSlice = _KindSlice + kindString = _KindString + kindStruct = _KindStruct + kindUnsafePointer = _KindUnsafePointer + + kindDirectIface = _KindDirectIface + kindGCProg = _KindGCProg + kindNoPointers = _KindNoPointers + kindMask = _KindMask +) + +// isDirectIface reports whether t is stored directly in an interface value. +func isDirectIface(t *_type) bool { + return t.kind&kindDirectIface != 0 +} diff --git a/src/runtime/typekind.h b/src/runtime/typekind.h new file mode 100644 index 000000000..e0fe177bb --- /dev/null +++ b/src/runtime/typekind.h @@ -0,0 +1,38 @@ +// Copyright 2012 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. 
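A kind byte combines one of the basic kinds with the flag bits defined in typekind.go above; a minimal sketch of decoding it, with the numeric value for KindPtr taken from the enum that follows (the constants are copied here only so the example is self-contained):

	package main

	import "fmt"

	const (
		kindPtr         = 22 // from the enum below
		kindDirectIface = 1 << 5
		kindGCProg      = 1 << 6
		kindNoPointers  = 1 << 7
		kindMask        = (1 << 5) - 1
	)

	func main() {
		// A pointer type stored directly in interface values, whose
		// referent may contain pointers:
		k := uint8(kindPtr | kindDirectIface)
		fmt.Println("basic kind:", k&kindMask)                // 22
		fmt.Println("direct iface:", k&kindDirectIface != 0)  // true
		fmt.Println("no pointers:", k&kindNoPointers != 0)    // false
	}
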
+ +enum { + KindBool = 1, + KindInt, + KindInt8, + KindInt16, + KindInt32, + KindInt64, + KindUint, + KindUint8, + KindUint16, + KindUint32, + KindUint64, + KindUintptr, + KindFloat32, + KindFloat64, + KindComplex64, + KindComplex128, + KindArray, + KindChan, + KindFunc, + KindInterface, + KindMap, + KindPtr, + KindSlice, + KindString, + KindStruct, + KindUnsafePointer, + + KindDirectIface = 1<<5, + KindGCProg = 1<<6, // Type.gc points to GC program + KindNoPointers = 1<<7, + KindMask = (1<<5)-1, +}; + diff --git a/src/runtime/vdso_linux_amd64.c b/src/runtime/vdso_linux_amd64.c new file mode 100644 index 000000000..681340c5b --- /dev/null +++ b/src/runtime/vdso_linux_amd64.c @@ -0,0 +1,371 @@ +// Copyright 2012 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#include "runtime.h" +#include "textflag.h" + +// Look up symbols in the Linux vDSO. + +// This code was originally based on the sample Linux vDSO parser at +// https://git.kernel.org/cgit/linux/kernel/git/torvalds/linux.git/tree/Documentation/vDSO/parse_vdso.c + +// This implements the ELF dynamic linking spec at +// http://sco.com/developers/gabi/latest/ch5.dynamic.html + +// The version section is documented at +// http://refspecs.linuxfoundation.org/LSB_3.2.0/LSB-Core-generic/LSB-Core-generic/symversion.html + +#define AT_RANDOM 25 +#define AT_SYSINFO_EHDR 33 +#define AT_NULL 0 /* End of vector */ +#define PT_LOAD 1 /* Loadable program segment */ +#define PT_DYNAMIC 2 /* Dynamic linking information */ +#define DT_NULL 0 /* Marks end of dynamic section */ +#define DT_HASH 4 /* Dynamic symbol hash table */ +#define DT_STRTAB 5 /* Address of string table */ +#define DT_SYMTAB 6 /* Address of symbol table */ +#define DT_VERSYM 0x6ffffff0 +#define DT_VERDEF 0x6ffffffc + +#define VER_FLG_BASE 0x1 /* Version definition of file itself */ +#define SHN_UNDEF 0 /* Undefined section */ +#define SHT_DYNSYM 11 /* Dynamic linker symbol table */ +#define STT_FUNC 2 /* Symbol is a code object */ +#define STB_GLOBAL 1 /* Global symbol */ +#define STB_WEAK 2 /* Weak symbol */ + +/* How to extract and insert information held in the st_info field. 
*/ +#define ELF64_ST_BIND(val) (((byte) (val)) >> 4) +#define ELF64_ST_TYPE(val) ((val) & 0xf) + +#define EI_NIDENT (16) + +typedef uint16 Elf64_Half; +typedef uint32 Elf64_Word; +typedef int32 Elf64_Sword; +typedef uint64 Elf64_Xword; +typedef int64 Elf64_Sxword; +typedef uint64 Elf64_Addr; +typedef uint64 Elf64_Off; +typedef uint16 Elf64_Section; +typedef Elf64_Half Elf64_Versym; + + +typedef struct Elf64_Sym +{ + Elf64_Word st_name; + byte st_info; + byte st_other; + Elf64_Section st_shndx; + Elf64_Addr st_value; + Elf64_Xword st_size; +} Elf64_Sym; + +typedef struct Elf64_Verdef +{ + Elf64_Half vd_version; /* Version revision */ + Elf64_Half vd_flags; /* Version information */ + Elf64_Half vd_ndx; /* Version Index */ + Elf64_Half vd_cnt; /* Number of associated aux entries */ + Elf64_Word vd_hash; /* Version name hash value */ + Elf64_Word vd_aux; /* Offset in bytes to verdaux array */ + Elf64_Word vd_next; /* Offset in bytes to next verdef entry */ +} Elf64_Verdef; + +typedef struct Elf64_Ehdr +{ + byte e_ident[EI_NIDENT]; /* Magic number and other info */ + Elf64_Half e_type; /* Object file type */ + Elf64_Half e_machine; /* Architecture */ + Elf64_Word e_version; /* Object file version */ + Elf64_Addr e_entry; /* Entry point virtual address */ + Elf64_Off e_phoff; /* Program header table file offset */ + Elf64_Off e_shoff; /* Section header table file offset */ + Elf64_Word e_flags; /* Processor-specific flags */ + Elf64_Half e_ehsize; /* ELF header size in bytes */ + Elf64_Half e_phentsize; /* Program header table entry size */ + Elf64_Half e_phnum; /* Program header table entry count */ + Elf64_Half e_shentsize; /* Section header table entry size */ + Elf64_Half e_shnum; /* Section header table entry count */ + Elf64_Half e_shstrndx; /* Section header string table index */ +} Elf64_Ehdr; + +typedef struct Elf64_Phdr +{ + Elf64_Word p_type; /* Segment type */ + Elf64_Word p_flags; /* Segment flags */ + Elf64_Off p_offset; /* Segment file offset */ + Elf64_Addr p_vaddr; /* Segment virtual address */ + Elf64_Addr p_paddr; /* Segment physical address */ + Elf64_Xword p_filesz; /* Segment size in file */ + Elf64_Xword p_memsz; /* Segment size in memory */ + Elf64_Xword p_align; /* Segment alignment */ +} Elf64_Phdr; + +typedef struct Elf64_Shdr +{ + Elf64_Word sh_name; /* Section name (string tbl index) */ + Elf64_Word sh_type; /* Section type */ + Elf64_Xword sh_flags; /* Section flags */ + Elf64_Addr sh_addr; /* Section virtual addr at execution */ + Elf64_Off sh_offset; /* Section file offset */ + Elf64_Xword sh_size; /* Section size in bytes */ + Elf64_Word sh_link; /* Link to another section */ + Elf64_Word sh_info; /* Additional section information */ + Elf64_Xword sh_addralign; /* Section alignment */ + Elf64_Xword sh_entsize; /* Entry size if section holds table */ +} Elf64_Shdr; + +typedef struct Elf64_Dyn +{ + Elf64_Sxword d_tag; /* Dynamic entry type */ + union + { + Elf64_Xword d_val; /* Integer value */ + Elf64_Addr d_ptr; /* Address value */ + } d_un; +} Elf64_Dyn; + +typedef struct Elf64_Verdaux +{ + Elf64_Word vda_name; /* Version or dependency names */ + Elf64_Word vda_next; /* Offset in bytes to next verdaux entry */ +} Elf64_Verdaux; + +typedef struct Elf64_auxv_t +{ + uint64 a_type; /* Entry type */ + union + { + uint64 a_val; /* Integer value */ + } a_un; +} Elf64_auxv_t; + + +typedef struct symbol_key { + byte* name; + int32 sym_hash; + void** var_ptr; +} symbol_key; + +typedef struct version_key { + byte* version; + int32 ver_hash; +} version_key; + +struct 
vdso_info { + bool valid; + + /* Load information */ + uintptr load_addr; + uintptr load_offset; /* load_addr - recorded vaddr */ + + /* Symbol table */ + Elf64_Sym *symtab; + const byte *symstrings; + Elf64_Word *bucket, *chain; + Elf64_Word nbucket, nchain; + + /* Version table */ + Elf64_Versym *versym; + Elf64_Verdef *verdef; +}; + +#pragma dataflag NOPTR +static version_key linux26 = { (byte*)"LINUX_2.6", 0x3ae75f6 }; + +// initialize with vsyscall fallbacks +#pragma dataflag NOPTR +void* runtime·__vdso_time_sym = (void*)0xffffffffff600400ULL; +#pragma dataflag NOPTR +void* runtime·__vdso_gettimeofday_sym = (void*)0xffffffffff600000ULL; +#pragma dataflag NOPTR +void* runtime·__vdso_clock_gettime_sym = (void*)0; + +#pragma dataflag NOPTR +static symbol_key sym_keys[] = { + { (byte*)"__vdso_time", 0xa33c485, &runtime·__vdso_time_sym }, + { (byte*)"__vdso_gettimeofday", 0x315ca59, &runtime·__vdso_gettimeofday_sym }, + { (byte*)"__vdso_clock_gettime", 0xd35ec75, &runtime·__vdso_clock_gettime_sym }, +}; + +static void +vdso_init_from_sysinfo_ehdr(struct vdso_info *vdso_info, Elf64_Ehdr* hdr) +{ + uint64 i; + bool found_vaddr = false; + Elf64_Phdr *pt; + Elf64_Dyn *dyn; + Elf64_Word *hash; + + vdso_info->valid = false; + vdso_info->load_addr = (uintptr) hdr; + + pt = (Elf64_Phdr*)(vdso_info->load_addr + hdr->e_phoff); + dyn = nil; + + // We need two things from the segment table: the load offset + // and the dynamic table. + for(i=0; i<hdr->e_phnum; i++) { + if(pt[i].p_type == PT_LOAD && found_vaddr == false) { + found_vaddr = true; + vdso_info->load_offset = (uintptr)hdr + + (uintptr)pt[i].p_offset + - (uintptr)pt[i].p_vaddr; + } else if(pt[i].p_type == PT_DYNAMIC) { + dyn = (Elf64_Dyn*)((uintptr)hdr + pt[i].p_offset); + } + } + + if(found_vaddr == false || dyn == nil) + return; // Failed + + // Fish out the useful bits of the dynamic table. + hash = nil; + vdso_info->symstrings = nil; + vdso_info->symtab = nil; + vdso_info->versym = nil; + vdso_info->verdef = nil; + for(i=0; dyn[i].d_tag!=DT_NULL; i++) { + switch(dyn[i].d_tag) { + case DT_STRTAB: + vdso_info->symstrings = (const byte *) + ((uintptr)dyn[i].d_un.d_ptr + + vdso_info->load_offset); + break; + case DT_SYMTAB: + vdso_info->symtab = (Elf64_Sym *) + ((uintptr)dyn[i].d_un.d_ptr + + vdso_info->load_offset); + break; + case DT_HASH: + hash = (Elf64_Word *) + ((uintptr)dyn[i].d_un.d_ptr + + vdso_info->load_offset); + break; + case DT_VERSYM: + vdso_info->versym = (Elf64_Versym *) + ((uintptr)dyn[i].d_un.d_ptr + + vdso_info->load_offset); + break; + case DT_VERDEF: + vdso_info->verdef = (Elf64_Verdef *) + ((uintptr)dyn[i].d_un.d_ptr + + vdso_info->load_offset); + break; + } + } + if(vdso_info->symstrings == nil || vdso_info->symtab == nil || hash == nil) + return; // Failed + + if(vdso_info->verdef == nil) + vdso_info->versym = 0; + + // Parse the hash table header. + vdso_info->nbucket = hash[0]; + vdso_info->nchain = hash[1]; + vdso_info->bucket = &hash[2]; + vdso_info->chain = &hash[vdso_info->nbucket + 2]; + + // That's all we need. 
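The sym_hash and ver_hash constants above appear to be precomputed SysV ELF hashes of the symbol and version names, which is what the DT_HASH bucket lookup below requires; a Go rendition of that hash, so the magic numbers can be checked (illustrative, not part of this file):

	package main

	import "fmt"

	// elfHash is the classic SysV ELF hash used by DT_HASH tables.
	func elfHash(name string) uint32 {
		var h uint32
		for i := 0; i < len(name); i++ {
			h = h<<4 + uint32(name[i])
			g := h & 0xf0000000
			if g != 0 {
				h ^= g >> 24
			}
			h &^= g
		}
		return h
	}

	func main() {
		for _, s := range []string{"__vdso_time", "__vdso_gettimeofday", "__vdso_clock_gettime", "LINUX_2.6"} {
			fmt.Printf("%-22s 0x%x\n", s, elfHash(s))
		}
	}
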
+ vdso_info->valid = true; +} + +static int32 +vdso_find_version(struct vdso_info *vdso_info, version_key* ver) +{ + if(vdso_info->valid == false) { + return 0; + } + Elf64_Verdef *def = vdso_info->verdef; + while(true) { + if((def->vd_flags & VER_FLG_BASE) == 0) { + Elf64_Verdaux *aux = (Elf64_Verdaux*)((byte *)def + def->vd_aux); + if(def->vd_hash == ver->ver_hash && + runtime·strcmp(ver->version, vdso_info->symstrings + aux->vda_name) == 0) { + return def->vd_ndx & 0x7fff; + } + } + + if(def->vd_next == 0) { + break; + } + def = (Elf64_Verdef *)((byte *)def + def->vd_next); + } + return -1; // can not match any version +} + +static void +vdso_parse_symbols(struct vdso_info *vdso_info, int32 version) +{ + int32 i; + Elf64_Word chain; + Elf64_Sym *sym; + + if(vdso_info->valid == false) + return; + + for(i=0; i<nelem(sym_keys); i++) { + for(chain = vdso_info->bucket[sym_keys[i].sym_hash % vdso_info->nbucket]; + chain != 0; chain = vdso_info->chain[chain]) { + + sym = &vdso_info->symtab[chain]; + if(ELF64_ST_TYPE(sym->st_info) != STT_FUNC) + continue; + if(ELF64_ST_BIND(sym->st_info) != STB_GLOBAL && + ELF64_ST_BIND(sym->st_info) != STB_WEAK) + continue; + if(sym->st_shndx == SHN_UNDEF) + continue; + if(runtime·strcmp(sym_keys[i].name, vdso_info->symstrings + sym->st_name) != 0) + continue; + + // Check symbol version. + if(vdso_info->versym != nil && version != 0 + && vdso_info->versym[chain] & 0x7fff != version) + continue; + + *sym_keys[i].var_ptr = (void *)(vdso_info->load_offset + sym->st_value); + break; + } + } +} + +static void +runtime·linux_setup_vdso(int32 argc, uint8** argv) +{ + struct vdso_info vdso_info; + + // skip argvc + byte **p = argv; + p = &p[argc+1]; + + // skip envp to get to ELF auxiliary vector. + for(; *p!=0; p++) {} + + // skip NULL separator + p++; + + // now, p points to auxv + Elf64_auxv_t *elf_auxv = (Elf64_auxv_t*) p; + + for(int32 i=0; elf_auxv[i].a_type!=AT_NULL; i++) { + if(elf_auxv[i].a_type == AT_SYSINFO_EHDR) { + if(elf_auxv[i].a_un.a_val == 0) { + // Something went wrong + continue; + } + vdso_init_from_sysinfo_ehdr(&vdso_info, (Elf64_Ehdr*)elf_auxv[i].a_un.a_val); + vdso_parse_symbols(&vdso_info, vdso_find_version(&vdso_info, &linux26)); + continue; + } + if(elf_auxv[i].a_type == AT_RANDOM) { + runtime·startup_random_data = (byte*)elf_auxv[i].a_un.a_val; + runtime·startup_random_data_len = 16; + continue; + } + } +} + +void (*runtime·sysargs)(int32, uint8**) = runtime·linux_setup_vdso; diff --git a/src/runtime/vlop_386.s b/src/runtime/vlop_386.s new file mode 100644 index 000000000..ce8e7d064 --- /dev/null +++ b/src/runtime/vlop_386.s @@ -0,0 +1,56 @@ +// Inferno's libkern/vlop-386.s +// http://code.google.com/p/inferno-os/source/browse/libkern/vlop-386.s +// +// Copyright © 1994-1999 Lucent Technologies Inc. All rights reserved. +// Revisions Copyright © 2000-2007 Vita Nuova Holdings Limited (www.vitanuova.com). All rights reserved. +// Portions Copyright 2009 The Go Authors. All rights reserved. 
+// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in +// all copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +// THE SOFTWARE. + +#include "textflag.h" + +/* + * C runtime for 64-bit divide. + */ + +// runtime·_mul64x32(r *uint64, a uint64, b uint32) uint32 +// sets *r = low 64 bits of 96-bit product a*b; returns high 32 bits. +TEXT runtime·_mul64by32(SB), NOSPLIT, $0 + MOVL r+0(FP), CX + MOVL a+4(FP), AX + MULL b+12(FP) + MOVL AX, 0(CX) + MOVL DX, BX + MOVL a+8(FP), AX + MULL b+12(FP) + ADDL AX, BX + ADCL $0, DX + MOVL BX, 4(CX) + MOVL DX, AX + MOVL AX, ret+16(FP) + RET + +TEXT runtime·_div64by32(SB), NOSPLIT, $0 + MOVL r+12(FP), CX + MOVL a+0(FP), AX + MOVL a+4(FP), DX + DIVL b+8(FP) + MOVL DX, 0(CX) + MOVL AX, ret+16(FP) + RET diff --git a/src/runtime/vlop_arm.s b/src/runtime/vlop_arm.s new file mode 100644 index 000000000..b4b905bb7 --- /dev/null +++ b/src/runtime/vlop_arm.s @@ -0,0 +1,317 @@ +// Inferno's libkern/vlop-arm.s +// http://code.google.com/p/inferno-os/source/browse/libkern/vlop-arm.s +// +// Copyright © 1994-1999 Lucent Technologies Inc. All rights reserved. +// Revisions Copyright © 2000-2007 Vita Nuova Holdings Limited (www.vitanuova.com). All rights reserved. +// Portions Copyright 2009 The Go Authors. All rights reserved. +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in +// all copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +// THE SOFTWARE. 
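_mulv below computes the low 64 bits of a 64x64 product from 32-bit halves; a plain-Go rendering of the same arithmetic, for checking (the function name mulv is illustrative, not the runtime's code path):

	package main

	import "fmt"

	// mulv mirrors the arithmetic in _mulv: split each operand into 32-bit
	// halves and keep only the low 64 bits of the product.
	func mulv(a, b uint64) uint64 {
		al, ah := a&0xffffffff, a>>32
		bl, bh := b&0xffffffff, b>>32
		lo := al * bl                // full 64-bit product of the low halves
		hi := lo>>32 + ah*bl + al*bh // only the low 32 bits survive hi<<32
		return hi<<32 | lo&0xffffffff
	}

	func main() {
		a, b := uint64(0x123456789abcdef0), uint64(0x0fedcba987654321)
		fmt.Printf("%#x\n%#x\n", mulv(a, b), a*b) // the two lines match
	}
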
+ +#include "zasm_GOOS_GOARCH.h" +#include "textflag.h" + +arg=0 + +/* replaced use of R10 by R11 because the former can be the data segment base register */ + +TEXT _mulv(SB), NOSPLIT, $0 + MOVW l0+0(FP), R2 /* l0 */ + MOVW h0+4(FP), R11 /* h0 */ + MOVW l1+8(FP), R4 /* l1 */ + MOVW h1+12(FP), R5 /* h1 */ + MULLU R4, R2, (R7,R6) + MUL R11, R4, R8 + ADD R8, R7 + MUL R2, R5, R8 + ADD R8, R7 + MOVW R6, ret_lo+16(FP) + MOVW R7, ret_hi+20(FP) + RET + +// trampoline for _sfloat2. passes LR as arg0 and +// saves registers R0-R13 and CPSR on the stack. R0-R12 and CPSR flags can +// be changed by _sfloat2. +TEXT _sfloat(SB), NOSPLIT, $68-0 // 4 arg + 14*4 saved regs + cpsr + return value + MOVW R14, 4(R13) + MOVW R0, 8(R13) + MOVW $12(R13), R0 + MOVM.IA.W [R1-R12], (R0) + MOVW $72(R13), R1 // correct for frame size + MOVW R1, 60(R13) + WORD $0xe10f1000 // mrs r1, cpsr + MOVW R1, 64(R13) + // Disable preemption of this goroutine during _sfloat2 by + // m->locks++ and m->locks-- around the call. + // Rescheduling this goroutine may cause the loss of the + // contents of the software floating point registers in + // m->freghi, m->freglo, m->fflag, if the goroutine is moved + // to a different m or another goroutine runs on this m. + // Rescheduling at ordinary function calls is okay because + // all registers are caller save, but _sfloat2 and the things + // that it runs are simulating the execution of individual + // program instructions, and those instructions do not expect + // the floating point registers to be lost. + // An alternative would be to move the software floating point + // registers into G, but they do not need to be kept at the + // usual places a goroutine reschedules (at function calls), + // so it would be a waste of 132 bytes per G. + MOVW g_m(g), R8 + MOVW m_locks(R8), R1 + ADD $1, R1 + MOVW R1, m_locks(R8) + MOVW $1, R1 + MOVW R1, m_softfloat(R8) + BL runtime·_sfloat2(SB) + MOVW 68(R13), R0 + MOVW g_m(g), R8 + MOVW m_locks(R8), R1 + SUB $1, R1 + MOVW R1, m_locks(R8) + MOVW $0, R1 + MOVW R1, m_softfloat(R8) + MOVW R0, 0(R13) + MOVW 64(R13), R1 + WORD $0xe128f001 // msr cpsr_f, r1 + MOVW $12(R13), R0 + // Restore R1-R12, R0. + MOVM.IA.W (R0), [R1-R12] + MOVW 8(R13), R0 + RET + +// trampoline for _sfloat2 panic. +// _sfloat2 instructs _sfloat to return here. +// We need to push a fake saved LR onto the stack, +// load the signal fault address into LR, and jump +// to the real sigpanic. +// This simulates what sighandler does for a memory fault. +TEXT _sfloatpanic(SB),NOSPLIT,$-4 + MOVW $0, R0 + MOVW.W R0, -4(R13) + MOVW g_sigpc(g), LR + B runtime·sigpanic(SB) + +// func udiv(n, d uint32) (q, r uint32) +// Reference: +// Sloss, Andrew et. al; ARM System Developer's Guide: Designing and Optimizing System Software +// Morgan Kaufmann; 1 edition (April 8, 2004), ISBN 978-1558608740 +q = 0 // input d, output q +r = 1 // input n, output r +s = 2 // three temporary variables +M = 3 +a = 11 +// Be careful: R(a) == R11 will be used by the linker for synthesized instructions. 
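The udiv routine below starts from a 64-entry reciprocal-estimate table (fast_udiv_tab, defined further down together with a one-line comment giving its generation rule). A small Go generator that reproduces those DATA directives, shown only to make the table's origin concrete:

	package main

	import "fmt"

	func main() {
		var tab [64]byte
		tab[0] = 255
		for i := 1; i <= 63; i++ {
			tab[i] = byte((1 << 14) / (64 + i))
		}
		// Pack as little-endian uint32s, matching the DATA directives
		// for fast_udiv_tab in this file.
		for i := 0; i < 64; i += 4 {
			v := uint32(tab[i]) | uint32(tab[i+1])<<8 | uint32(tab[i+2])<<16 | uint32(tab[i+3])<<24
			fmt.Printf("DATA fast_udiv_tab<>+0x%02x(SB)/4, $0x%08x\n", i, v)
		}
	}
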
+TEXT udiv<>(SB),NOSPLIT,$-4 + CLZ R(q), R(s) // find normalizing shift + MOVW.S R(q)<<R(s), R(a) + MOVW $fast_udiv_tab<>-64(SB), R(M) + ADD.NE R(a)>>25, R(M), R(a) // index by most significant 7 bits of divisor + MOVBU.NE (R(a)), R(a) + + SUB.S $7, R(s) + RSB $0, R(q), R(M) // M = -q + MOVW.PL R(a)<<R(s), R(q) + + // 1st Newton iteration + MUL.PL R(M), R(q), R(a) // a = -q*d + BMI udiv_by_large_d + MULAWT R(a), R(q), R(q), R(q) // q approx q-(q*q*d>>32) + TEQ R(M)->1, R(M) // check for d=0 or d=1 + + // 2nd Newton iteration + MUL.NE R(M), R(q), R(a) + MOVW.NE $0, R(s) + MULAL.NE R(q), R(a), (R(q),R(s)) + BEQ udiv_by_0_or_1 + + // q now accurate enough for a remainder r, 0<=r<3*d + MULLU R(q), R(r), (R(q),R(s)) // q = (r * q) >> 32 + ADD R(M), R(r), R(r) // r = n - d + MULA R(M), R(q), R(r), R(r) // r = n - (q+1)*d + + // since 0 <= n-q*d < 3*d; thus -d <= r < 2*d + CMN R(M), R(r) // t = r-d + SUB.CS R(M), R(r), R(r) // if (t<-d || t>=0) r=r+d + ADD.CC $1, R(q) + ADD.PL R(M)<<1, R(r) + ADD.PL $2, R(q) + RET + +udiv_by_large_d: + // at this point we know d>=2^(31-6)=2^25 + SUB $4, R(a), R(a) + RSB $0, R(s), R(s) + MOVW R(a)>>R(s), R(q) + MULLU R(q), R(r), (R(q),R(s)) + MULA R(M), R(q), R(r), R(r) + + // q now accurate enough for a remainder r, 0<=r<4*d + CMN R(r)>>1, R(M) // if(r/2 >= d) + ADD.CS R(M)<<1, R(r) + ADD.CS $2, R(q) + CMN R(r), R(M) + ADD.CS R(M), R(r) + ADD.CS $1, R(q) + RET + +udiv_by_0_or_1: + // carry set if d==1, carry clear if d==0 + BCC udiv_by_0 + MOVW R(r), R(q) + MOVW $0, R(r) + RET + +udiv_by_0: + // The ARM toolchain expects it can emit references to DIV and MOD + // instructions. The linker rewrites each pseudo-instruction into + // a sequence that pushes two values onto the stack and then calls + // _divu, _modu, _div, or _mod (below), all of which have a 16-byte + // frame plus the saved LR. The traceback routine knows the expanded + // stack frame size at the pseudo-instruction call site, but it + // doesn't know that the frame has a non-standard layout. In particular, + // it expects to find a saved LR in the bottom word of the frame. + // Unwind the stack back to the pseudo-instruction call site, copy the + // saved LR where the traceback routine will look for it, and make it + // appear that panicdivide was called from that PC. 
+ MOVW 0(R13), LR + ADD $20, R13 + MOVW 8(R13), R1 // actual saved LR + MOVW R1, 0(R13) // expected here for traceback + B runtime·panicdivide(SB) + +// var tab [64]byte +// tab[0] = 255; for i := 1; i <= 63; i++ { tab[i] = (1<<14)/(64+i) } +// laid out here as little-endian uint32s +DATA fast_udiv_tab<>+0x00(SB)/4, $0xf4f8fcff +DATA fast_udiv_tab<>+0x04(SB)/4, $0xe6eaedf0 +DATA fast_udiv_tab<>+0x08(SB)/4, $0xdadde0e3 +DATA fast_udiv_tab<>+0x0c(SB)/4, $0xcfd2d4d7 +DATA fast_udiv_tab<>+0x10(SB)/4, $0xc5c7cacc +DATA fast_udiv_tab<>+0x14(SB)/4, $0xbcbec0c3 +DATA fast_udiv_tab<>+0x18(SB)/4, $0xb4b6b8ba +DATA fast_udiv_tab<>+0x1c(SB)/4, $0xacaeb0b2 +DATA fast_udiv_tab<>+0x20(SB)/4, $0xa5a7a8aa +DATA fast_udiv_tab<>+0x24(SB)/4, $0x9fa0a2a3 +DATA fast_udiv_tab<>+0x28(SB)/4, $0x999a9c9d +DATA fast_udiv_tab<>+0x2c(SB)/4, $0x93949697 +DATA fast_udiv_tab<>+0x30(SB)/4, $0x8e8f9092 +DATA fast_udiv_tab<>+0x34(SB)/4, $0x898a8c8d +DATA fast_udiv_tab<>+0x38(SB)/4, $0x85868788 +DATA fast_udiv_tab<>+0x3c(SB)/4, $0x81828384 +GLOBL fast_udiv_tab<>(SB), RODATA, $64 + +// The linker will pass numerator in R(TMP), and it also +// expects the result in R(TMP) +TMP = 11 + +TEXT _divu(SB), NOSPLIT, $16 + MOVW R(q), 4(R13) + MOVW R(r), 8(R13) + MOVW R(s), 12(R13) + MOVW R(M), 16(R13) + + MOVW R(TMP), R(r) /* numerator */ + MOVW 0(FP), R(q) /* denominator */ + BL udiv<>(SB) + MOVW R(q), R(TMP) + MOVW 4(R13), R(q) + MOVW 8(R13), R(r) + MOVW 12(R13), R(s) + MOVW 16(R13), R(M) + RET + +TEXT _modu(SB), NOSPLIT, $16 + MOVW R(q), 4(R13) + MOVW R(r), 8(R13) + MOVW R(s), 12(R13) + MOVW R(M), 16(R13) + + MOVW R(TMP), R(r) /* numerator */ + MOVW 0(FP), R(q) /* denominator */ + BL udiv<>(SB) + MOVW R(r), R(TMP) + MOVW 4(R13), R(q) + MOVW 8(R13), R(r) + MOVW 12(R13), R(s) + MOVW 16(R13), R(M) + RET + +TEXT _div(SB),NOSPLIT,$16 + MOVW R(q), 4(R13) + MOVW R(r), 8(R13) + MOVW R(s), 12(R13) + MOVW R(M), 16(R13) + MOVW R(TMP), R(r) /* numerator */ + MOVW 0(FP), R(q) /* denominator */ + CMP $0, R(r) + BGE d1 + RSB $0, R(r), R(r) + CMP $0, R(q) + BGE d2 + RSB $0, R(q), R(q) +d0: + BL udiv<>(SB) /* none/both neg */ + MOVW R(q), R(TMP) + B out1 +d1: + CMP $0, R(q) + BGE d0 + RSB $0, R(q), R(q) +d2: + BL udiv<>(SB) /* one neg */ + RSB $0, R(q), R(TMP) +out1: + MOVW 4(R13), R(q) + MOVW 8(R13), R(r) + MOVW 12(R13), R(s) + MOVW 16(R13), R(M) + RET + +TEXT _mod(SB),NOSPLIT,$16 + MOVW R(q), 4(R13) + MOVW R(r), 8(R13) + MOVW R(s), 12(R13) + MOVW R(M), 16(R13) + MOVW R(TMP), R(r) /* numerator */ + MOVW 0(FP), R(q) /* denominator */ + CMP $0, R(q) + RSB.LT $0, R(q), R(q) + CMP $0, R(r) + BGE m1 + RSB $0, R(r), R(r) + BL udiv<>(SB) /* neg numerator */ + RSB $0, R(r), R(TMP) + B out +m1: + BL udiv<>(SB) /* pos numerator */ + MOVW R(r), R(TMP) +out: + MOVW 4(R13), R(q) + MOVW 8(R13), R(r) + MOVW 12(R13), R(s) + MOVW 16(R13), R(M) + RET + +// _mul64by32 and _div64by32 not implemented on arm +TEXT runtime·_mul64by32(SB), NOSPLIT, $0 + MOVW $0, R0 + MOVW (R0), R1 // crash + +TEXT runtime·_div64by32(SB), NOSPLIT, $0 + MOVW $0, R0 + MOVW (R0), R1 // crash diff --git a/src/runtime/vlop_arm_test.go b/src/runtime/vlop_arm_test.go new file mode 100644 index 000000000..cd28419ad --- /dev/null +++ b/src/runtime/vlop_arm_test.go @@ -0,0 +1,70 @@ +// Copyright 2012 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. 
+ +package runtime_test + +import "testing" + +// arm soft division benchmarks adapted from +// http://ridiculousfish.com/files/division_benchmarks.tar.gz + +const numeratorsSize = 1 << 21 + +var numerators = randomNumerators() + +type randstate struct { + hi, lo uint32 +} + +func (r *randstate) rand() uint32 { + r.hi = r.hi<<16 + r.hi>>16 + r.hi += r.lo + r.lo += r.hi + return r.hi +} + +func randomNumerators() []uint32 { + numerators := make([]uint32, numeratorsSize) + random := &randstate{2147483563, 2147483563 ^ 0x49616E42} + for i := range numerators { + numerators[i] = random.rand() + } + return numerators +} + +func bmUint32Div(divisor uint32, b *testing.B) { + var sum uint32 + for i := 0; i < b.N; i++ { + sum += numerators[i&(numeratorsSize-1)] / divisor + } +} + +func BenchmarkUint32Div7(b *testing.B) { bmUint32Div(7, b) } +func BenchmarkUint32Div37(b *testing.B) { bmUint32Div(37, b) } +func BenchmarkUint32Div123(b *testing.B) { bmUint32Div(123, b) } +func BenchmarkUint32Div763(b *testing.B) { bmUint32Div(763, b) } +func BenchmarkUint32Div1247(b *testing.B) { bmUint32Div(1247, b) } +func BenchmarkUint32Div9305(b *testing.B) { bmUint32Div(9305, b) } +func BenchmarkUint32Div13307(b *testing.B) { bmUint32Div(13307, b) } +func BenchmarkUint32Div52513(b *testing.B) { bmUint32Div(52513, b) } +func BenchmarkUint32Div60978747(b *testing.B) { bmUint32Div(60978747, b) } +func BenchmarkUint32Div106956295(b *testing.B) { bmUint32Div(106956295, b) } + +func bmUint32Mod(divisor uint32, b *testing.B) { + var sum uint32 + for i := 0; i < b.N; i++ { + sum += numerators[i&(numeratorsSize-1)] % divisor + } +} + +func BenchmarkUint32Mod7(b *testing.B) { bmUint32Mod(7, b) } +func BenchmarkUint32Mod37(b *testing.B) { bmUint32Mod(37, b) } +func BenchmarkUint32Mod123(b *testing.B) { bmUint32Mod(123, b) } +func BenchmarkUint32Mod763(b *testing.B) { bmUint32Mod(763, b) } +func BenchmarkUint32Mod1247(b *testing.B) { bmUint32Mod(1247, b) } +func BenchmarkUint32Mod9305(b *testing.B) { bmUint32Mod(9305, b) } +func BenchmarkUint32Mod13307(b *testing.B) { bmUint32Mod(13307, b) } +func BenchmarkUint32Mod52513(b *testing.B) { bmUint32Mod(52513, b) } +func BenchmarkUint32Mod60978747(b *testing.B) { bmUint32Mod(60978747, b) } +func BenchmarkUint32Mod106956295(b *testing.B) { bmUint32Mod(106956295, b) } diff --git a/src/runtime/vlrt.c b/src/runtime/vlrt.c new file mode 100644 index 000000000..cb0d14796 --- /dev/null +++ b/src/runtime/vlrt.c @@ -0,0 +1,914 @@ +// Inferno's libkern/vlrt-386.c +// http://code.google.com/p/inferno-os/source/browse/libkern/vlrt-386.c +// +// Copyright © 1994-1999 Lucent Technologies Inc. All rights reserved. +// Revisions Copyright © 2000-2007 Vita Nuova Holdings Limited (www.vitanuova.com). All rights reserved. +// Portions Copyright 2009 The Go Authors. All rights reserved. +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in +// all copies or substantial portions of the Software. 
+// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +// THE SOFTWARE. + +// +build arm 386 + +#include "textflag.h" + +/* + * C runtime for 64-bit divide, others. + * + * TODO(rsc): The simple functions are dregs--8c knows how + * to generate the code directly now. Find and remove. + */ + +void runtime·panicdivide(void); + +typedef unsigned long ulong; +typedef unsigned int uint; +typedef unsigned short ushort; +typedef unsigned char uchar; +typedef signed char schar; + +#define SIGN(n) (1UL<<(n-1)) + +typedef struct Vlong Vlong; +struct Vlong +{ + ulong lo; + ulong hi; +}; + +typedef union Vlong64 Vlong64; +union Vlong64 +{ + long long v; + Vlong v2; +}; + +void runtime·abort(void); + +#pragma textflag NOSPLIT +Vlong +_addv(Vlong a, Vlong b) +{ + Vlong r; + + r.lo = a.lo + b.lo; + r.hi = a.hi + b.hi; + if(r.lo < a.lo) + r.hi++; + return r; +} + +#pragma textflag NOSPLIT +Vlong +_subv(Vlong a, Vlong b) +{ + Vlong r; + + r.lo = a.lo - b.lo; + r.hi = a.hi - b.hi; + if(r.lo > a.lo) + r.hi--; + return r; +} + +Vlong +_d2v(double d) +{ + union { double d; Vlong vl; } x; + ulong xhi, xlo, ylo, yhi; + int sh; + Vlong y; + + x.d = d; + + xhi = (x.vl.hi & 0xfffff) | 0x100000; + xlo = x.vl.lo; + sh = 1075 - ((x.vl.hi >> 20) & 0x7ff); + + ylo = 0; + yhi = 0; + if(sh >= 0) { + /* v = (hi||lo) >> sh */ + if(sh < 32) { + if(sh == 0) { + ylo = xlo; + yhi = xhi; + } else { + ylo = (xlo >> sh) | (xhi << (32-sh)); + yhi = xhi >> sh; + } + } else { + if(sh == 32) { + ylo = xhi; + } else + if(sh < 64) { + ylo = xhi >> (sh-32); + } + } + } else { + /* v = (hi||lo) << -sh */ + sh = -sh; + if(sh <= 10) { /* NOTE: sh <= 11 on ARM??? */ + ylo = xlo << sh; + yhi = (xhi << sh) | (xlo >> (32-sh)); + } else { + /* overflow */ + yhi = d; /* causes something awful */ + } + } + if(x.vl.hi & SIGN(32)) { + if(ylo != 0) { + ylo = -ylo; + yhi = ~yhi; + } else + yhi = -yhi; + } + + y.hi = yhi; + y.lo = ylo; + return y; +} + +Vlong +_f2v(float f) +{ + return _d2v(f); +} + +double +_ul2d(ulong u) +{ + // compensate for bug in c + if(u & SIGN(32)) { + u ^= SIGN(32); + return 2147483648. + u; + } + return u; +} + +double +_v2d(Vlong x) +{ + if(x.hi & SIGN(32)) { + if(x.lo) { + x.lo = -x.lo; + x.hi = ~x.hi; + } else + x.hi = -x.hi; + return -(_ul2d(x.hi)*4294967296. + _ul2d(x.lo)); + } + return (long)x.hi*4294967296. 
+ x.lo; +} + +float +_v2f(Vlong x) +{ + return _v2d(x); +} + +ulong runtime·_div64by32(Vlong, ulong, ulong*); +int runtime·_mul64by32(Vlong*, Vlong, ulong); + +static void +slowdodiv(Vlong num, Vlong den, Vlong *q, Vlong *r) +{ + ulong numlo, numhi, denhi, denlo, quohi, quolo, t; + int i; + + numhi = num.hi; + numlo = num.lo; + denhi = den.hi; + denlo = den.lo; + + /* + * get a divide by zero + */ + if(denlo==0 && denhi==0) { + runtime·panicdivide(); + } + + /* + * set up the divisor and find the number of iterations needed + */ + if(numhi >= SIGN(32)) { + quohi = SIGN(32); + quolo = 0; + } else { + quohi = numhi; + quolo = numlo; + } + i = 0; + while(denhi < quohi || (denhi == quohi && denlo < quolo)) { + denhi = (denhi<<1) | (denlo>>31); + denlo <<= 1; + i++; + } + + quohi = 0; + quolo = 0; + for(; i >= 0; i--) { + quohi = (quohi<<1) | (quolo>>31); + quolo <<= 1; + if(numhi > denhi || (numhi == denhi && numlo >= denlo)) { + t = numlo; + numlo -= denlo; + if(numlo > t) + numhi--; + numhi -= denhi; + quolo |= 1; + } + denlo = (denlo>>1) | (denhi<<31); + denhi >>= 1; + } + + if(q) { + q->lo = quolo; + q->hi = quohi; + } + if(r) { + r->lo = numlo; + r->hi = numhi; + } +} + +#ifdef GOARCH_arm +static void +dodiv(Vlong num, Vlong den, Vlong *qp, Vlong *rp) +{ + slowdodiv(num, den, qp, rp); +} +#endif + +#ifdef GOARCH_386 +static void +dodiv(Vlong num, Vlong den, Vlong *qp, Vlong *rp) +{ + ulong n; + Vlong x, q, r; + + if(den.hi > num.hi || (den.hi == num.hi && den.lo > num.lo)){ + if(qp) { + qp->hi = 0; + qp->lo = 0; + } + if(rp) { + rp->hi = num.hi; + rp->lo = num.lo; + } + return; + } + + if(den.hi != 0){ + q.hi = 0; + n = num.hi/den.hi; + if(runtime·_mul64by32(&x, den, n) || x.hi > num.hi || (x.hi == num.hi && x.lo > num.lo)) + slowdodiv(num, den, &q, &r); + else { + q.lo = n; + *(long long*)&r = *(long long*)&num - *(long long*)&x; + } + } else { + if(num.hi >= den.lo){ + if(den.lo == 0) + runtime·panicdivide(); + q.hi = n = num.hi/den.lo; + num.hi -= den.lo*n; + } else { + q.hi = 0; + } + q.lo = runtime·_div64by32(num, den.lo, &r.lo); + r.hi = 0; + } + if(qp) { + qp->lo = q.lo; + qp->hi = q.hi; + } + if(rp) { + rp->lo = r.lo; + rp->hi = r.hi; + } +} +#endif + +Vlong +_divvu(Vlong n, Vlong d) +{ + Vlong q; + + if(n.hi == 0 && d.hi == 0) { + if(d.lo == 0) + runtime·panicdivide(); + q.hi = 0; + q.lo = n.lo / d.lo; + return q; + } + dodiv(n, d, &q, 0); + return q; +} + +Vlong +_modvu(Vlong n, Vlong d) +{ + Vlong r; + + if(n.hi == 0 && d.hi == 0) { + if(d.lo == 0) + runtime·panicdivide(); + r.hi = 0; + r.lo = n.lo % d.lo; + return r; + } + dodiv(n, d, 0, &r); + return r; +} + +static void +vneg(Vlong *v) +{ + + if(v->lo == 0) { + v->hi = -v->hi; + return; + } + v->lo = -v->lo; + v->hi = ~v->hi; +} + +Vlong +_divv(Vlong n, Vlong d) +{ + long nneg, dneg; + Vlong q; + + if(n.hi == (((long)n.lo)>>31) && d.hi == (((long)d.lo)>>31)) { + if((long)n.lo == -0x80000000 && (long)d.lo == -1) { + // special case: 32-bit -0x80000000 / -1 causes divide error, + // but it's okay in this 64-bit context. 
+ q.lo = 0x80000000; + q.hi = 0; + return q; + } + if(d.lo == 0) + runtime·panicdivide(); + q.lo = (long)n.lo / (long)d.lo; + q.hi = ((long)q.lo) >> 31; + return q; + } + nneg = n.hi >> 31; + if(nneg) + vneg(&n); + dneg = d.hi >> 31; + if(dneg) + vneg(&d); + dodiv(n, d, &q, 0); + if(nneg != dneg) + vneg(&q); + return q; +} + +Vlong +_modv(Vlong n, Vlong d) +{ + long nneg, dneg; + Vlong r; + + if(n.hi == (((long)n.lo)>>31) && d.hi == (((long)d.lo)>>31)) { + if((long)n.lo == -0x80000000 && (long)d.lo == -1) { + // special case: 32-bit -0x80000000 % -1 causes divide error, + // but it's okay in this 64-bit context. + r.lo = 0; + r.hi = 0; + return r; + } + if(d.lo == 0) + runtime·panicdivide(); + r.lo = (long)n.lo % (long)d.lo; + r.hi = ((long)r.lo) >> 31; + return r; + } + nneg = n.hi >> 31; + if(nneg) + vneg(&n); + dneg = d.hi >> 31; + if(dneg) + vneg(&d); + dodiv(n, d, 0, &r); + if(nneg) + vneg(&r); + return r; +} + +#pragma textflag NOSPLIT +Vlong +_rshav(Vlong a, int b) +{ + long t; + Vlong r; + + t = a.hi; + if(b >= 32) { + r.hi = t>>31; + if(b >= 64) { + /* this is illegal re C standard */ + r.lo = t>>31; + return r; + } + r.lo = t >> (b-32); + return r; + } + if(b <= 0) { + r.hi = t; + r.lo = a.lo; + return r; + } + r.hi = t >> b; + r.lo = (t << (32-b)) | (a.lo >> b); + return r; +} + +#pragma textflag NOSPLIT +Vlong +_rshlv(Vlong a, int b) +{ + ulong t; + Vlong r; + + t = a.hi; + if(b >= 32) { + r.hi = 0; + if(b >= 64) { + /* this is illegal re C standard */ + r.lo = 0; + return r; + } + r.lo = t >> (b-32); + return r; + } + if(b <= 0) { + r.hi = t; + r.lo = a.lo; + return r; + } + r.hi = t >> b; + r.lo = (t << (32-b)) | (a.lo >> b); + return r; +} + +#pragma textflag NOSPLIT +Vlong +_lshv(Vlong a, int b) +{ + ulong t; + + t = a.lo; + if(b >= 32) { + if(b >= 64) { + /* this is illegal re C standard */ + return (Vlong){0, 0}; + } + return (Vlong){0, t<<(b-32)}; + } + if(b <= 0) { + return (Vlong){t, a.hi}; + } + return (Vlong){t<<b, (t >> (32-b)) | (a.hi << b)}; +} + +#pragma textflag NOSPLIT +Vlong +_andv(Vlong a, Vlong b) +{ + Vlong r; + + r.hi = a.hi & b.hi; + r.lo = a.lo & b.lo; + return r; +} + +#pragma textflag NOSPLIT +Vlong +_orv(Vlong a, Vlong b) +{ + Vlong r; + + r.hi = a.hi | b.hi; + r.lo = a.lo | b.lo; + return r; +} + +#pragma textflag NOSPLIT +Vlong +_xorv(Vlong a, Vlong b) +{ + Vlong r; + + r.hi = a.hi ^ b.hi; + r.lo = a.lo ^ b.lo; + return r; +} + +Vlong +_vpp(Vlong *r) +{ + Vlong l; + + l = *r; + r->lo++; + if(r->lo == 0) + r->hi++; + return l; +} + +#pragma textflag NOSPLIT +Vlong +_vmm(Vlong *r) +{ + Vlong l; + + l = *r; + if(r->lo == 0) + r->hi--; + r->lo--; + return l; +} + +#pragma textflag NOSPLIT +Vlong +_ppv(Vlong *r) +{ + + r->lo++; + if(r->lo == 0) + r->hi++; + return *r; +} + +#pragma textflag NOSPLIT +Vlong +_mmv(Vlong *r) +{ + + if(r->lo == 0) + r->hi--; + r->lo--; + return *r; +} + +#pragma textflag NOSPLIT +Vlong +_vasop(void *lv, Vlong fn(Vlong, Vlong), int type, Vlong rv) +{ + Vlong t, u; + + u.lo = 0; + u.hi = 0; + switch(type) { + default: + runtime·abort(); + break; + + case 1: /* schar */ + t.lo = *(schar*)lv; + t.hi = t.lo >> 31; + u = fn(t, rv); + *(schar*)lv = u.lo; + break; + + case 2: /* uchar */ + t.lo = *(uchar*)lv; + t.hi = 0; + u = fn(t, rv); + *(uchar*)lv = u.lo; + break; + + case 3: /* short */ + t.lo = *(short*)lv; + t.hi = t.lo >> 31; + u = fn(t, rv); + *(short*)lv = u.lo; + break; + + case 4: /* ushort */ + t.lo = *(ushort*)lv; + t.hi = 0; + u = fn(t, rv); + *(ushort*)lv = u.lo; + break; + + case 9: /* int */ + t.lo = *(int*)lv; + 
t.hi = t.lo >> 31; + u = fn(t, rv); + *(int*)lv = u.lo; + break; + + case 10: /* uint */ + t.lo = *(uint*)lv; + t.hi = 0; + u = fn(t, rv); + *(uint*)lv = u.lo; + break; + + case 5: /* long */ + t.lo = *(long*)lv; + t.hi = t.lo >> 31; + u = fn(t, rv); + *(long*)lv = u.lo; + break; + + case 6: /* ulong */ + t.lo = *(ulong*)lv; + t.hi = 0; + u = fn(t, rv); + *(ulong*)lv = u.lo; + break; + + case 7: /* vlong */ + case 8: /* uvlong */ + if((void*)fn == _lshv || (void*)fn == _rshav || (void*)fn == _rshlv) + u = ((Vlong(*)(Vlong,int))fn)(*(Vlong*)lv, *(int*)&rv); + else + u = fn(*(Vlong*)lv, rv); + *(Vlong*)lv = u; + break; + } + return u; +} + +#pragma textflag NOSPLIT +Vlong +_p2v(void *p) +{ + long t; + Vlong ret; + + t = (ulong)p; + ret.lo = t; + ret.hi = 0; + return ret; +} + +#pragma textflag NOSPLIT +Vlong +_sl2v(long sl) +{ + long t; + Vlong ret; + + t = sl; + ret.lo = t; + ret.hi = t >> 31; + return ret; +} + +#pragma textflag NOSPLIT +Vlong +_ul2v(ulong ul) +{ + long t; + Vlong ret; + + t = ul; + ret.lo = t; + ret.hi = 0; + return ret; +} + +#pragma textflag NOSPLIT +Vlong +_si2v(int si) +{ + return (Vlong){si, si>>31}; +} + +#pragma textflag NOSPLIT +Vlong +_ui2v(uint ui) +{ + long t; + Vlong ret; + + t = ui; + ret.lo = t; + ret.hi = 0; + return ret; +} + +#pragma textflag NOSPLIT +Vlong +_sh2v(long sh) +{ + long t; + Vlong ret; + + t = (sh << 16) >> 16; + ret.lo = t; + ret.hi = t >> 31; + return ret; +} + +#pragma textflag NOSPLIT +Vlong +_uh2v(ulong ul) +{ + long t; + Vlong ret; + + t = ul & 0xffff; + ret.lo = t; + ret.hi = 0; + return ret; +} + +#pragma textflag NOSPLIT +Vlong +_sc2v(long uc) +{ + long t; + Vlong ret; + + t = (uc << 24) >> 24; + ret.lo = t; + ret.hi = t >> 31; + return ret; +} + +#pragma textflag NOSPLIT +Vlong +_uc2v(ulong ul) +{ + long t; + Vlong ret; + + t = ul & 0xff; + ret.lo = t; + ret.hi = 0; + return ret; +} + +#pragma textflag NOSPLIT +long +_v2sc(Vlong rv) +{ + long t; + + t = rv.lo & 0xff; + return (t << 24) >> 24; +} + +#pragma textflag NOSPLIT +long +_v2uc(Vlong rv) +{ + + return rv.lo & 0xff; +} + +#pragma textflag NOSPLIT +long +_v2sh(Vlong rv) +{ + long t; + + t = rv.lo & 0xffff; + return (t << 16) >> 16; +} + +#pragma textflag NOSPLIT +long +_v2uh(Vlong rv) +{ + + return rv.lo & 0xffff; +} + +#pragma textflag NOSPLIT +long +_v2sl(Vlong rv) +{ + + return rv.lo; +} + +#pragma textflag NOSPLIT +long +_v2ul(Vlong rv) +{ + + return rv.lo; +} + +#pragma textflag NOSPLIT +long +_v2si(Vlong rv) +{ + return rv.lo; +} + +#pragma textflag NOSPLIT +long +_v2ui(Vlong rv) +{ + + return rv.lo; +} + +#pragma textflag NOSPLIT +int +_testv(Vlong rv) +{ + return rv.lo || rv.hi; +} + +#pragma textflag NOSPLIT +int +_eqv(Vlong lv, Vlong rv) +{ + return lv.lo == rv.lo && lv.hi == rv.hi; +} + +#pragma textflag NOSPLIT +int +_nev(Vlong lv, Vlong rv) +{ + return lv.lo != rv.lo || lv.hi != rv.hi; +} + +#pragma textflag NOSPLIT +int +_ltv(Vlong lv, Vlong rv) +{ + return (long)lv.hi < (long)rv.hi || + (lv.hi == rv.hi && lv.lo < rv.lo); +} + +#pragma textflag NOSPLIT +int +_lev(Vlong lv, Vlong rv) +{ + return (long)lv.hi < (long)rv.hi || + (lv.hi == rv.hi && lv.lo <= rv.lo); +} + +#pragma textflag NOSPLIT +int +_gtv(Vlong lv, Vlong rv) +{ + return (long)lv.hi > (long)rv.hi || + (lv.hi == rv.hi && lv.lo > rv.lo); +} + +#pragma textflag NOSPLIT +int +_gev(Vlong lv, Vlong rv) +{ + return (long)lv.hi > (long)rv.hi || + (lv.hi == rv.hi && lv.lo >= rv.lo); +} + +#pragma textflag NOSPLIT +int +_lov(Vlong lv, Vlong rv) +{ + return lv.hi < rv.hi || + (lv.hi == rv.hi && lv.lo < rv.lo); 
+} + +#pragma textflag NOSPLIT +int +_lsv(Vlong lv, Vlong rv) +{ + return lv.hi < rv.hi || + (lv.hi == rv.hi && lv.lo <= rv.lo); +} + +#pragma textflag NOSPLIT +int +_hiv(Vlong lv, Vlong rv) +{ + return lv.hi > rv.hi || + (lv.hi == rv.hi && lv.lo > rv.lo); +} + +#pragma textflag NOSPLIT +int +_hsv(Vlong lv, Vlong rv) +{ + return lv.hi > rv.hi || + (lv.hi == rv.hi && lv.lo >= rv.lo); +} diff --git a/src/runtime/vlrt.go b/src/runtime/vlrt.go new file mode 100644 index 000000000..6370732ca --- /dev/null +++ b/src/runtime/vlrt.go @@ -0,0 +1,258 @@ +// Inferno's libkern/vlrt-arm.c +// http://code.google.com/p/inferno-os/source/browse/libkern/vlrt-arm.c +// +// Copyright © 1994-1999 Lucent Technologies Inc. All rights reserved. +// Revisions Copyright © 2000-2007 Vita Nuova Holdings Limited (www.vitanuova.com). All rights reserved. +// Portions Copyright 2009 The Go Authors. All rights reserved. +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in +// all copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +// THE SOFTWARE. 
+ +// +build arm 386 + +package runtime + +import "unsafe" + +const ( + sign32 = 1 << (32 - 1) + sign64 = 1 << (64 - 1) +) + +func float64toint64(d float64) (y uint64) { + _d2v(&y, d) + return +} + +func float64touint64(d float64) (y uint64) { + _d2v(&y, d) + return +} + +func int64tofloat64(y int64) float64 { + if y < 0 { + return -uint64tofloat64(-uint64(y)) + } + return uint64tofloat64(uint64(y)) +} + +func uint64tofloat64(y uint64) float64 { + hi := float64(uint32(y >> 32)) + lo := float64(uint32(y)) + d := hi*(1<<32) + lo + return d +} + +func _d2v(y *uint64, d float64) { + x := *(*uint64)(unsafe.Pointer(&d)) + + xhi := uint32(x>>32)&0xfffff | 0x100000 + xlo := uint32(x) + sh := 1075 - int32(uint32(x>>52)&0x7ff) + + var ylo, yhi uint32 + if sh >= 0 { + sh := uint32(sh) + /* v = (hi||lo) >> sh */ + if sh < 32 { + if sh == 0 { + ylo = xlo + yhi = xhi + } else { + ylo = xlo>>sh | xhi<<(32-sh) + yhi = xhi >> sh + } + } else { + if sh == 32 { + ylo = xhi + } else if sh < 64 { + ylo = xhi >> (sh - 32) + } + } + } else { + /* v = (hi||lo) << -sh */ + sh := uint32(-sh) + if sh <= 11 { + ylo = xlo << sh + yhi = xhi<<sh | xlo>>(32-sh) + } else { + /* overflow */ + yhi = uint32(d) /* causes something awful */ + } + } + if x&sign64 != 0 { + if ylo != 0 { + ylo = -ylo + yhi = ^yhi + } else { + yhi = -yhi + } + } + + *y = uint64(yhi)<<32 | uint64(ylo) +} + +func uint64div(n, d uint64) uint64 { + // Check for 32 bit operands + if uint32(n>>32) == 0 && uint32(d>>32) == 0 { + if uint32(d) == 0 { + panicdivide() + } + return uint64(uint32(n) / uint32(d)) + } + q, _ := dodiv(n, d) + return q +} + +func uint64mod(n, d uint64) uint64 { + // Check for 32 bit operands + if uint32(n>>32) == 0 && uint32(d>>32) == 0 { + if uint32(d) == 0 { + panicdivide() + } + return uint64(uint32(n) % uint32(d)) + } + _, r := dodiv(n, d) + return r +} + +func int64div(n, d int64) int64 { + // Check for 32 bit operands + if int64(int32(n)) == n && int64(int32(d)) == d { + if int32(n) == -0x80000000 && int32(d) == -1 { + // special case: 32-bit -0x80000000 / -1 = -0x80000000, + // but 64-bit -0x80000000 / -1 = 0x80000000. + return 0x80000000 + } + if int32(d) == 0 { + panicdivide() + } + return int64(int32(n) / int32(d)) + } + + nneg := n < 0 + dneg := d < 0 + if nneg { + n = -n + } + if dneg { + d = -d + } + uq, _ := dodiv(uint64(n), uint64(d)) + q := int64(uq) + if nneg != dneg { + q = -q + } + return q +} + +func int64mod(n, d int64) int64 { + // Check for 32 bit operands + if int64(int32(n)) == n && int64(int32(d)) == d { + if int32(d) == 0 { + panicdivide() + } + return int64(int32(n) % int32(d)) + } + + nneg := n < 0 + if nneg { + n = -n + } + if d < 0 { + d = -d + } + _, ur := dodiv(uint64(n), uint64(d)) + r := int64(ur) + if nneg { + r = -r + } + return r +} + +//go:noescape +func _mul64by32(lo64 *uint64, a uint64, b uint32) (hi32 uint32) + +//go:noescape +func _div64by32(a uint64, b uint32, r *uint32) (q uint32) + +func dodiv(n, d uint64) (q, r uint64) { + if GOARCH == "arm" { + // arm doesn't have a division instruction, so + // slowdodiv is the best that we can do. + // TODO: revisit for arm64. 
+ return slowdodiv(n, d) + } + + if d > n { + return 0, n + } + + if uint32(d>>32) != 0 { + t := uint32(n>>32) / uint32(d>>32) + var lo64 uint64 + hi32 := _mul64by32(&lo64, d, t) + if hi32 != 0 || lo64 > n { + return slowdodiv(n, d) + } + return uint64(t), n - lo64 + } + + // d is 32 bit + var qhi uint32 + if uint32(n>>32) >= uint32(d) { + if uint32(d) == 0 { + panicdivide() + } + qhi = uint32(n>>32) / uint32(d) + n -= uint64(uint32(d)*qhi) << 32 + } else { + qhi = 0 + } + + var rlo uint32 + qlo := _div64by32(n, uint32(d), &rlo) + return uint64(qhi)<<32 + uint64(qlo), uint64(rlo) +} + +func slowdodiv(n, d uint64) (q, r uint64) { + if d == 0 { + panicdivide() + } + + // Set up the divisor and find the number of iterations needed. + capn := n + if n >= sign64 { + capn = sign64 + } + i := 0 + for d < capn { + d <<= 1 + i++ + } + + for ; i >= 0; i-- { + q <<= 1 + if n >= d { + n -= d + q |= 1 + } + d >>= 1 + } + return q, n +} |
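The C helpers above (_divv, _modv, the shift, compare and conversion routines) emulate 64-bit integers on 32-bit targets by keeping each value as a Vlong pair of 32-bit halves, while vlrt.go (built for arm and 386, per its build tag) provides the same arithmetic to compiled Go code. As a rough, self-contained illustration of the hi/lo technique — the type name vlong, the function rsha and the demo values are ours, not the runtime's — this sketch mirrors what _rshav does for an arithmetic right shift:

// A minimal sketch (not runtime code): a 64-bit value held as two 32-bit
// halves, with an arithmetic right shift in the spirit of _rshav above.
// The names vlong and rsha are illustrative only.
package main

import "fmt"

type vlong struct {
	lo uint32
	hi uint32
}

// rsha shifts a right by b bits (0 <= b < 64), sign-extending from hi.
func rsha(a vlong, b uint) vlong {
	t := int32(a.hi) // signed view of the high half
	if b >= 32 {
		// The whole result comes from the (sign-extended) high half.
		return vlong{lo: uint32(t >> (b - 32)), hi: uint32(t >> 31)}
	}
	if b == 0 {
		return a
	}
	// The low half keeps its upper bits plus b bits carried down from hi.
	return vlong{
		lo: uint32(t)<<(32-b) | a.lo>>b,
		hi: uint32(t >> b),
	}
}

func main() {
	x := vlong{lo: 0, hi: 0xfffffff0} // the 64-bit value -0x1000000000
	y := rsha(x, 36)
	fmt.Printf("%#x %#x\n", y.hi, y.lo) // 0xffffffff 0xffffffff, i.e. -1
}

For division, dodiv first tries the 64-by-32 helpers (_div64by32, _mul64by32) and falls back to slowdodiv, a plain shift-and-subtract long division; per the comment in dodiv, arm has no division instruction in this release, so dodiv always takes that path there, while on 386 the fallback is only reached when the divisor has bits in its upper half and the hi/hi quotient estimate overshoots. The sketch below (our own name div64slow, not runtime code) restates the fallback against ordinary uint64 values to show why the loop terminates and why n ends up holding the remainder:

// A minimal sketch (not runtime code) of the shift-and-subtract long
// division that slowdodiv performs; div64slow is an illustrative name.
package main

import "fmt"

func div64slow(n, d uint64) (q, r uint64) {
	if d == 0 {
		panic("integer divide by zero")
	}
	// Align d with n: shift d left until it would pass n, capping at the
	// top bit so d itself cannot overflow. i counts the shifts.
	const sign64 = 1 << 63
	capn := n
	if n >= sign64 {
		capn = sign64
	}
	i := 0
	for d < capn {
		d <<= 1
		i++
	}
	// Produce one quotient bit per step, most significant first; whatever
	// is left in n at the end is the remainder.
	for ; i >= 0; i-- {
		q <<= 1
		if n >= d {
			n -= d
			q |= 1
		}
		d >>= 1
	}
	return q, n
}

func main() {
	n, d := uint64(1)<<40+123, uint64(1000)
	q, r := div64slow(n, d)
	fmt.Println(q, r, q == n/d, r == n%d) // 1099511627 899 true true
}

Each pass of the second loop undoes one of the alignment shifts, so after i+1 passes d is back at its original value, q holds the quotient bits in order, and the remaining n is the remainder.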