author: Michael Stapelberg <stapelberg@debian.org> 2013-03-04 21:27:36 +0100
committer: Michael Stapelberg <michael@stapelberg.de> 2013-03-04 21:27:36 +0100
commit: 04b08da9af0c450d645ab7389d1467308cfc2db8 (patch)
tree: db247935fa4f2f94408edc3acd5d0d4f997aa0d8 /src/pkg/runtime
parent: 917c5fb8ec48e22459d77e3849e6d388f93d3260 (diff)
download: golang-upstream/1.1_hg20130304.tar.gz

Imported Upstream version 1.1~hg20130304 (tag: upstream/1.1_hg20130304)
Diffstat (limited to 'src/pkg/runtime'): 229 files changed, 20551 insertions, 5742 deletions
diff --git a/src/pkg/runtime/alg.c b/src/pkg/runtime/alg.c index 36973eba3..ad85b43ae 100644 --- a/src/pkg/runtime/alg.c +++ b/src/pkg/runtime/alg.c @@ -19,13 +19,13 @@ runtime·memhash(uintptr *h, uintptr s, void *a) uintptr hash; b = a; - hash = M0; + hash = M0 ^ *h; while(s > 0) { hash = (hash ^ *b) * M1; b++; s--; } - *h = (*h ^ hash) * M1; + *h = hash; } void @@ -316,7 +316,7 @@ runtime·strhash(uintptr *h, uintptr s, void *a) void runtime·strequal(bool *eq, uintptr s, void *a, void *b) { - int32 alen; + intgo alen; USED(s); alen = ((String*)a)->len; @@ -324,6 +324,10 @@ runtime·strequal(bool *eq, uintptr s, void *a, void *b) *eq = false; return; } + if(((String*)a)->str == ((String*)b)->str) { + *eq = true; + return; + } runtime·memequal(eq, alen, ((String*)a)->str, ((String*)b)->str); } @@ -351,7 +355,7 @@ void runtime·interhash(uintptr *h, uintptr s, void *a) { USED(s); - *h = (*h ^ runtime·ifacehash(*(Iface*)a)) * M1; + *h = runtime·ifacehash(*(Iface*)a, *h ^ M0) * M1; } void @@ -385,7 +389,7 @@ void runtime·nilinterhash(uintptr *h, uintptr s, void *a) { USED(s); - *h = (*h ^ runtime·efacehash(*(Eface*)a)) * M1; + *h = runtime·efacehash(*(Eface*)a, *h ^ M0) * M1; } void @@ -469,10 +473,11 @@ void runtime·equal(Type *t, ...) { byte *x, *y; - bool *ret; + uintptr ret; x = (byte*)(&t+1); y = x + t->size; - ret = (bool*)(y + t->size); - t->alg->equal(ret, t->size, x, y); + ret = (uintptr)(y + t->size); + ret = ROUND(ret, Structrnd); + t->alg->equal((bool*)ret, t->size, x, y); } diff --git a/src/pkg/runtime/append_test.go b/src/pkg/runtime/append_test.go index b8552224e..36390181e 100644 --- a/src/pkg/runtime/append_test.go +++ b/src/pkg/runtime/append_test.go @@ -19,6 +19,67 @@ func BenchmarkAppend(b *testing.B) { } } +func benchmarkAppendBytes(b *testing.B, length int) { + b.StopTimer() + x := make([]byte, 0, N) + y := make([]byte, length) + b.StartTimer() + for i := 0; i < b.N; i++ { + x = x[0:0] + x = append(x, y...) + } +} + +func BenchmarkAppend1Byte(b *testing.B) { + benchmarkAppendBytes(b, 1) +} + +func BenchmarkAppend4Bytes(b *testing.B) { + benchmarkAppendBytes(b, 4) +} + +func BenchmarkAppend8Bytes(b *testing.B) { + benchmarkAppendBytes(b, 8) +} + +func BenchmarkAppend16Bytes(b *testing.B) { + benchmarkAppendBytes(b, 16) +} + +func BenchmarkAppend32Bytes(b *testing.B) { + benchmarkAppendBytes(b, 32) +} + +func benchmarkAppendStr(b *testing.B, str string) { + b.StopTimer() + x := make([]byte, 0, N) + b.StartTimer() + for i := 0; i < b.N; i++ { + x = x[0:0] + x = append(x, str...) + } +} + +func BenchmarkAppendStr1Byte(b *testing.B) { + benchmarkAppendStr(b, "1") +} + +func BenchmarkAppendStr4Bytes(b *testing.B) { + benchmarkAppendStr(b, "1234") +} + +func BenchmarkAppendStr8Bytes(b *testing.B) { + benchmarkAppendStr(b, "12345678") +} + +func BenchmarkAppendStr16Bytes(b *testing.B) { + benchmarkAppendStr(b, "1234567890123456") +} + +func BenchmarkAppendStr32Bytes(b *testing.B) { + benchmarkAppendStr(b, "12345678901234567890123456789012") +} + func BenchmarkAppendSpecialCase(b *testing.B) { b.StopTimer() x := make([]int, 0, N) @@ -50,3 +111,13 @@ func TestSideEffectOrder(t *testing.T) { t.Error("append failed: ", x[0], x[1]) } } + +func TestAppendOverlap(t *testing.T) { + x := []byte("1234") + x = append(x[1:], x...) // p > q in runtime·appendslice. 
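A note on the alg.c hunk above: it makes two behavioral changes. memhash now folds the caller's running hash into the initial state (`hash = M0 ^ *h` instead of mixing `*h` in at the end), so hashes of composite values chain, and strequal gains a pointer-equality fast path before falling back to memequal. A minimal Go sketch of both, with illustrative multiplier constants (the runtime's real M0/M1 differ, and `unsafe.StringData` is modern Go standing in for the `((String*)a)->str` access):

```go
package main

import (
	"fmt"
	"unsafe"
)

// Illustrative multipliers; the runtime's M0 and M1 are different values.
const (
	m0 uintptr = 2654435761
	m1 uintptr = 16777619
)

// memhash mirrors the seeded form: the caller's running hash h is
// XORed into the initial state rather than mixed in afterwards.
func memhash(h uintptr, b []byte) uintptr {
	hash := m0 ^ h
	for _, c := range b {
		hash = (hash ^ uintptr(c)) * m1
	}
	return hash
}

// strequal mirrors the added fast path: compare lengths, then the
// backing data pointers, and only then the bytes themselves.
func strequal(a, b string) bool {
	if len(a) != len(b) {
		return false
	}
	if unsafe.StringData(a) == unsafe.StringData(b) {
		return true
	}
	return a == b
}

func main() {
	// Different seeds now yield different hashes for the same bytes.
	fmt.Println(memhash(1, []byte("abc")) == memhash(2, []byte("abc"))) // false
	s := "hello"
	fmt.Println(strequal(s, s)) // true via the pointer fast path
}
```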
+ got := string(x) + want := "2341234" + if got != want { + t.Errorf("overlap failed: got %q want %q", got, want) + } +} diff --git a/src/pkg/runtime/arch_386.h b/src/pkg/runtime/arch_386.h index a0798f99e..4df795f71 100644 --- a/src/pkg/runtime/arch_386.h +++ b/src/pkg/runtime/arch_386.h @@ -1,4 +1,6 @@ enum { thechar = '8', - CacheLineSize = 64 + BigEndian = 0, + CacheLineSize = 64, + appendCrossover = 16 }; diff --git a/src/pkg/runtime/arch_amd64.h b/src/pkg/runtime/arch_amd64.h index dd1cfc18d..e83dc9105 100644 --- a/src/pkg/runtime/arch_amd64.h +++ b/src/pkg/runtime/arch_amd64.h @@ -1,4 +1,6 @@ enum { thechar = '6', - CacheLineSize = 64 + BigEndian = 0, + CacheLineSize = 64, + appendCrossover = 16 }; diff --git a/src/pkg/runtime/arch_arm.h b/src/pkg/runtime/arch_arm.h index c1a7a0f37..f6af58514 100644 --- a/src/pkg/runtime/arch_arm.h +++ b/src/pkg/runtime/arch_arm.h @@ -1,4 +1,6 @@ enum { thechar = '5', - CacheLineSize = 32 + BigEndian = 0, + CacheLineSize = 32, + appendCrossover = 16 }; diff --git a/src/pkg/runtime/asm_386.s b/src/pkg/runtime/asm_386.s index 21bd293ab..96f04e0ae 100644 --- a/src/pkg/runtime/asm_386.s +++ b/src/pkg/runtime/asm_386.s @@ -14,22 +14,22 @@ TEXT _rt0_386(SB),7,$0 MOVL BX, 124(SP) // set default stack bounds. - // initcgo may update stackguard. + // _cgo_init may update stackguard. MOVL $runtime·g0(SB), BP LEAL (-64*1024+104)(SP), BX MOVL BX, g_stackguard(BP) MOVL SP, g_stackbase(BP) - // if there is an initcgo, call it to let it + // if there is an _cgo_init, call it to let it // initialize and to set up GS. if not, // we set up GS ourselves. - MOVL initcgo(SB), AX + MOVL _cgo_init(SB), AX TESTL AX, AX JZ needtls PUSHL BP CALL AX POPL BP - // skip runtime·ldt0setup(SB) and tls test after initcgo for non-windows + // skip runtime·ldt0setup(SB) and tls test after _cgo_init for non-windows CMPL runtime·iswindows(SB), $0 JEQ ok needtls: @@ -75,7 +75,7 @@ ok: CALL runtime·schedinit(SB) // create a new goroutine to start program - PUSHL $runtime·main(SB) // entry + PUSHL $runtime·main·f(SB) // entry PUSHL $0 // arg size CALL runtime·newproc(SB) POPL AX @@ -87,6 +87,9 @@ ok: INT $3 RET +DATA runtime·main·f+0(SB)/4,$runtime·main(SB) +GLOBL runtime·main·f(SB),8,$4 + TEXT runtime·breakpoint(SB),7,$0 INT $3 RET @@ -131,22 +134,40 @@ TEXT runtime·gogo(SB), 7, $0 MOVL gobuf_pc(BX), BX JMP BX -// void gogocall(Gobuf*, void (*fn)(void)) +// void gogocall(Gobuf*, void (*fn)(void), uintptr r0) // restore state from Gobuf but then call fn. // (call fn, returning to state in Gobuf) TEXT runtime·gogocall(SB), 7, $0 + MOVL 12(SP), DX // context MOVL 8(SP), AX // fn MOVL 4(SP), BX // gobuf - MOVL gobuf_g(BX), DX + MOVL gobuf_g(BX), DI get_tls(CX) - MOVL DX, g(CX) - MOVL 0(DX), CX // make sure g != nil + MOVL DI, g(CX) + MOVL 0(DI), CX // make sure g != nil MOVL gobuf_sp(BX), SP // restore SP MOVL gobuf_pc(BX), BX PUSHL BX JMP AX POPL BX // not reached +// void gogocallfn(Gobuf*, FuncVal*) +// restore state from Gobuf but then call fn. +// (call fn, returning to state in Gobuf) +TEXT runtime·gogocallfn(SB), 7, $0 + MOVL 8(SP), DX // fn + MOVL 4(SP), BX // gobuf + MOVL gobuf_g(BX), DI + get_tls(CX) + MOVL DI, g(CX) + MOVL 0(DI), CX // make sure g != nil + MOVL gobuf_sp(BX), SP // restore SP + MOVL gobuf_pc(BX), BX + PUSHL BX + MOVL 0(DX), BX + JMP BX + POPL BX // not reached + // void mcall(void (*fn)(G*)) // Switch to m->g0's stack, call fn(g). // Fn must never return. 
It should gogo(&g->sched) @@ -189,11 +210,13 @@ TEXT runtime·morestack(SB),7,$0 CMPL g(CX), SI JNE 2(PC) INT $3 + + MOVL DX, m_cret(BX) - // frame size in DX + // frame size in DI // arg size in AX // Save in m. - MOVL DX, m_moreframesize(BX) + MOVL DI, m_moreframesize(BX) MOVL AX, m_moreargsize(BX) // Called from f. @@ -299,6 +322,33 @@ TEXT runtime·cas(SB), 7, $0 MOVL $1, AX RET +// bool runtime·cas64(uint64 *val, uint64 *old, uint64 new) +// Atomically: +// if(*val == *old){ +// *val = new; +// return 1; +// } else { +// *old = *val +// return 0; +// } +TEXT runtime·cas64(SB), 7, $0 + MOVL 4(SP), BP + MOVL 8(SP), SI + MOVL 0(SI), AX + MOVL 4(SI), DX + MOVL 12(SP), BX + MOVL 16(SP), CX + LOCK + CMPXCHG8B 0(BP) + JNZ cas64_fail + MOVL $1, AX + RET +cas64_fail: + MOVL AX, 0(SI) + MOVL DX, 4(SI) + MOVL $0, AX + RET + // bool casp(void **p, void *old, void *new) // Atomically: // if(*p == old){ @@ -357,17 +407,49 @@ TEXT runtime·atomicstore(SB), 7, $0 XCHGL AX, 0(BX) RET +// uint64 atomicload64(uint64 volatile* addr); +// so actually +// void atomicload64(uint64 *res, uint64 volatile *addr); +TEXT runtime·atomicload64(SB), 7, $0 + MOVL 4(SP), BX + MOVL 8(SP), AX + // MOVQ (%EAX), %MM0 + BYTE $0x0f; BYTE $0x6f; BYTE $0x00 + // MOVQ %MM0, 0(%EBX) + BYTE $0x0f; BYTE $0x7f; BYTE $0x03 + // EMMS + BYTE $0x0F; BYTE $0x77 + RET + +// void runtime·atomicstore64(uint64 volatile* addr, uint64 v); +TEXT runtime·atomicstore64(SB), 7, $0 + MOVL 4(SP), AX + // MOVQ and EMMS were introduced on the Pentium MMX. + // MOVQ 0x8(%ESP), %MM0 + BYTE $0x0f; BYTE $0x6f; BYTE $0x44; BYTE $0x24; BYTE $0x08 + // MOVQ %MM0, (%EAX) + BYTE $0x0f; BYTE $0x7f; BYTE $0x00 + // EMMS + BYTE $0x0F; BYTE $0x77 + // This is essentially a no-op, but it provides required memory fencing. + // It can be replaced with MFENCE, but MFENCE was introduced only on the Pentium4 (SSE2). + MOVL $0, AX + LOCK + XADDL AX, (SP) + RET + // void jmpdefer(fn, sp); // called from deferreturn. // 1. pop the caller // 2. sub 5 bytes from the callers return // 3. jmp to the argument TEXT runtime·jmpdefer(SB), 7, $0 - MOVL 4(SP), AX // fn + MOVL 4(SP), DX // fn MOVL 8(SP), BX // caller sp LEAL -4(BX), SP // caller sp after CALL SUBL $5, (SP) // return to CALL again - JMP AX // but first run the deferred function + MOVL 0(DX), BX + JMP BX // but first run the deferred function // Dummy function to use in saved gobuf.PC, // to match SP pointing at a return address. @@ -416,23 +498,49 @@ TEXT runtime·asmcgocall(SB),7,$0 RET // cgocallback(void (*fn)(void*), void *frame, uintptr framesize) -// See cgocall.c for more details. +// Turn the fn into a Go func (by taking its address) and call +// cgocallback_gofunc. TEXT runtime·cgocallback(SB),7,$12 - MOVL fn+0(FP), AX - MOVL frame+4(FP), BX - MOVL framesize+8(FP), DX + LEAL fn+0(FP), AX + MOVL AX, 0(SP) + MOVL frame+4(FP), AX + MOVL AX, 4(SP) + MOVL framesize+8(FP), AX + MOVL AX, 8(SP) + MOVL $runtime·cgocallback_gofunc(SB), AX + CALL AX + RET - // Save current m->g0->sched.sp on stack and then set it to SP. +// cgocallback_gofunc(FuncVal*, void *frame, uintptr framesize) +// See cgocall.c for more details. +TEXT runtime·cgocallback_gofunc(SB),7,$12 + // If m is nil, Go did not create the current thread. + // Call needm to obtain one for temporary use. + // In this case, we're running on the thread stack, so there's + // lots of space, but the linker doesn't know. Hide the call from + // the linker analysis by using an indirect call through AX. 
get_tls(CX) +#ifdef GOOS_windows + CMPL CX, $0 + JNE 3(PC) + PUSHL $0 + JMP needm +#endif MOVL m(CX), BP - - // If m is nil, it is almost certainly because we have been called - // on a thread that Go did not create. We're going to crash as - // soon as we try to use m; instead, try to print a nice error and exit. + PUSHL BP CMPL BP, $0 - JNE 2(PC) - CALL runtime·badcallback(SB) + JNE havem +needm: + MOVL $runtime·needm(SB), AX + CALL AX + get_tls(CX) + MOVL m(CX), BP +havem: + // Now there's a valid m, and we're running on its m->g0. + // Save current m->g0->sched.sp on stack and then set it to SP. + // Save current sp in m->g0->sched.sp in preparation for + // switch back to m->curg stack. MOVL m_g0(BP), SI PUSHL (g_sched+gobuf_sp)(SI) MOVL SP, (g_sched+gobuf_sp)(SI) @@ -451,6 +559,10 @@ TEXT runtime·cgocallback(SB),7,$12 // a frame size of 12, the same amount that we use below), // so that the traceback will seamlessly trace back into // the earlier calls. + MOVL fn+0(FP), AX + MOVL frame+4(FP), BX + MOVL framesize+8(FP), DX + MOVL m_curg(BP), SI MOVL SI, g(CX) MOVL (g_sched+gobuf_sp)(SI), DI // prepare stack as DI @@ -488,10 +600,38 @@ TEXT runtime·cgocallback(SB),7,$12 MOVL SI, g(CX) MOVL (g_sched+gobuf_sp)(SI), SP POPL (g_sched+gobuf_sp)(SI) + + // If the m on entry was nil, we called needm above to borrow an m + // for the duration of the call. Since the call is over, return it with dropm. + POPL BP + CMPL BP, $0 + JNE 3(PC) + MOVL $runtime·dropm(SB), AX + CALL AX // Done! RET +// void setmg(M*, G*); set m and g. for use by needm. +TEXT runtime·setmg(SB), 7, $0 +#ifdef GOOS_windows + MOVL mm+0(FP), AX + CMPL AX, $0 + JNE settls + MOVL $0, 0x14(FS) + RET +settls: + LEAL m_tls(AX), AX + MOVL AX, 0x14(FS) +#endif + MOVL mm+0(FP), AX + get_tls(CX) + MOVL mm+0(FP), AX + MOVL AX, m(CX) + MOVL gg+4(FP), BX + MOVL BX, g(CX) + RET + // check that SP is in range [g->stackbase, g->stackguard) TEXT runtime·stackcheck(SB), 7, $0 get_tls(CX) diff --git a/src/pkg/runtime/asm_amd64.s b/src/pkg/runtime/asm_amd64.s index d41ab96d0..987958498 100644 --- a/src/pkg/runtime/asm_amd64.s +++ b/src/pkg/runtime/asm_amd64.s @@ -14,14 +14,14 @@ TEXT _rt0_amd64(SB),7,$-8 MOVQ BX, 24(SP) // create istack out of the given (operating system) stack. - // initcgo may update stackguard. + // _cgo_init may update stackguard. MOVQ $runtime·g0(SB), DI LEAQ (-64*1024+104)(SP), BX MOVQ BX, g_stackguard(DI) MOVQ SP, g_stackbase(DI) - // if there is an initcgo, call it. - MOVQ initcgo(SB), AX + // if there is an _cgo_init, call it. + MOVQ _cgo_init(SB), AX TESTQ AX, AX JZ needtls // g0 already in DI @@ -31,6 +31,10 @@ TEXT _rt0_amd64(SB),7,$-8 JEQ ok needtls: + // skip TLS setup on Plan 9 + CMPL runtime·isplan9(SB), $1 + JEQ ok + LEAQ runtime·tls0(SB), DI CALL runtime·settls(SB) @@ -64,7 +68,7 @@ ok: CALL runtime·schedinit(SB) // create a new goroutine to start program - PUSHQ $runtime·main(SB) // entry + PUSHQ $runtime·main·f(SB) // entry PUSHQ $0 // arg size CALL runtime·newproc(SB) POPQ AX @@ -76,6 +80,9 @@ ok: MOVL $0xf1, 0xf1 // crash RET +DATA runtime·main·f+0(SB)/8,$runtime·main(SB) +GLOBL runtime·main·f(SB),8,$8 + TEXT runtime·breakpoint(SB),7,$0 BYTE $0xcc RET @@ -114,22 +121,40 @@ TEXT runtime·gogo(SB), 7, $0 MOVQ gobuf_pc(BX), BX JMP BX -// void gogocall(Gobuf*, void (*fn)(void)) +// void gogocall(Gobuf*, void (*fn)(void), uintptr r0) // restore state from Gobuf but then call fn. 
// (call fn, returning to state in Gobuf) TEXT runtime·gogocall(SB), 7, $0 + MOVQ 24(SP), DX // context MOVQ 16(SP), AX // fn MOVQ 8(SP), BX // gobuf - MOVQ gobuf_g(BX), DX + MOVQ gobuf_g(BX), DI get_tls(CX) - MOVQ DX, g(CX) - MOVQ 0(DX), CX // make sure g != nil + MOVQ DI, g(CX) + MOVQ 0(DI), CX // make sure g != nil MOVQ gobuf_sp(BX), SP // restore SP MOVQ gobuf_pc(BX), BX PUSHQ BX JMP AX POPQ BX // not reached +// void gogocallfn(Gobuf*, FuncVal*) +// restore state from Gobuf but then call fn. +// (call fn, returning to state in Gobuf) +TEXT runtime·gogocallfn(SB), 7, $0 + MOVQ 16(SP), DX // fn + MOVQ 8(SP), BX // gobuf + MOVQ gobuf_g(BX), AX + get_tls(CX) + MOVQ AX, g(CX) + MOVQ 0(AX), CX // make sure g != nil + MOVQ gobuf_sp(BX), SP // restore SP + MOVQ gobuf_pc(BX), BX + PUSHQ BX + MOVQ 0(DX), BX + JMP BX + POPQ BX // not reached + // void mcall(void (*fn)(G*)) // Switch to m->g0's stack, call fn(g). // Fn must never return. It should gogo(&g->sched) @@ -171,6 +196,8 @@ TEXT runtime·morestack(SB),7,$0 CMPQ g(CX), SI JNE 2(PC) INT $3 + + MOVQ DX, m_cret(BX) // Called from f. // Set m->morebuf to f's caller. @@ -344,6 +371,30 @@ TEXT runtime·cas(SB), 7, $0 MOVL $1, AX RET +// bool runtime·cas64(uint64 *val, uint64 *old, uint64 new) +// Atomically: +// if(*val == *old){ +// *val = new; +// return 1; +// } else { +// *old = *val +// return 0; +// } +TEXT runtime·cas64(SB), 7, $0 + MOVQ 8(SP), BX + MOVQ 16(SP), BP + MOVQ 0(BP), AX + MOVQ 24(SP), CX + LOCK + CMPXCHGQ CX, 0(BX) + JNZ cas64_fail + MOVL $1, AX + RET +cas64_fail: + MOVQ AX, 0(BP) + MOVL $0, AX + RET + // bool casp(void **val, void *old, void *new) // Atomically: // if(*val == old){ @@ -376,6 +427,15 @@ TEXT runtime·xadd(SB), 7, $0 ADDL CX, AX RET +TEXT runtime·xadd64(SB), 7, $0 + MOVQ 8(SP), BX + MOVQ 16(SP), AX + MOVQ AX, CX + LOCK + XADDQ AX, 0(BX) + ADDQ CX, AX + RET + TEXT runtime·xchg(SB), 7, $0 MOVQ 8(SP), BX MOVL 16(SP), AX @@ -402,17 +462,24 @@ TEXT runtime·atomicstore(SB), 7, $0 XCHGL AX, 0(BX) RET +TEXT runtime·atomicstore64(SB), 7, $0 + MOVQ 8(SP), BX + MOVQ 16(SP), AX + XCHGQ AX, 0(BX) + RET + // void jmpdefer(fn, sp); // called from deferreturn. // 1. pop the caller // 2. sub 5 bytes from the callers return // 3. jmp to the argument TEXT runtime·jmpdefer(SB), 7, $0 - MOVQ 8(SP), AX // fn + MOVQ 8(SP), DX // fn MOVQ 16(SP), BX // caller sp LEAQ -8(BX), SP // caller sp after CALL SUBQ $5, (SP) // return to CALL again - JMP AX // but first run the deferred function + MOVQ 0(DX), BX + JMP BX // but first run the deferred function // Dummy function to use in saved gobuf.PC, // to match SP pointing at a return address. @@ -446,39 +513,67 @@ TEXT runtime·asmcgocall(SB),7,$0 MOVQ (g_sched+gobuf_sp)(SI), SP // Now on a scheduling stack (a pthread-created stack). - SUBQ $48, SP + // Make sure we have enough room for 4 stack-backed fast-call + // registers as per windows amd64 calling convention. + SUBQ $64, SP ANDQ $~15, SP // alignment for gcc ABI - MOVQ DI, 32(SP) // save g - MOVQ DX, 24(SP) // save SP + MOVQ DI, 48(SP) // save g + MOVQ DX, 40(SP) // save SP MOVQ BX, DI // DI = first argument in AMD64 ABI MOVQ BX, CX // CX = first argument in Win64 CALL AX // Restore registers, g, stack pointer. get_tls(CX) - MOVQ 32(SP), DI + MOVQ 48(SP), DI MOVQ DI, g(CX) - MOVQ 24(SP), SP + MOVQ 40(SP), SP RET // cgocallback(void (*fn)(void*), void *frame, uintptr framesize) -// See cgocall.c for more details. +// Turn the fn into a Go func (by taking its address) and call +// cgocallback_gofunc. 
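The cas64 contract spelled out in the comments above is compare-and-swap with write-back of the observed value on failure. A hedged Go sketch of that contract, plus the CAS retry loop that the 386 fallback in atomic_386.c (later in this diff) builds on top of it; the real implementations are assembly (CMPXCHG8B on 386, CMPXCHGQ on amd64):

```go
package main

import (
	"fmt"
	"sync/atomic"
)

// cas64 atomically installs new if *val still holds *old; otherwise it
// refreshes *old with the current value and reports failure.
func cas64(val *uint64, old *uint64, new uint64) bool {
	if atomic.CompareAndSwapUint64(val, *old, new) {
		return true
	}
	*old = atomic.LoadUint64(val)
	return false
}

// xadd64 mirrors the CAS retry loop in atomic_386.c: keep attempting
// until no other writer intervenes between the load and the swap.
func xadd64(addr *uint64, delta int64) uint64 {
	old := atomic.LoadUint64(addr)
	for !cas64(addr, &old, old+uint64(delta)) {
		// cas64 refreshed old; retry with the new observation.
	}
	return old + uint64(delta)
}

func main() {
	var v uint64 = 40
	fmt.Println(xadd64(&v, 2)) // 42
}
```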
TEXT runtime·cgocallback(SB),7,$24 - MOVQ fn+0(FP), AX - MOVQ frame+8(FP), BX - MOVQ framesize+16(FP), DX + LEAQ fn+0(FP), AX + MOVQ AX, 0(SP) + MOVQ frame+8(FP), AX + MOVQ AX, 8(SP) + MOVQ framesize+16(FP), AX + MOVQ AX, 16(SP) + MOVQ $runtime·cgocallback_gofunc(SB), AX + CALL AX + RET - // Save current m->g0->sched.sp on stack and then set it to SP. +// cgocallback_gofunc(FuncVal*, void *frame, uintptr framesize) +// See cgocall.c for more details. +TEXT runtime·cgocallback_gofunc(SB),7,$24 + // If m is nil, Go did not create the current thread. + // Call needm to obtain one for temporary use. + // In this case, we're running on the thread stack, so there's + // lots of space, but the linker doesn't know. Hide the call from + // the linker analysis by using an indirect call through AX. get_tls(CX) +#ifdef GOOS_windows + CMPQ CX, $0 + JNE 3(PC) + PUSHQ $0 + JMP needm +#endif MOVQ m(CX), BP - - // If m is nil, it is almost certainly because we have been called - // on a thread that Go did not create. We're going to crash as - // soon as we try to use m; instead, try to print a nice error and exit. + PUSHQ BP CMPQ BP, $0 - JNE 2(PC) - CALL runtime·badcallback(SB) + JNE havem +needm: + MOVQ $runtime·needm(SB), AX + CALL AX + get_tls(CX) + MOVQ m(CX), BP +havem: + // Now there's a valid m, and we're running on its m->g0. + // Save current m->g0->sched.sp on stack and then set it to SP. + // Save current sp in m->g0->sched.sp in preparation for + // switch back to m->curg stack. MOVQ m_g0(BP), SI PUSHQ (g_sched+gobuf_sp)(SI) MOVQ SP, (g_sched+gobuf_sp)(SI) @@ -497,6 +592,10 @@ TEXT runtime·cgocallback(SB),7,$24 // a frame size of 24, the same amount that we use below), // so that the traceback will seamlessly trace back into // the earlier calls. + MOVQ fn+0(FP), AX + MOVQ frame+8(FP), BX + MOVQ framesize+16(FP), DX + MOVQ m_curg(BP), SI MOVQ SI, g(CX) MOVQ (g_sched+gobuf_sp)(SI), DI // prepare stack as DI @@ -534,10 +633,37 @@ TEXT runtime·cgocallback(SB),7,$24 MOVQ SI, g(CX) MOVQ (g_sched+gobuf_sp)(SI), SP POPQ (g_sched+gobuf_sp)(SI) + + // If the m on entry was nil, we called needm above to borrow an m + // for the duration of the call. Since the call is over, return it with dropm. + POPQ BP + CMPQ BP, $0 + JNE 3(PC) + MOVQ $runtime·dropm(SB), AX + CALL AX // Done! RET +// void setmg(M*, G*); set m and g. for use by needm. +TEXT runtime·setmg(SB), 7, $0 + MOVQ mm+0(FP), AX +#ifdef GOOS_windows + CMPQ AX, $0 + JNE settls + MOVQ $0, 0x28(GS) + RET +settls: + LEAQ m_tls(AX), AX + MOVQ AX, 0x28(GS) +#endif + get_tls(CX) + MOVQ mm+0(FP), AX + MOVQ AX, m(CX) + MOVQ gg+8(FP), BX + MOVQ BX, g(CX) + RET + // check that SP is in range [g->stackbase, g->stackguard) TEXT runtime·stackcheck(SB), 7, $0 get_tls(CX) diff --git a/src/pkg/runtime/asm_arm.s b/src/pkg/runtime/asm_arm.s index 423fda7a0..45b53541b 100644 --- a/src/pkg/runtime/asm_arm.s +++ b/src/pkg/runtime/asm_arm.s @@ -31,6 +31,13 @@ TEXT _rt0_arm(SB),7,$-4 MOVW R13, g_stackbase(g) BL runtime·emptyfunc(SB) // fault if stack check is wrong + // if there is an _cgo_init, call it. 
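Stepping back from the assembly: cgocallback_gofunc on both 386 and amd64 pushes the incoming m, borrows one via needm when it is nil, and returns it through dropm on the way out. A schematic Go rendering of that borrow/return discipline (the names mirror the runtime's, but the pool and bookkeeping here are purely illustrative):

```go
package main

import "fmt"

type m struct{ id int }

// A toy pool standing in for the runtime's supply of extra ms.
var extram = []*m{{1}, {2}}

// needm borrows an m for a thread that Go did not create.
func needm() *m {
	mm := extram[len(extram)-1]
	extram = extram[:len(extram)-1]
	return mm
}

// dropm returns the borrowed m once the callback is over.
func dropm(mm *m) {
	extram = append(extram, mm)
}

// cgocallback sketches the PUSH/POP bookkeeping in the assembly: the
// value of m saved on entry decides whether dropm runs on exit.
func cgocallback(cur *m, fn func(*m)) {
	borrowed := cur == nil
	if borrowed {
		cur = needm()
	}
	fn(cur)
	if borrowed {
		dropm(cur)
	}
}

func main() {
	cgocallback(nil, func(mm *m) { fmt.Println("callback on m", mm.id) })
}
```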
+ MOVW _cgo_init(SB), R2 + CMP $0, R2 + MOVW.NE g, R0 // first argument of _cgo_init is g + BL.NE (R2) // will clobber R0-R3 + + BL runtime·checkgoarm(SB) BL runtime·check(SB) // saved argc, argv @@ -43,7 +50,7 @@ TEXT _rt0_arm(SB),7,$-4 BL runtime·schedinit(SB) // create a new goroutine to start program - MOVW $runtime·main(SB), R0 + MOVW $runtime·main·f(SB), R0 MOVW.W R0, -4(R13) MOVW $8, R0 MOVW.W R0, -4(R13) @@ -58,24 +65,25 @@ TEXT _rt0_arm(SB),7,$-4 MOVW $1234, R0 MOVW $1000, R1 MOVW R0, (R1) // fail hard - B runtime·_dep_dummy(SB) // Never reached - -// TODO(kaib): remove these once i actually understand how the linker removes symbols -// pull in dummy dependencies -TEXT runtime·_dep_dummy(SB),7,$0 - BL _div(SB) - BL _divu(SB) - BL _mod(SB) - BL _modu(SB) - BL _modu(SB) - BL _sfloat(SB) + +DATA runtime·main·f+0(SB)/4,$runtime·main(SB) +GLOBL runtime·main·f(SB),8,$4 TEXT runtime·breakpoint(SB),7,$0 - // no breakpoint yet; let program exit + // gdb won't skip this breakpoint instruction automatically, + // so you must manually "set $pc+=4" to skip it and continue. + WORD $0xe1200071 // BKPT 0x0001 RET +GLOBL runtime·goarm(SB), $4 TEXT runtime·asminit(SB),7,$0 - // No per-thread init. + // disable runfast (flush-to-zero) mode of vfp if runtime.goarm > 5 + MOVW runtime·goarm(SB), R11 + CMP $5, R11 + BLE 4(PC) + WORD $0xeef1ba10 // vmrs r11, fpscr + BIC $(1<<24), R11 + WORD $0xeee1ba10 // vmsr fpscr, r11 RET /* @@ -95,26 +103,49 @@ TEXT runtime·gosave(SB), 7, $-4 // restore state from Gobuf; longjmp TEXT runtime·gogo(SB), 7, $-4 MOVW 0(FP), R1 // gobuf - MOVW 4(FP), R0 // return 2nd arg MOVW gobuf_g(R1), g MOVW 0(g), R2 // make sure g != nil + MOVW _cgo_save_gm(SB), R2 + CMP $0, R2 // if in Cgo, we have to save g and m + BL.NE (R2) // this call will clobber R0 + MOVW 4(FP), R0 // return 2nd arg MOVW gobuf_sp(R1), SP // restore SP MOVW gobuf_pc(R1), PC -// void gogocall(Gobuf*, void (*fn)(void)) +// void gogocall(Gobuf*, void (*fn)(void), uintptr r7) // restore state from Gobuf but then call fn. // (call fn, returning to state in Gobuf) // using frame size $-4 means do not save LR on stack. TEXT runtime·gogocall(SB), 7, $-4 - MOVW 0(FP), R0 // gobuf + MOVW 0(FP), R3 // gobuf MOVW 4(FP), R1 // fn - MOVW 8(FP), R2 // fp offset - MOVW gobuf_g(R0), g - MOVW 0(g), R3 // make sure g != nil - MOVW gobuf_sp(R0), SP // restore SP - MOVW gobuf_pc(R0), LR + MOVW gobuf_g(R3), g + MOVW 0(g), R0 // make sure g != nil + MOVW _cgo_save_gm(SB), R0 + CMP $0, R0 // if in Cgo, we have to save g and m + BL.NE (R0) // this call will clobber R0 + MOVW 8(FP), R7 // context + MOVW gobuf_sp(R3), SP // restore SP + MOVW gobuf_pc(R3), LR MOVW R1, PC +// void gogocallfn(Gobuf*, FuncVal*) +// restore state from Gobuf but then call fn. +// (call fn, returning to state in Gobuf) +// using frame size $-4 means do not save LR on stack. +TEXT runtime·gogocallfn(SB), 7, $-4 + MOVW 0(FP), R3 // gobuf + MOVW 4(FP), R1 // fn + MOVW gobuf_g(R3), g + MOVW 0(g), R0 // make sure g != nil + MOVW _cgo_save_gm(SB), R0 + CMP $0, R0 // if in Cgo, we have to save g and m + BL.NE (R0) // this call will clobber R0 + MOVW gobuf_sp(R3), SP // restore SP + MOVW gobuf_pc(R3), LR + MOVW R1, R7 + MOVW 0(R1), PC + // void mcall(void (*fn)(G*)) // Switch to m->g0's stack, call fn(g). // Fn must never return. It should gogo(&g->sched) @@ -157,6 +188,7 @@ TEXT runtime·morestack(SB),7,$-4 BL.EQ runtime·abort(SB) // Save in m. 
+ MOVW R7, m_cret(m) // function context MOVW R1, m_moreframesize(m) MOVW R2, m_moreargsize(m) @@ -228,28 +260,175 @@ TEXT runtime·lessstack(SB), 7, $-4 TEXT runtime·jmpdefer(SB), 7, $0 MOVW 0(SP), LR MOVW $-4(LR), LR // BL deferreturn - MOVW fn+0(FP), R0 + MOVW fn+0(FP), R7 MOVW argp+4(FP), SP MOVW $-4(SP), SP // SP is 4 below argp, due to saved LR - B (R0) + MOVW 0(R7), R1 + B (R1) + +// Dummy function to use in saved gobuf.PC, +// to match SP pointing at a return address. +// The gobuf.PC is unused by the contortions here +// but setting it to return will make the traceback code work. +TEXT return<>(SB),7,$0 + RET +// asmcgocall(void(*fn)(void*), void *arg) +// Call fn(arg) on the scheduler stack, +// aligned appropriately for the gcc ABI. +// See cgocall.c for more details. TEXT runtime·asmcgocall(SB),7,$0 - B runtime·cgounimpl(SB) + MOVW fn+0(FP), R1 + MOVW arg+4(FP), R0 + MOVW R13, R2 + MOVW g, R5 + + // Figure out if we need to switch to m->g0 stack. + // We get called to create new OS threads too, and those + // come in on the m->g0 stack already. + MOVW m_g0(m), R3 + CMP R3, g + BEQ 7(PC) + MOVW R13, (g_sched + gobuf_sp)(g) + MOVW $return<>(SB), R4 + MOVW R4, (g_sched+gobuf_pc)(g) + MOVW g, (g_sched+gobuf_g)(g) + MOVW R3, g + MOVW (g_sched+gobuf_sp)(g), R13 + + // Now on a scheduling stack (a pthread-created stack). + SUB $24, R13 + BIC $0x7, R13 // alignment for gcc ABI + MOVW R5, 20(R13) // save old g + MOVW R2, 16(R13) // save old SP + // R0 already contains the first argument + BL (R1) + + // Restore registers, g, stack pointer. + MOVW 20(R13), g + MOVW 16(R13), R13 + RET -TEXT runtime·cgocallback(SB),7,$0 - B runtime·cgounimpl(SB) +// cgocallback(void (*fn)(void*), void *frame, uintptr framesize) +// Turn the fn into a Go func (by taking its address) and call +// cgocallback_gofunc. +TEXT runtime·cgocallback(SB),7,$12 + MOVW $fn+0(FP), R0 + MOVW R0, 4(R13) + MOVW frame+4(FP), R0 + MOVW R0, 8(R13) + MOVW framesize+8(FP), R0 + MOVW R0, 12(R13) + MOVW $runtime·cgocallback_gofunc(SB), R0 + BL (R0) + RET + +// cgocallback_gofunc(void (*fn)(void*), void *frame, uintptr framesize) +// See cgocall.c for more details. +TEXT runtime·cgocallback_gofunc(SB),7,$16 + // Load m and g from thread-local storage. + MOVW _cgo_load_gm(SB), R0 + CMP $0, R0 + BL.NE (R0) + + // If m is nil, Go did not create the current thread. + // Call needm to obtain one for temporary use. + // In this case, we're running on the thread stack, so there's + // lots of space, but the linker doesn't know. Hide the call from + // the linker analysis by using an indirect call. + MOVW m, savedm-16(SP) + CMP $0, m + B.NE havem + MOVW $runtime·needm(SB), R0 + BL (R0) + +havem: + // Now there's a valid m, and we're running on its m->g0. + // Save current m->g0->sched.sp on stack and then set it to SP. + // Save current sp in m->g0->sched.sp in preparation for + // switch back to m->curg stack. + MOVW fn+0(FP), R0 + MOVW frame+4(FP), R1 + MOVW framesize+8(FP), R2 + + MOVW m_g0(m), R3 + MOVW (g_sched+gobuf_sp)(R3), R4 + MOVW.W R4, -4(R13) + MOVW R13, (g_sched+gobuf_sp)(R3) + + // Switch to m->curg stack and call runtime.cgocallbackg + // with the three arguments. Because we are taking over + // the execution of m->curg but *not* resuming what had + // been running, we need to save that information (m->curg->gobuf) + // so that we can restore it when we're done. + // We can restore m->curg->gobuf.sp easily, because calling + // runtime.cgocallbackg leaves SP unchanged upon return. 
+ // To save m->curg->gobuf.pc, we push it onto the stack. + // This has the added benefit that it looks to the traceback + // routine like cgocallbackg is going to return to that + // PC (because we defined cgocallbackg to have + // a frame size of 16, the same amount that we use below), + // so that the traceback will seamlessly trace back into + // the earlier calls. + + // Save current m->g0->sched.sp on stack and then set it to SP. + MOVW m_curg(m), g + MOVW (g_sched+gobuf_sp)(g), R4 // prepare stack as R4 + + // Push gobuf.pc + MOVW (g_sched+gobuf_pc)(g), R5 + SUB $4, R4 + MOVW R5, 0(R4) + + // Push arguments to cgocallbackg. + // Frame size here must match the frame size above + // to trick traceback routines into doing the right thing. + SUB $16, R4 + MOVW R0, 4(R4) + MOVW R1, 8(R4) + MOVW R2, 12(R4) + + // Switch stack and make the call. + MOVW R4, R13 + BL runtime·cgocallbackg(SB) + + // Restore g->gobuf (== m->curg->gobuf) from saved values. + MOVW 16(R13), R5 + MOVW R5, (g_sched+gobuf_pc)(g) + ADD $(16+4), R13 // SP clobbered! It is ok! + MOVW R13, (g_sched+gobuf_sp)(g) + + // Switch back to m->g0's stack and restore m->g0->sched.sp. + // (Unlike m->curg, the g0 goroutine never uses sched.pc, + // so we do not have to restore it.) + MOVW m_g0(m), g + MOVW (g_sched+gobuf_sp)(g), R13 + // POP R6 + MOVW 0(R13), R6 + ADD $4, R13 + MOVW R6, (g_sched+gobuf_sp)(g) + + // If the m on entry was nil, we called needm above to borrow an m + // for the duration of the call. Since the call is over, return it with dropm. + MOVW savedm-16(SP), R6 + CMP $0, R6 + B.NE 3(PC) + MOVW $runtime·dropm(SB), R0 + BL (R0) + + // Done! + RET + +// void setmg(M*, G*); set m and g. for use by needm. +TEXT runtime·setmg(SB), 7, $-4 + MOVW mm+0(FP), m + MOVW gg+4(FP), g + + // Save m and g to thread-local storage. 
+ MOVW _cgo_save_gm(SB), R0 + CMP $0, R0 + BL.NE (R0) -TEXT runtime·memclr(SB),7,$20 - MOVW 0(FP), R0 - MOVW $0, R1 // c = 0 - MOVW R1, -16(SP) - MOVW 4(FP), R1 // n - MOVW R1, -12(SP) - MOVW m, -8(SP) // Save m and g - MOVW g, -4(SP) - BL runtime·memset(SB) - MOVW -8(SP), m // Restore m and g, memset clobbers them - MOVW -4(SP), g RET TEXT runtime·getcallerpc(SB),7,$-4 @@ -269,16 +448,6 @@ TEXT runtime·getcallersp(SB),7,$-4 TEXT runtime·emptyfunc(SB),0,$0 RET -// int64 runtime·cputicks(), so really -// void runtime·cputicks(int64 *ticks) -// stubbed: return int64(0) -TEXT runtime·cputicks(SB),7,$0 - MOVW 0(FP), R1 - MOVW $0, R0 - MOVW R0, 0(R1) - MOVW R0, 4(R1) - RET - TEXT runtime·abort(SB),7,$-4 MOVW $0, R0 MOVW (R0), R1 diff --git a/src/pkg/runtime/atomic_386.c b/src/pkg/runtime/atomic_386.c index a4f2a114f..79b7cbf96 100644 --- a/src/pkg/runtime/atomic_386.c +++ b/src/pkg/runtime/atomic_386.c @@ -17,3 +17,16 @@ runtime·atomicloadp(void* volatile* addr) { return *addr; } + +#pragma textflag 7 +uint64 +runtime·xadd64(uint64 volatile* addr, int64 v) +{ + uint64 old; + + old = *addr; + while(!runtime·cas64(addr, &old, old+v)) { + // nothing + } + return old+v; +} diff --git a/src/pkg/runtime/atomic_amd64.c b/src/pkg/runtime/atomic_amd64.c index a4f2a114f..e92d8ec21 100644 --- a/src/pkg/runtime/atomic_amd64.c +++ b/src/pkg/runtime/atomic_amd64.c @@ -12,6 +12,13 @@ runtime·atomicload(uint32 volatile* addr) } #pragma textflag 7 +uint64 +runtime·atomicload64(uint64 volatile* addr) +{ + return *addr; +} + +#pragma textflag 7 void* runtime·atomicloadp(void* volatile* addr) { diff --git a/src/pkg/runtime/atomic_arm.c b/src/pkg/runtime/atomic_arm.c index 52e4059ae..0b54840cc 100644 --- a/src/pkg/runtime/atomic_arm.c +++ b/src/pkg/runtime/atomic_arm.c @@ -3,6 +3,14 @@ // license that can be found in the LICENSE file. #include "runtime.h" +#include "arch_GOARCH.h" + +static union { + Lock l; + byte pad [CacheLineSize]; +} locktab[57]; + +#define LOCK(addr) (&locktab[((uintptr)(addr)>>3)%nelem(locktab)].l) // Atomic add and return new value. #pragma textflag 7 @@ -80,4 +88,56 @@ runtime·atomicstore(uint32 volatile* addr, uint32 v) if(runtime·cas(addr, old, v)) return; } -}
\ No newline at end of file +} + +#pragma textflag 7 +bool +runtime·cas64(uint64 volatile *addr, uint64 *old, uint64 new) +{ + bool res; + + runtime·lock(LOCK(addr)); + if(*addr == *old) { + *addr = new; + res = true; + } else { + *old = *addr; + res = false; + } + runtime·unlock(LOCK(addr)); + return res; +} + +#pragma textflag 7 +uint64 +runtime·xadd64(uint64 volatile *addr, int64 delta) +{ + uint64 res; + + runtime·lock(LOCK(addr)); + res = *addr + delta; + *addr = res; + runtime·unlock(LOCK(addr)); + return res; +} + +#pragma textflag 7 +uint64 +runtime·atomicload64(uint64 volatile *addr) +{ + uint64 res; + + runtime·lock(LOCK(addr)); + res = *addr; + runtime·unlock(LOCK(addr)); + return res; +} + +#pragma textflag 7 +void +runtime·atomicstore64(uint64 volatile *addr, uint64 v) +{ + runtime·lock(LOCK(addr)); + *addr = v; + runtime·unlock(LOCK(addr)); +} diff --git a/src/pkg/runtime/callback_windows_386.c b/src/pkg/runtime/callback_windows_386.c index fcd292fbc..880588da6 100644 --- a/src/pkg/runtime/callback_windows_386.c +++ b/src/pkg/runtime/callback_windows_386.c @@ -4,6 +4,7 @@ #include "runtime.h" #include "type.h" +#include "typekind.h" #include "defs_GOOS_GOARCH.h" #include "os_GOOS.h" @@ -79,7 +80,7 @@ runtime·compilecallback(Eface fn, bool cleanstack) // MOVL fn, AX *p++ = 0xb8; - *(uint32*)p = (uint32)fn.data; + *(uint32*)p = (uint32)(fn.data); p += 4; // MOVL argsize, DX diff --git a/src/pkg/runtime/callback_windows_amd64.c b/src/pkg/runtime/callback_windows_amd64.c index 99d7cb9e3..1a4779291 100644 --- a/src/pkg/runtime/callback_windows_amd64.c +++ b/src/pkg/runtime/callback_windows_amd64.c @@ -4,6 +4,7 @@ #include "runtime.h" #include "type.h" +#include "typekind.h" #include "defs_GOOS_GOARCH.h" #include "os_GOOS.h" @@ -77,7 +78,7 @@ runtime·compilecallback(Eface fn, bool /*cleanstack*/) // MOVQ fn, AX *p++ = 0x48; *p++ = 0xb8; - *(uint64*)p = (uint64)fn.data; + *(uint64*)p = (uint64)(fn.data); p += 8; // PUSH AX *p++ = 0x50; diff --git a/src/pkg/runtime/cgo/asm_386.s b/src/pkg/runtime/cgo/asm_386.s new file mode 100644 index 000000000..7faaa4097 --- /dev/null +++ b/src/pkg/runtime/cgo/asm_386.s @@ -0,0 +1,29 @@ +// Copyright 2009 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +/* + * void crosscall2(void (*fn)(void*, int32), void*, int32) + * Save registers and call fn with two arguments. + */ +TEXT crosscall2(SB),7,$0 + PUSHL BP + MOVL SP, BP + PUSHL BX + PUSHL SI + PUSHL DI + + SUBL $8, SP + MOVL 16(BP), AX + MOVL AX, 4(SP) + MOVL 12(BP), AX + MOVL AX, 0(SP) + MOVL 8(BP), AX + CALL AX + ADDL $8, SP + + POPL DI + POPL SI + POPL BX + POPL BP + RET diff --git a/src/pkg/runtime/cgo/asm_amd64.s b/src/pkg/runtime/cgo/asm_amd64.s new file mode 100644 index 000000000..53f7148a2 --- /dev/null +++ b/src/pkg/runtime/cgo/asm_amd64.s @@ -0,0 +1,45 @@ +// Copyright 2009 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +/* + * void crosscall2(void (*fn)(void*, int32), void*, int32) + * Save registers and call fn with two arguments. 
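Worth pausing on the atomic_arm.c additions just above: pre-v6k ARM has no 64-bit atomic instructions, so the runtime stripes a small table of cache-line-padded locks by address and takes the matching lock around each 64-bit operation. A Go sketch of the same striping (the table size and the 3-bit shift mirror the C, everything else is illustrative):

```go
package main

import (
	"fmt"
	"sync"
	"unsafe"
)

const cacheLineSize = 32 // CacheLineSize from arch_arm.h

// One lock per stripe, padded so neighbouring stripes do not share a
// cache line (the union-with-pad trick in the C code).
type stripe struct {
	mu sync.Mutex
	_  [cacheLineSize]byte
}

var locktab [57]stripe

// lockFor mirrors the LOCK(addr) macro: drop the low 3 bits, so every
// byte of one 8-byte word maps to the same stripe, then index modulo
// the table size.
func lockFor(addr *uint64) *sync.Mutex {
	i := (uintptr(unsafe.Pointer(addr)) >> 3) % uintptr(len(locktab))
	return &locktab[i].mu
}

// atomicLoad64 is the lock-based emulation of a 64-bit atomic load.
func atomicLoad64(addr *uint64) uint64 {
	l := lockFor(addr)
	l.Lock()
	defer l.Unlock()
	return *addr
}

func main() {
	var v uint64 = 7
	fmt.Println(atomicLoad64(&v))
}
```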
+ */ +TEXT crosscall2(SB),7,$0 + SUBQ $0x58, SP /* keeps stack pointer 32-byte aligned */ + MOVQ BX, 0x10(SP) + MOVQ BP, 0x18(SP) + MOVQ R12, 0x20(SP) + MOVQ R13, 0x28(SP) + MOVQ R14, 0x30(SP) + MOVQ R15, 0x38(SP) + +#ifdef GOOS_windows + // Win64 save RBX, RBP, RDI, RSI, RSP, R12, R13, R14, and R15 + MOVQ DI, 0x40(SP) + MOVQ SI, 0x48(SP) + + MOVQ DX, 0(SP) /* arg */ + MOVQ R8, 8(SP) /* argsize (includes padding) */ + + CALL CX /* fn */ + + MOVQ 0x40(SP), DI + MOVQ 0x48(SP), SI +#else + MOVQ SI, 0(SP) /* arg */ + MOVQ DX, 8(SP) /* argsize (includes padding) */ + + CALL DI /* fn */ +#endif + + MOVQ 0x10(SP), BX + MOVQ 0x18(SP), BP + MOVQ 0x20(SP), R12 + MOVQ 0x28(SP), R13 + MOVQ 0x30(SP), R14 + MOVQ 0x38(SP), R15 + + ADDQ $0x58, SP + RET diff --git a/src/pkg/runtime/cgo/asm_arm.s b/src/pkg/runtime/cgo/asm_arm.s new file mode 100644 index 000000000..a6ea0dc07 --- /dev/null +++ b/src/pkg/runtime/cgo/asm_arm.s @@ -0,0 +1,23 @@ +// Copyright 2012 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +/* + * void crosscall2(void (*fn)(void*, int32), void*, int32) + * Save registers and call fn with two arguments. + */ +TEXT crosscall2(SB),7,$-4 + /* + * We still need to save all callee save register as before, and then + * push 2 args for fn (R1 and R2). + * Also note that at procedure entry in 5c/5g world, 4(R13) will be the + * first arg, so we must push another dummy reg (R0) for 0(R13). + * Additionally, cgo_tls_set_gm will clobber R0, so we need to save R0 + * nevertheless. + */ + MOVM.WP [R0, R1, R2, R4, R5, R6, R7, R8, R9, R10, R11, R12, R14], (R13) + MOVW _cgo_load_gm(SB), R0 + BL (R0) + MOVW PC, R14 + MOVW 0(R13), PC + MOVM.IAW (R13), [R0, R1, R2, R4, R5, R6, R7, R8, R9, R10, R11, R12, PC] diff --git a/src/pkg/runtime/cgo/callbacks.c b/src/pkg/runtime/cgo/callbacks.c index f36fb3fd7..51bd529ec 100644 --- a/src/pkg/runtime/cgo/callbacks.c +++ b/src/pkg/runtime/cgo/callbacks.c @@ -33,7 +33,13 @@ static void _cgo_allocate_internal(uintptr len, byte *ret) { + CgoMal *c; + ret = runtime·mal(len); + c = runtime·mal(sizeof(*c)); + c->next = m->cgomal; + c->alloc = ret; + m->cgomal = c; FLUSH(&ret); } @@ -71,3 +77,19 @@ _cgo_panic(void *a, int32 n) { runtime·cgocallback((void(*)(void))_cgo_panic_internal, a, n); } + +#pragma cgo_import_static x_cgo_init +extern void x_cgo_init(G*); +void (*_cgo_init)(G*) = x_cgo_init; + +#pragma cgo_import_static x_cgo_malloc +extern void x_cgo_malloc(void*); +void (*_cgo_malloc)(void*) = x_cgo_malloc; + +#pragma cgo_import_static x_cgo_free +extern void x_cgo_free(void*); +void (*_cgo_free)(void*) = x_cgo_free; + +#pragma cgo_import_static x_cgo_thread_start +extern void x_cgo_thread_start(void*); +void (*_cgo_thread_start)(void*) = x_cgo_thread_start; diff --git a/src/pkg/runtime/cgo/cgo.go b/src/pkg/runtime/cgo/cgo.go index 414f3da36..e0d538668 100644 --- a/src/pkg/runtime/cgo/cgo.go +++ b/src/pkg/runtime/cgo/cgo.go @@ -18,6 +18,8 @@ package cgo #cgo openbsd LDFLAGS: -lpthread #cgo windows LDFLAGS: -lm -mthreads +#cgo CFLAGS: -Wall -Werror + */ import "C" diff --git a/src/pkg/runtime/cgo/cgo_arm.c b/src/pkg/runtime/cgo/cgo_arm.c new file mode 100644 index 000000000..d23f53e77 --- /dev/null +++ b/src/pkg/runtime/cgo/cgo_arm.c @@ -0,0 +1,12 @@ +// Copyright 2013 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. 
+ +#pragma cgo_import_static x_cgo_load_gm +extern void x_cgo_load_gm(void); +void (*_cgo_load_gm)(void) = x_cgo_load_gm; + +#pragma cgo_import_static x_cgo_save_gm +extern void x_cgo_save_gm(void); +void (*_cgo_save_gm)(void) = x_cgo_save_gm; + diff --git a/src/pkg/runtime/cgo/gcc_386.S b/src/pkg/runtime/cgo/gcc_386.S index 9abab7ebd..94ba5842f 100644 --- a/src/pkg/runtime/cgo/gcc_386.S +++ b/src/pkg/runtime/cgo/gcc_386.S @@ -35,31 +35,6 @@ EXT(crosscall_386): popl %ebp ret -/* - * void crosscall2(void (*fn)(void*, int32), void*, int32) - * - * Save registers and call fn with two arguments. - */ -.globl EXT(crosscall2) -EXT(crosscall2): - pushl %ebp - movl %esp, %ebp - pushl %ebx - pushl %esi - pushl %edi - - pushl 16(%ebp) - pushl 12(%ebp) - mov 8(%ebp), %eax - call *%eax - addl $8,%esp - - popl %edi - popl %esi - popl %ebx - popl %ebp - ret - .globl EXT(__stack_chk_fail_local) EXT(__stack_chk_fail_local): 1: diff --git a/src/pkg/runtime/cgo/gcc_amd64.S b/src/pkg/runtime/cgo/gcc_amd64.S index 706ee6b58..81b270195 100644 --- a/src/pkg/runtime/cgo/gcc_amd64.S +++ b/src/pkg/runtime/cgo/gcc_amd64.S @@ -19,9 +19,6 @@ * are callee-save so they must be saved explicitly. * The standard x86-64 ABI passes the three arguments m, g, fn * in %rdi, %rsi, %rdx. - * - * Also need to set %r15 to g and %r14 to m (see ../pkg/runtime/mkasmh.sh) - * during the call. */ .globl EXT(crosscall_amd64) EXT(crosscall_amd64): @@ -45,48 +42,3 @@ EXT(crosscall_amd64): popq %rbp popq %rbx ret - -/* - * void crosscall2(void (*fn)(void*, int32), void *arg, int32 argsize) - * - * Save registers and call fn with two arguments. fn is a Go function - * which takes parameters on the stack rather than in registers. - */ -.globl EXT(crosscall2) -EXT(crosscall2): - subq $0x58, %rsp /* keeps stack pointer 32-byte aligned */ - movq %rbx, 0x10(%rsp) - movq %rbp, 0x18(%rsp) - movq %r12, 0x20(%rsp) - movq %r13, 0x28(%rsp) - movq %r14, 0x30(%rsp) - movq %r15, 0x38(%rsp) - -#if defined(_WIN64) - // Win64 save RBX, RBP, RDI, RSI, RSP, R12, R13, R14, and R15 - movq %rdi, 0x40(%rsp) - movq %rsi, 0x48(%rsp) - - movq %rdx, 0(%rsp) /* arg */ - movq %r8, 8(%rsp) /* argsize (includes padding) */ - - call *%rcx /* fn */ -#else - movq %rsi, 0(%rsp) /* arg */ - movq %rdx, 8(%rsp) /* argsize (includes padding) */ - - call *%rdi /* fn */ -#endif - - movq 0x10(%rsp), %rbx - movq 0x18(%rsp), %rbp - movq 0x20(%rsp), %r12 - movq 0x28(%rsp), %r13 - movq 0x30(%rsp), %r14 - movq 0x38(%rsp), %r15 -#if defined(__WIN64) - movq 0x40(%rsp), %rdi - movq 0x48(%rsp), %rsi -#endif - addq $0x58, %rsp - ret diff --git a/src/pkg/runtime/cgo/gcc_arm.S b/src/pkg/runtime/cgo/gcc_arm.S index 32d862984..809fcb9a0 100644 --- a/src/pkg/runtime/cgo/gcc_arm.S +++ b/src/pkg/runtime/cgo/gcc_arm.S @@ -1 +1,36 @@ -/* unimplemented */ +// Copyright 2012 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +/* + * Apple still insists on underscore prefixes for C function names. + */ +#if defined(__APPLE__) +#define EXT(s) _##s +#else +#define EXT(s) s +#endif + +/* + * void crosscall_arm2(void (*fn)(void), void *g, void *m) + * + * Calling into the 5c tool chain, where all registers are caller save. + * Called from standard ARM EABI, where r4-r11 are callee-save, so they + * must be saved explicitly. 
+ */ +.globl EXT(crosscall_arm2) +EXT(crosscall_arm2): + push {r4, r5, r6, r7, r8, r9, r10, r11, ip, lr} + mov r10, r1 // g + mov r9, r2 // m + mov r3, r0 // save r0, cgo_tls_set_gm will clobber it + bl EXT(x_cgo_save_gm) // save current g and m into TLS variable + mov lr, pc + mov pc, r3 + pop {r4, r5, r6, r7, r8, r9, r10, r11, ip, pc} + +.globl EXT(__stack_chk_fail_local) +EXT(__stack_chk_fail_local): +1: + b 1b + diff --git a/src/pkg/runtime/cgo/gcc_darwin_386.c b/src/pkg/runtime/cgo/gcc_darwin_386.c index 2c30c666f..ad9fb5abf 100644 --- a/src/pkg/runtime/cgo/gcc_darwin_386.c +++ b/src/pkg/runtime/cgo/gcc_darwin_386.c @@ -101,8 +101,8 @@ inittls(void) pthread_key_delete(tofree[i]); } -static void -xinitcgo(G *g) +void +x_cgo_init(G *g) { pthread_attr_t attr; size_t size; @@ -115,10 +115,9 @@ xinitcgo(G *g) inittls(); } -void (*initcgo)(G*) = xinitcgo; void -libcgo_sys_thread_start(ThreadStart *ts) +_cgo_sys_thread_start(ThreadStart *ts) { pthread_attr_t attr; sigset_t ign, oset; @@ -127,14 +126,14 @@ libcgo_sys_thread_start(ThreadStart *ts) int err; sigfillset(&ign); - sigprocmask(SIG_SETMASK, &ign, &oset); + pthread_sigmask(SIG_SETMASK, &ign, &oset); pthread_attr_init(&attr); pthread_attr_getstacksize(&attr, &size); ts->g->stackguard = size; err = pthread_create(&p, &attr, threadentry, ts); - sigprocmask(SIG_SETMASK, &oset, nil); + pthread_sigmask(SIG_SETMASK, &oset, nil); if (err != 0) { fprintf(stderr, "runtime/cgo: pthread_create failed: %s\n", strerror(err)); @@ -153,7 +152,7 @@ threadentry(void *v) ts.g->stackbase = (uintptr)&ts; /* - * libcgo_sys_thread_start set stackguard to stack size; + * _cgo_sys_thread_start set stackguard to stack size; * change to actual guard pointer. */ ts.g->stackguard = (uintptr)&ts - ts.g->stackguard + 4096; diff --git a/src/pkg/runtime/cgo/gcc_darwin_amd64.c b/src/pkg/runtime/cgo/gcc_darwin_amd64.c index 89dc7a4e8..65d381633 100644 --- a/src/pkg/runtime/cgo/gcc_darwin_amd64.c +++ b/src/pkg/runtime/cgo/gcc_darwin_amd64.c @@ -72,7 +72,7 @@ inittls(void) } void -xinitcgo(G *g) +x_cgo_init(G *g) { pthread_attr_t attr; size_t size; @@ -85,10 +85,9 @@ xinitcgo(G *g) inittls(); } -void (*initcgo)(G*) = xinitcgo; void -libcgo_sys_thread_start(ThreadStart *ts) +_cgo_sys_thread_start(ThreadStart *ts) { pthread_attr_t attr; sigset_t ign, oset; @@ -97,14 +96,14 @@ libcgo_sys_thread_start(ThreadStart *ts) int err; sigfillset(&ign); - sigprocmask(SIG_SETMASK, &ign, &oset); + pthread_sigmask(SIG_SETMASK, &ign, &oset); pthread_attr_init(&attr); pthread_attr_getstacksize(&attr, &size); ts->g->stackguard = size; err = pthread_create(&p, &attr, threadentry, ts); - sigprocmask(SIG_SETMASK, &oset, nil); + pthread_sigmask(SIG_SETMASK, &oset, nil); if (err != 0) { fprintf(stderr, "runtime/cgo: pthread_create failed: %s\n", strerror(err)); @@ -123,7 +122,7 @@ threadentry(void *v) ts.g->stackbase = (uintptr)&ts; /* - * libcgo_sys_thread_start set stackguard to stack size; + * _cgo_sys_thread_start set stackguard to stack size; * change to actual guard pointer. 
*/ ts.g->stackguard = (uintptr)&ts - ts.g->stackguard + 4096; diff --git a/src/pkg/runtime/cgo/gcc_freebsd_386.c b/src/pkg/runtime/cgo/gcc_freebsd_386.c index 2c97e2a33..7c62a1bc4 100644 --- a/src/pkg/runtime/cgo/gcc_freebsd_386.c +++ b/src/pkg/runtime/cgo/gcc_freebsd_386.c @@ -6,12 +6,13 @@ #include <sys/signalvar.h> #include <pthread.h> #include <signal.h> +#include <string.h> #include "libcgo.h" static void* threadentry(void*); -static void -xinitcgo(G *g) +void +x_cgo_init(G *g) { pthread_attr_t attr; size_t size; @@ -22,10 +23,9 @@ xinitcgo(G *g) pthread_attr_destroy(&attr); } -void (*initcgo)(G*) = xinitcgo; void -libcgo_sys_thread_start(ThreadStart *ts) +_cgo_sys_thread_start(ThreadStart *ts) { pthread_attr_t attr; sigset_t ign, oset; @@ -60,7 +60,7 @@ threadentry(void *v) ts.g->stackbase = (uintptr)&ts; /* - * libcgo_sys_thread_start set stackguard to stack size; + * _cgo_sys_thread_start set stackguard to stack size; * change to actual guard pointer. */ ts.g->stackguard = (uintptr)&ts - ts.g->stackguard + 4096; diff --git a/src/pkg/runtime/cgo/gcc_freebsd_amd64.c b/src/pkg/runtime/cgo/gcc_freebsd_amd64.c index 3beb4d7bb..6be8bd251 100644 --- a/src/pkg/runtime/cgo/gcc_freebsd_amd64.c +++ b/src/pkg/runtime/cgo/gcc_freebsd_amd64.c @@ -6,12 +6,13 @@ #include <sys/signalvar.h> #include <pthread.h> #include <signal.h> +#include <string.h> #include "libcgo.h" static void* threadentry(void*); -static void -xinitcgo(G *g) +void +x_cgo_init(G *g) { pthread_attr_t attr; size_t size; @@ -22,10 +23,9 @@ xinitcgo(G *g) pthread_attr_destroy(&attr); } -void (*initcgo)(G*) = xinitcgo; void -libcgo_sys_thread_start(ThreadStart *ts) +_cgo_sys_thread_start(ThreadStart *ts) { pthread_attr_t attr; sigset_t ign, oset; @@ -61,7 +61,7 @@ threadentry(void *v) ts.g->stackbase = (uintptr)&ts; /* - * libcgo_sys_thread_start set stackguard to stack size; + * _cgo_sys_thread_start set stackguard to stack size; * change to actual guard pointer. */ ts.g->stackguard = (uintptr)&ts - ts.g->stackguard + 4096; diff --git a/src/pkg/runtime/cgo/gcc_freebsd_arm.c b/src/pkg/runtime/cgo/gcc_freebsd_arm.c new file mode 100644 index 000000000..3bcb0b270 --- /dev/null +++ b/src/pkg/runtime/cgo/gcc_freebsd_arm.c @@ -0,0 +1,114 @@ +// Copyright 2012 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#include <pthread.h> +#include <string.h> +#include "libcgo.h" + +static void *threadentry(void*); + +// We have to resort to TLS variable to save g(R10) and +// m(R9). One reason is that external code might trigger +// SIGSEGV, and our runtime.sigtramp don't even know we +// are in external code, and will continue to use R10/R9, +// this might as well result in another SIGSEGV. +// Note: all three functions will clobber R0, and the last +// two can be called from 5c ABI code. 
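A recurring idiom across all of these _cgo_sys_thread_start/threadentry pairs: stackguard temporarily carries the stack *size* across pthread_create, and the new thread rewrites it into an actual guard *pointer* relative to its own stack. The arithmetic, sketched in Go (the page size matches the C; addresses are illustrative):

```go
package main

import "fmt"

const page = 4096

// guardFromSize mirrors `(uintptr)&ts - ts.g->stackguard + 4096`:
// given an address near the top of the new stack and the stack size
// that was stashed in stackguard, compute the low-water guard pointer.
// The ARM ports leave two pages of headroom instead of one.
func guardFromSize(stackTop, size, pages uintptr) uintptr {
	return stackTop - size + pages*page
}

func main() {
	top := uintptr(0x7fff0000)
	fmt.Printf("x86 guard: %#x\n", guardFromSize(top, 64*1024, 1))
	fmt.Printf("arm guard: %#x\n", guardFromSize(top, 64*1024, 2))
}
```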
+void __aeabi_read_tp(void) __attribute__((naked)); +void x_cgo_save_gm(void) __attribute__((naked)); +void x_cgo_load_gm(void) __attribute__((naked)); + +void +__aeabi_read_tp(void) +{ + // read @ 0xffff1000 + __asm__ __volatile__ ( + "ldr r0, =0xffff1000\n\t" + "ldr r0, [r0]\n\t" + "mov pc, lr\n\t" + ); +} + +// g (R10) at 8(TP), m (R9) at 12(TP) +void +x_cgo_load_gm(void) +{ + __asm__ __volatile__ ( + "push {lr}\n\t" + "bl __aeabi_read_tp\n\t" + "ldr r10, [r0, #8]\n\t" + "ldr r9, [r0, #12]\n\t" + "pop {pc}\n\t" + ); +} + +void +x_cgo_save_gm(void) +{ + __asm__ __volatile__ ( + "push {lr}\n\t" + "bl __aeabi_read_tp\n\t" + "str r10, [r0, #8]\n\t" + "str r9, [r0, #12]\n\t" + "pop {pc}\n\t" + ); +} + +void +x_cgo_init(G *g) +{ + pthread_attr_t attr; + size_t size; + x_cgo_save_gm(); // save g and m for the initial thread + + pthread_attr_init(&attr); + pthread_attr_getstacksize(&attr, &size); + g->stackguard = (uintptr)&attr - size + 4096; + pthread_attr_destroy(&attr); +} + + +void +_cgo_sys_thread_start(ThreadStart *ts) +{ + pthread_attr_t attr; + pthread_t p; + size_t size; + int err; + + // Not sure why the memset is necessary here, + // but without it, we get a bogus stack size + // out of pthread_attr_getstacksize. C'est la Linux. + memset(&attr, 0, sizeof attr); + pthread_attr_init(&attr); + size = 0; + pthread_attr_getstacksize(&attr, &size); + ts->g->stackguard = size; + err = pthread_create(&p, &attr, threadentry, ts); + if (err != 0) { + fprintf(stderr, "runtime/cgo: pthread_create failed: %s\n", strerror(err)); + abort(); + } +} + +extern void crosscall_arm2(void (*fn)(void), void *g, void *m); +static void* +threadentry(void *v) +{ + ThreadStart ts; + + ts = *(ThreadStart*)v; + free(v); + + ts.g->stackbase = (uintptr)&ts; + + /* + * _cgo_sys_thread_start set stackguard to stack size; + * change to actual guard pointer. + */ + ts.g->stackguard = (uintptr)&ts - ts.g->stackguard + 4096 * 2; + + crosscall_arm2(ts.fn, (void *)ts.g, (void *)ts.m); + return nil; +} diff --git a/src/pkg/runtime/cgo/gcc_linux_386.c b/src/pkg/runtime/cgo/gcc_linux_386.c index 7d84acc11..9357a63f7 100644 --- a/src/pkg/runtime/cgo/gcc_linux_386.c +++ b/src/pkg/runtime/cgo/gcc_linux_386.c @@ -9,8 +9,8 @@ static void *threadentry(void*); -static void -xinitcgo(G *g) +void +x_cgo_init(G *g) { pthread_attr_t attr; size_t size; @@ -21,10 +21,9 @@ xinitcgo(G *g) pthread_attr_destroy(&attr); } -void (*initcgo)(G*) = xinitcgo; void -libcgo_sys_thread_start(ThreadStart *ts) +_cgo_sys_thread_start(ThreadStart *ts) { pthread_attr_t attr; sigset_t ign, oset; @@ -64,7 +63,7 @@ threadentry(void *v) ts.g->stackbase = (uintptr)&ts; /* - * libcgo_sys_thread_start set stackguard to stack size; + * _cgo_sys_thread_start set stackguard to stack size; * change to actual guard pointer. 
*/ ts.g->stackguard = (uintptr)&ts - ts.g->stackguard + 4096; diff --git a/src/pkg/runtime/cgo/gcc_linux_amd64.c b/src/pkg/runtime/cgo/gcc_linux_amd64.c index 28cbf78c5..bc76117d3 100644 --- a/src/pkg/runtime/cgo/gcc_linux_amd64.c +++ b/src/pkg/runtime/cgo/gcc_linux_amd64.c @@ -10,7 +10,7 @@ static void* threadentry(void*); void -xinitcgo(G* g) +x_cgo_init(G* g) { pthread_attr_t attr; size_t size; @@ -21,10 +21,9 @@ xinitcgo(G* g) pthread_attr_destroy(&attr); } -void (*initcgo)(G*) = xinitcgo; void -libcgo_sys_thread_start(ThreadStart *ts) +_cgo_sys_thread_start(ThreadStart *ts) { pthread_attr_t attr; sigset_t ign, oset; @@ -59,7 +58,7 @@ threadentry(void *v) ts.g->stackbase = (uintptr)&ts; /* - * libcgo_sys_thread_start set stackguard to stack size; + * _cgo_sys_thread_start set stackguard to stack size; * change to actual guard pointer. */ ts.g->stackguard = (uintptr)&ts - ts.g->stackguard + 4096; diff --git a/src/pkg/runtime/cgo/gcc_linux_arm.c b/src/pkg/runtime/cgo/gcc_linux_arm.c index 8397c75bb..46a1126ad 100644 --- a/src/pkg/runtime/cgo/gcc_linux_arm.c +++ b/src/pkg/runtime/cgo/gcc_linux_arm.c @@ -2,19 +2,113 @@ // Use of this source code is governed by a BSD-style // license that can be found in the LICENSE file. +#include <pthread.h> +#include <string.h> #include "libcgo.h" -static void -xinitcgo(G *g) +static void *threadentry(void*); + +// We have to resort to TLS variable to save g(R10) and +// m(R9). One reason is that external code might trigger +// SIGSEGV, and our runtime.sigtramp don't even know we +// are in external code, and will continue to use R10/R9, +// this might as well result in another SIGSEGV. +// Note: all three functions will clobber R0, and the last +// two can be called from 5c ABI code. +void __aeabi_read_tp(void) __attribute__((naked)); +void x_cgo_save_gm(void) __attribute__((naked)); +void x_cgo_load_gm(void) __attribute__((naked)); + +void +__aeabi_read_tp(void) +{ + // b __kuser_get_tls @ 0xffff0fe0 + __asm__ __volatile__ ( + "mvn r0, #0xf000\n\t" + "sub pc, r0, #31\n\t" + "nop\n\tnop\n\t" + ); +} + +// g (R10) at 8(TP), m (R9) at 12(TP) +void +x_cgo_load_gm(void) +{ + __asm__ __volatile__ ( + "push {lr}\n\t" + "bl __aeabi_read_tp\n\t" + "ldr r10, [r0, #8]\n\t" + "ldr r9, [r0, #12]\n\t" + "pop {pc}\n\t" + ); +} + +void +x_cgo_save_gm(void) +{ + __asm__ __volatile__ ( + "push {lr}\n\t" + "bl __aeabi_read_tp\n\t" + "str r10, [r0, #8]\n\t" + "str r9, [r0, #12]\n\t" + "pop {pc}\n\t" + ); +} + +void +x_cgo_init(G *g) { - // unimplemented + pthread_attr_t attr; + size_t size; + x_cgo_save_gm(); // save g and m for the initial thread + + pthread_attr_init(&attr); + pthread_attr_getstacksize(&attr, &size); + g->stackguard = (uintptr)&attr - size + 4096; + pthread_attr_destroy(&attr); } -void (*initcgo)(G*) = xinitcgo; void -libcgo_sys_thread_start(ThreadStart *ts) +_cgo_sys_thread_start(ThreadStart *ts) +{ + pthread_attr_t attr; + pthread_t p; + size_t size; + int err; + + // Not sure why the memset is necessary here, + // but without it, we get a bogus stack size + // out of pthread_attr_getstacksize. C'est la Linux. 
+ memset(&attr, 0, sizeof attr); + pthread_attr_init(&attr); + size = 0; + pthread_attr_getstacksize(&attr, &size); + ts->g->stackguard = size; + err = pthread_create(&p, &attr, threadentry, ts); + if (err != 0) { + fprintf(stderr, "runtime/cgo: pthread_create failed: %s\n", strerror(err)); + abort(); + } +} + +extern void crosscall_arm2(void (*fn)(void), void *g, void *m); +static void* +threadentry(void *v) { - // unimplemented - *(int*)0 = 0; + ThreadStart ts; + + ts = *(ThreadStart*)v; + free(v); + + ts.g->stackbase = (uintptr)&ts; + + /* + * _cgo_sys_thread_start set stackguard to stack size; + * change to actual guard pointer. + */ + ts.g->stackguard = (uintptr)&ts - ts.g->stackguard + 4096 * 2; + + crosscall_arm2(ts.fn, (void *)ts.g, (void *)ts.m); + return nil; } diff --git a/src/pkg/runtime/cgo/gcc_netbsd_386.c b/src/pkg/runtime/cgo/gcc_netbsd_386.c new file mode 100644 index 000000000..09b271df4 --- /dev/null +++ b/src/pkg/runtime/cgo/gcc_netbsd_386.c @@ -0,0 +1,80 @@ +// Copyright 2009 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#include <sys/types.h> +#include <pthread.h> +#include <signal.h> +#include <string.h> +#include "libcgo.h" + +static void* threadentry(void*); + +void +x_cgo_init(G *g) +{ + pthread_attr_t attr; + size_t size; + + pthread_attr_init(&attr); + pthread_attr_getstacksize(&attr, &size); + g->stackguard = (uintptr)&attr - size + 4096; + pthread_attr_destroy(&attr); +} + + +void +_cgo_sys_thread_start(ThreadStart *ts) +{ + pthread_attr_t attr; + sigset_t ign, oset; + pthread_t p; + size_t size; + int err; + + sigfillset(&ign); + sigprocmask(SIG_SETMASK, &ign, &oset); + + pthread_attr_init(&attr); + pthread_attr_getstacksize(&attr, &size); + ts->g->stackguard = size; + err = pthread_create(&p, &attr, threadentry, ts); + + sigprocmask(SIG_SETMASK, &oset, nil); + + if (err != 0) { + fprintf(stderr, "runtime/cgo: pthread_create failed: %s\n", strerror(err)); + abort(); + } +} + +static void* +threadentry(void *v) +{ + ThreadStart ts; + + ts = *(ThreadStart*)v; + free(v); + + ts.g->stackbase = (uintptr)&ts; + + /* + * _cgo_sys_thread_start set stackguard to stack size; + * change to actual guard pointer. + */ + ts.g->stackguard = (uintptr)&ts - ts.g->stackguard + 4096; + + /* + * Set specific keys. On NetBSD/ELF, the thread local storage + * is just before %gs:0. Our dynamic 8.out's reserve 8 bytes + * for the two words g and m at %gs:-8 and %gs:-4. + */ + asm volatile ( + "movl %0, %%gs:-8\n" // MOVL g, -8(GS) + "movl %1, %%gs:-4\n" // MOVL m, -4(GS) + :: "r"(ts.g), "r"(ts.m) + ); + + crosscall_386(ts.fn); + return nil; +} diff --git a/src/pkg/runtime/cgo/gcc_netbsd_amd64.c b/src/pkg/runtime/cgo/gcc_netbsd_amd64.c new file mode 100644 index 000000000..080c59ba4 --- /dev/null +++ b/src/pkg/runtime/cgo/gcc_netbsd_amd64.c @@ -0,0 +1,80 @@ +// Copyright 2009 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. 
+ +#include <sys/types.h> +#include <pthread.h> +#include <signal.h> +#include <string.h> +#include "libcgo.h" + +static void* threadentry(void*); + +void +x_cgo_init(G *g) +{ + pthread_attr_t attr; + size_t size; + + pthread_attr_init(&attr); + pthread_attr_getstacksize(&attr, &size); + g->stackguard = (uintptr)&attr - size + 4096; + pthread_attr_destroy(&attr); +} + + +void +_cgo_sys_thread_start(ThreadStart *ts) +{ + pthread_attr_t attr; + sigset_t ign, oset; + pthread_t p; + size_t size; + int err; + + sigfillset(&ign); + sigprocmask(SIG_SETMASK, &ign, &oset); + + pthread_attr_init(&attr); + pthread_attr_getstacksize(&attr, &size); + + ts->g->stackguard = size; + err = pthread_create(&p, &attr, threadentry, ts); + + sigprocmask(SIG_SETMASK, &oset, nil); + + if (err != 0) { + fprintf(stderr, "runtime/cgo: pthread_create failed: %s\n", strerror(err)); + abort(); + } +} + +static void* +threadentry(void *v) +{ + ThreadStart ts; + + ts = *(ThreadStart*)v; + free(v); + + ts.g->stackbase = (uintptr)&ts; + + /* + * _cgo_sys_thread_start set stackguard to stack size; + * change to actual guard pointer. + */ + ts.g->stackguard = (uintptr)&ts - ts.g->stackguard + 4096; + + /* + * Set specific keys. On NetBSD/ELF, the thread local storage + * is just before %fs:0. Our dynamic 6.out's reserve 16 bytes + * for the two words g and m at %fs:-16 and %fs:-8. + */ + asm volatile ( + "movq %0, %%fs:-16\n" // MOVL g, -16(FS) + "movq %1, %%fs:-8\n" // MOVL m, -8(FS) + :: "r"(ts.g), "r"(ts.m) + ); + crosscall_amd64(ts.fn); + return nil; +} diff --git a/src/pkg/runtime/cgo/gcc_netbsd_arm.c b/src/pkg/runtime/cgo/gcc_netbsd_arm.c new file mode 100644 index 000000000..d93b531e7 --- /dev/null +++ b/src/pkg/runtime/cgo/gcc_netbsd_arm.c @@ -0,0 +1,122 @@ +// Copyright 2013 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#include <sys/types.h> +#include <pthread.h> +#include <signal.h> +#include <string.h> +#include "libcgo.h" + +static void *threadentry(void*); + +// We have to resort to TLS variable to save g(R10) and +// m(R9). One reason is that external code might trigger +// SIGSEGV, and our runtime.sigtramp don't even know we +// are in external code, and will continue to use R10/R9, +// this might as well result in another SIGSEGV. +// Note: all three functions will clobber R0, and the last +// two can be called from 5c ABI code. 
+void __aeabi_read_tp(void) __attribute__((naked)); +void x_cgo_save_gm(void) __attribute__((naked)); +void x_cgo_load_gm(void) __attribute__((naked)); + +void +__aeabi_read_tp(void) +{ + // this function is only allowed to clobber r0 + __asm__ __volatile__ ( + "mrc p15, 0, r0, c13, c0, 3\n\t" + "cmp r0, #0\n\t" + "movne pc, lr\n\t" + "push {r1,r2,r3,r12}\n\t" + "svc 0x00a0013c\n\t" // _lwp_getprivate + "pop {r1,r2,r3,r12}\n\t" + "mov pc, lr\n\t" + ); +} + +// g (R10) at 8(TP), m (R9) at 12(TP) +void +x_cgo_load_gm(void) +{ + __asm__ __volatile__ ( + "push {lr}\n\t" + "bl __aeabi_read_tp\n\t" + "ldr r10, [r0, #8]\n\t" + "ldr r9, [r0, #12]\n\t" + "pop {pc}\n\t" + ); +} + +void +x_cgo_save_gm(void) +{ + __asm__ __volatile__ ( + "push {lr}\n\t" + "bl __aeabi_read_tp\n\t" + "str r10, [r0, #8]\n\t" + "str r9, [r0, #12]\n\t" + "pop {pc}\n\t" + ); +} + +void +x_cgo_init(G *g) +{ + pthread_attr_t attr; + size_t size; + x_cgo_save_gm(); // save g and m for the initial thread + + pthread_attr_init(&attr); + pthread_attr_getstacksize(&attr, &size); + g->stackguard = (uintptr)&attr - size + 4096; + pthread_attr_destroy(&attr); +} + + +void +_cgo_sys_thread_start(ThreadStart *ts) +{ + pthread_attr_t attr; + sigset_t ign, oset; + pthread_t p; + size_t size; + int err; + + sigfillset(&ign); + sigprocmask(SIG_SETMASK, &ign, &oset); + + pthread_attr_init(&attr); + pthread_attr_getstacksize(&attr, &size); + ts->g->stackguard = size; + err = pthread_create(&p, &attr, threadentry, ts); + + sigprocmask(SIG_SETMASK, &oset, nil); + + if (err != 0) { + fprintf(stderr, "runtime/cgo: pthread_create failed: %s\n", strerror(err)); + abort(); + } +} + +extern void crosscall_arm2(void (*fn)(void), void *g, void *m); +static void* +threadentry(void *v) +{ + ThreadStart ts; + + ts = *(ThreadStart*)v; + free(v); + + ts.g->stackbase = (uintptr)&ts; + + /* + * _cgo_sys_thread_start set stackguard to stack size; + * change to actual guard pointer. + */ + ts.g->stackguard = (uintptr)&ts - ts.g->stackguard + 4096 * 2; + + crosscall_arm2(ts.fn, (void *)ts.g, (void *)ts.m); + return nil; +} diff --git a/src/pkg/runtime/cgo/gcc_openbsd_386.c b/src/pkg/runtime/cgo/gcc_openbsd_386.c new file mode 100644 index 000000000..86c1365ad --- /dev/null +++ b/src/pkg/runtime/cgo/gcc_openbsd_386.c @@ -0,0 +1,169 @@ +// Copyright 2009 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#include <sys/types.h> +#include <dlfcn.h> +#include <errno.h> +#include <pthread.h> +#include <signal.h> +#include <string.h> +#include "libcgo.h" + +static void* threadentry(void*); + +// TCB_SIZE is sizeof(struct thread_control_block), +// as defined in /usr/src/lib/librthread/tcb.h +#define TCB_SIZE (4 * sizeof(void *)) +#define TLS_SIZE (2 * sizeof(void *)) + +void *__get_tcb(void); +void __set_tcb(void *); + +static int (*sys_pthread_create)(pthread_t *thread, const pthread_attr_t *attr, + void *(*start_routine)(void *), void *arg); + +struct thread_args { + void *(*func)(void *); + void *arg; +}; + +static void +tcb_fixup(int mainthread) +{ + void *newtcb, *oldtcb; + + // The OpenBSD ld.so(1) does not currently support PT_TLS. As a result, + // we need to allocate our own TLS space while preserving the existing + // TCB that has been setup via librthread. + + newtcb = malloc(TCB_SIZE + TLS_SIZE); + if(newtcb == NULL) + abort(); + + // The signal trampoline expects the TLS slots to be zeroed. 
+ bzero(newtcb, TLS_SIZE); + + oldtcb = __get_tcb(); + bcopy(oldtcb, newtcb + TLS_SIZE, TCB_SIZE); + __set_tcb(newtcb + TLS_SIZE); + + // The main thread TCB is a static allocation - do not try to free it. + if(!mainthread) + free(oldtcb); +} + +static void * +thread_start_wrapper(void *arg) +{ + struct thread_args args = *(struct thread_args *)arg; + + free(arg); + tcb_fixup(0); + + return args.func(args.arg); +} + +int +pthread_create(pthread_t *thread, const pthread_attr_t *attr, + void *(*start_routine)(void *), void *arg) +{ + struct thread_args *p; + + p = malloc(sizeof(*p)); + if(p == NULL) { + errno = ENOMEM; + return -1; + } + p->func = start_routine; + p->arg = arg; + + return sys_pthread_create(thread, attr, thread_start_wrapper, p); +} + +void +x_cgo_init(G *g) +{ + pthread_attr_t attr; + size_t size; + void *handle; + + pthread_attr_init(&attr); + pthread_attr_getstacksize(&attr, &size); + g->stackguard = (uintptr)&attr - size + 4096; + pthread_attr_destroy(&attr); + + // Locate symbol for the system pthread_create function. + handle = dlopen("libpthread.so", RTLD_LAZY); + if(handle == NULL) { + fprintf(stderr, "dlopen: failed to load libpthread: %s\n", dlerror()); + abort(); + } + sys_pthread_create = dlsym(handle, "pthread_create"); + if(sys_pthread_create == NULL) { + fprintf(stderr, "dlsym: failed to find pthread_create: %s\n", dlerror()); + abort(); + } + dlclose(handle); + + tcb_fixup(1); +} + + +void +_cgo_sys_thread_start(ThreadStart *ts) +{ + pthread_attr_t attr; + sigset_t ign, oset; + pthread_t p; + size_t size; + int err; + + sigfillset(&ign); + sigprocmask(SIG_SETMASK, &ign, &oset); + + pthread_attr_init(&attr); + pthread_attr_getstacksize(&attr, &size); + ts->g->stackguard = size; + err = sys_pthread_create(&p, &attr, threadentry, ts); + + sigprocmask(SIG_SETMASK, &oset, nil); + + if (err != 0) { + fprintf(stderr, "runtime/cgo: pthread_create failed: %s\n", strerror(err)); + abort(); + } +} + +static void* +threadentry(void *v) +{ + ThreadStart ts; + + tcb_fixup(0); + + ts = *(ThreadStart*)v; + free(v); + + ts.g->stackbase = (uintptr)&ts; + + /* + * _cgo_sys_thread_start set stackguard to stack size; + * change to actual guard pointer. + */ + ts.g->stackguard = (uintptr)&ts - ts.g->stackguard + 4096; + + /* + * Set specific keys. On OpenBSD/ELF, the thread local storage + * is just before %gs:0. Our dynamic 8.out's reserve 8 bytes + * for the two words g and m at %gs:-8 and %gs:-4. + */ + asm volatile ( + "movl %0, %%gs:-8\n" // MOVL g, -8(GS) + "movl %1, %%gs:-4\n" // MOVL m, -4(GS) + :: "r"(ts.g), "r"(ts.m) + ); + + crosscall_386(ts.fn); + return nil; +} diff --git a/src/pkg/runtime/cgo/gcc_openbsd_amd64.c b/src/pkg/runtime/cgo/gcc_openbsd_amd64.c new file mode 100644 index 000000000..d3a5e36b0 --- /dev/null +++ b/src/pkg/runtime/cgo/gcc_openbsd_amd64.c @@ -0,0 +1,169 @@ +// Copyright 2009 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. 
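The tcb_fixup above works around OpenBSD's missing PT_TLS support by allocating a combined block and re-pointing the TCB register into its middle. A sketch of the layout math (illustrative, not the runtime's code):

	/* Layout produced by tcb_fixup:
	 *
	 *   newtcb:  [ TLS_SIZE bytes, zeroed | TCB_SIZE bytes, copied ]
	 *                                     ^ __set_tcb points here
	 *
	 * so negative offsets from the TCB register (%gs:-8 / %gs:-4 on 386)
	 * land in the fresh TLS area while the librthread TCB is preserved. */
	#include <stdlib.h>
	#include <string.h>

	static void *
	make_tls_block(const void *oldtcb, size_t tcb_size, size_t tls_size)
	{
		char *p = malloc(tls_size + tcb_size);
		if(p == NULL)
			abort();
		memset(p, 0, tls_size);                  /* signal code expects zeroed TLS */
		memcpy(p + tls_size, oldtcb, tcb_size);  /* keep the existing TCB intact */
		return p + tls_size;                     /* value to hand to __set_tcb */
	}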
+ +#include <sys/types.h> +#include <dlfcn.h> +#include <errno.h> +#include <pthread.h> +#include <signal.h> +#include <string.h> +#include "libcgo.h" + +static void* threadentry(void*); + +// TCB_SIZE is sizeof(struct thread_control_block), +// as defined in /usr/src/lib/librthread/tcb.h +#define TCB_SIZE (4 * sizeof(void *)) +#define TLS_SIZE (2 * sizeof(void *)) + +void *__get_tcb(void); +void __set_tcb(void *); + +static int (*sys_pthread_create)(pthread_t *thread, const pthread_attr_t *attr, + void *(*start_routine)(void *), void *arg); + +struct thread_args { + void *(*func)(void *); + void *arg; +}; + +static void +tcb_fixup(int mainthread) +{ + void *newtcb, *oldtcb; + + // The OpenBSD ld.so(1) does not currently support PT_TLS. As a result, + // we need to allocate our own TLS space while preserving the existing + // TCB that has been setup via librthread. + + newtcb = malloc(TCB_SIZE + TLS_SIZE); + if(newtcb == NULL) + abort(); + + // The signal trampoline expects the TLS slots to be zeroed. + bzero(newtcb, TLS_SIZE); + + oldtcb = __get_tcb(); + bcopy(oldtcb, newtcb + TLS_SIZE, TCB_SIZE); + __set_tcb(newtcb + TLS_SIZE); + + // The main thread TCB is a static allocation - do not try to free it. + if(!mainthread) + free(oldtcb); +} + +static void * +thread_start_wrapper(void *arg) +{ + struct thread_args args = *(struct thread_args *)arg; + + free(arg); + tcb_fixup(0); + + return args.func(args.arg); +} + +int +pthread_create(pthread_t *thread, const pthread_attr_t *attr, + void *(*start_routine)(void *), void *arg) +{ + struct thread_args *p; + + p = malloc(sizeof(*p)); + if(p == NULL) { + errno = ENOMEM; + return -1; + } + p->func = start_routine; + p->arg = arg; + + return sys_pthread_create(thread, attr, thread_start_wrapper, p); +} + +void +x_cgo_init(G *g) +{ + pthread_attr_t attr; + size_t size; + void *handle; + + pthread_attr_init(&attr); + pthread_attr_getstacksize(&attr, &size); + g->stackguard = (uintptr)&attr - size + 4096; + pthread_attr_destroy(&attr); + + // Locate symbol for the system pthread_create function. + handle = dlopen("libpthread.so", RTLD_LAZY); + if(handle == NULL) { + fprintf(stderr, "dlopen: failed to load libpthread: %s\n", dlerror()); + abort(); + } + sys_pthread_create = dlsym(handle, "pthread_create"); + if(sys_pthread_create == NULL) { + fprintf(stderr, "dlsym: failed to find pthread_create: %s\n", dlerror()); + abort(); + } + dlclose(handle); + + tcb_fixup(1); +} + + +void +_cgo_sys_thread_start(ThreadStart *ts) +{ + pthread_attr_t attr; + sigset_t ign, oset; + pthread_t p; + size_t size; + int err; + + sigfillset(&ign); + sigprocmask(SIG_SETMASK, &ign, &oset); + + pthread_attr_init(&attr); + pthread_attr_getstacksize(&attr, &size); + + ts->g->stackguard = size; + err = sys_pthread_create(&p, &attr, threadentry, ts); + + sigprocmask(SIG_SETMASK, &oset, nil); + + if (err != 0) { + fprintf(stderr, "runtime/cgo: pthread_create failed: %s\n", strerror(err)); + abort(); + } +} + +static void* +threadentry(void *v) +{ + ThreadStart ts; + + tcb_fixup(0); + + ts = *(ThreadStart*)v; + free(v); + + ts.g->stackbase = (uintptr)&ts; + + /* + * _cgo_sys_thread_start set stackguard to stack size; + * change to actual guard pointer. + */ + ts.g->stackguard = (uintptr)&ts - ts.g->stackguard + 4096; + + /* + * Set specific keys. On OpenBSD/ELF, the thread local storage + * is just before %fs:0. Our dynamic 6.out's reserve 16 bytes + * for the two words g and m at %fs:-16 and %fs:-8. 
+ */ + asm volatile ( + "movq %0, %%fs:-16\n" // MOVL g, -16(FS) + "movq %1, %%fs:-8\n" // MOVL m, -8(FS) + :: "r"(ts.g), "r"(ts.m) + ); + crosscall_amd64(ts.fn); + return nil; +} diff --git a/src/pkg/runtime/cgo/gcc_setenv.c b/src/pkg/runtime/cgo/gcc_setenv.c index 7da4ad915..a0938166d 100644 --- a/src/pkg/runtime/cgo/gcc_setenv.c +++ b/src/pkg/runtime/cgo/gcc_setenv.c @@ -1,4 +1,4 @@ -// Copyright 20111 The Go Authors. All rights reserved. +// Copyright 2011 The Go Authors. All rights reserved. // Use of this source code is governed by a BSD-style // license that can be found in the LICENSE file. @@ -9,10 +9,8 @@ #include <stdlib.h> /* Stub for calling setenv */ -static void -xlibcgo_setenv(char **arg) +void +x_cgo_setenv(char **arg) { setenv(arg[0], arg[1], 1); } - -void (*libcgo_setenv)(char**) = xlibcgo_setenv; diff --git a/src/pkg/runtime/cgo/gcc_util.c b/src/pkg/runtime/cgo/gcc_util.c index e06b6f64d..20913d736 100644 --- a/src/pkg/runtime/cgo/gcc_util.c +++ b/src/pkg/runtime/cgo/gcc_util.c @@ -5,7 +5,7 @@ #include "libcgo.h" /* Stub for calling malloc from Go */ -static void +void x_cgo_malloc(void *p) { struct a { @@ -16,10 +16,8 @@ x_cgo_malloc(void *p) a->ret = malloc(a->n); } -void (*_cgo_malloc)(void*) = x_cgo_malloc; - /* Stub for calling free from Go */ -static void +void x_cgo_free(void *p) { struct a { @@ -29,11 +27,9 @@ x_cgo_free(void *p) free(a->arg); } -void (*_cgo_free)(void*) = x_cgo_free; - /* Stub for creating a new thread */ -static void -xlibcgo_thread_start(ThreadStart *arg) +void +x_cgo_thread_start(ThreadStart *arg) { ThreadStart *ts; @@ -45,7 +41,5 @@ xlibcgo_thread_start(ThreadStart *arg) } *ts = *arg; - libcgo_sys_thread_start(ts); /* OS-dependent half */ + _cgo_sys_thread_start(ts); /* OS-dependent half */ } - -void (*libcgo_thread_start)(ThreadStart*) = xlibcgo_thread_start; diff --git a/src/pkg/runtime/cgo/gcc_windows_386.c b/src/pkg/runtime/cgo/gcc_windows_386.c index 2b940d362..02eab12e5 100644 --- a/src/pkg/runtime/cgo/gcc_windows_386.c +++ b/src/pkg/runtime/cgo/gcc_windows_386.c @@ -4,31 +4,31 @@ #define WIN32_LEAN_AND_MEAN #include <windows.h> +#include <process.h> #include "libcgo.h" -static void *threadentry(void*); +static void threadentry(void*); /* 1MB is default stack size for 32-bit Windows. Allocation granularity on Windows is typically 64 KB. The constant is also hardcoded in cmd/ld/pe.c (keep synchronized). */ #define STACKSIZE (1*1024*1024) -static void -xinitcgo(G *g) +void +x_cgo_init(G *g) { int tmp; g->stackguard = (uintptr)&tmp - STACKSIZE + 8*1024; } -void (*initcgo)(G*) = xinitcgo; void -libcgo_sys_thread_start(ThreadStart *ts) +_cgo_sys_thread_start(ThreadStart *ts) { _beginthread(threadentry, 0, ts); } -static void* +static void threadentry(void *v) { ThreadStart ts; @@ -55,5 +55,4 @@ threadentry(void *v) crosscall_386(ts.fn); LocalFree(tls0); - return nil; } diff --git a/src/pkg/runtime/cgo/gcc_windows_amd64.c b/src/pkg/runtime/cgo/gcc_windows_amd64.c index 0d2f5d233..f7695a1cc 100644 --- a/src/pkg/runtime/cgo/gcc_windows_amd64.c +++ b/src/pkg/runtime/cgo/gcc_windows_amd64.c @@ -4,31 +4,31 @@ #define WIN64_LEAN_AND_MEAN #include <windows.h> +#include <process.h> #include "libcgo.h" -static void *threadentry(void*); +static void threadentry(void*); /* 2MB is default stack size for 64-bit Windows. Allocation granularity on Windows is typically 64 KB. The constant is also hardcoded in cmd/ld/pe.c (keep synchronized). 
*/ #define STACKSIZE (2*1024*1024) -static void -xinitcgo(G *g) +void +x_cgo_init(G *g) { int tmp; g->stackguard = (uintptr)&tmp - STACKSIZE + 8*1024; } -void (*initcgo)(G*) = xinitcgo; void -libcgo_sys_thread_start(ThreadStart *ts) +_cgo_sys_thread_start(ThreadStart *ts) { _beginthread(threadentry, 0, ts); } -static void* +static void threadentry(void *v) { ThreadStart ts; @@ -53,5 +53,4 @@ threadentry(void *v) ); crosscall_amd64(ts.fn); - return nil; } diff --git a/src/pkg/runtime/cgo/libcgo.h b/src/pkg/runtime/cgo/libcgo.h index c31d19d76..41a371c27 100644 --- a/src/pkg/runtime/cgo/libcgo.h +++ b/src/pkg/runtime/cgo/libcgo.h @@ -26,7 +26,7 @@ struct G }; /* - * Arguments to the libcgo_thread_start call. + * Arguments to the _cgo_thread_start call. * Also known to ../pkg/runtime/runtime.h. */ typedef struct ThreadStart ThreadStart; @@ -40,14 +40,14 @@ struct ThreadStart /* * Called by 5c/6c/8c world. * Makes a local copy of the ThreadStart and - * calls libcgo_sys_thread_start(ts). + * calls _cgo_sys_thread_start(ts). */ -extern void (*libcgo_thread_start)(ThreadStart *ts); +extern void (*_cgo_thread_start)(ThreadStart *ts); /* * Creates the new operating system thread (OS, arch dependent). */ -void libcgo_sys_thread_start(ThreadStart *ts); +void _cgo_sys_thread_start(ThreadStart *ts); /* * Call fn in the 6c world. diff --git a/src/pkg/runtime/cgo/netbsd.c b/src/pkg/runtime/cgo/netbsd.c new file mode 100644 index 000000000..b6403f686 --- /dev/null +++ b/src/pkg/runtime/cgo/netbsd.c @@ -0,0 +1,13 @@ +// Copyright 2010 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// Supply environ and __progname, because we don't +// link against the standard NetBSD crt0.o and the +// libc dynamic library needs them. + +char *environ[1]; +char *__progname; + +#pragma dynexport environ environ +#pragma dynexport __progname __progname diff --git a/src/pkg/runtime/cgo/openbsd.c b/src/pkg/runtime/cgo/openbsd.c new file mode 100644 index 000000000..84e9f9eff --- /dev/null +++ b/src/pkg/runtime/cgo/openbsd.c @@ -0,0 +1,21 @@ +// Copyright 2010 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// Supply environ, __progname and __guard_local, because +// we don't link against the standard OpenBSD crt0.o and +// the libc dynamic library needs them. + +char *environ[1]; +char *__progname; +long __guard_local; + +#pragma dynexport environ environ +#pragma dynexport __progname __progname + +// This is normally marked as hidden and placed in the +// .openbsd.randomdata section. +#pragma dynexport __guard_local __guard_local + +// We override pthread_create to support PT_TLS. +#pragma dynexport pthread_create pthread_create diff --git a/src/pkg/runtime/cgo/setenv.c b/src/pkg/runtime/cgo/setenv.c new file mode 100644 index 000000000..4c47cdb00 --- /dev/null +++ b/src/pkg/runtime/cgo/setenv.c @@ -0,0 +1,10 @@ +// Copyright 2011 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. 
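The gcc_setenv.c and gcc_util.c hunks above, together with the new setenv.c that follows, show the renaming scheme this patch applies throughout: instead of gcc-compiled code exporting a function pointer (libcgo_setenv), the gcc side now defines a plainly named x_cgo_* function and the Go-toolchain side imports the symbol and publishes the pointer itself. A sketch of the two halves, using a hypothetical symbol x_cgo_example:

	/* gcc-compiled half: an ordinary function. */
	void
	x_cgo_example(char **arg)
	{
		/* ... do the libc work ... */
	}

	/* 6c/8c-compiled half: import the symbol statically and expose
	 * it through a pointer the runtime can test for nil when cgo
	 * is not linked in. (Illustrative; mirrors setenv.c below.) */
	#pragma cgo_import_static x_cgo_example
	void x_cgo_example(char**);
	void (*_cgo_example)(char**) = x_cgo_example;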
+ +// +build darwin freebsd linux netbsd openbsd + +#pragma cgo_import_static x_cgo_setenv + +void x_cgo_setenv(char**); +void (*_cgo_setenv)(char**) = x_cgo_setenv; diff --git a/src/pkg/runtime/cgocall.c b/src/pkg/runtime/cgocall.c index 7a26538ec..590bf9b67 100644 --- a/src/pkg/runtime/cgocall.c +++ b/src/pkg/runtime/cgocall.c @@ -6,6 +6,7 @@ #include "arch_GOARCH.h" #include "stack.h" #include "cgocall.h" +#include "race.h" // Cgo call and callback support. // @@ -41,7 +42,7 @@ // know about packages). The gcc-compiled C function f calls GoF. // // GoF calls crosscall2(_cgoexp_GoF, frame, framesize). Crosscall2 -// (in cgo/$GOOS.S, a gcc-compiled assembly file) is a two-argument +// (in cgo/gcc_$GOARCH.S, a gcc-compiled assembly file) is a two-argument // adapter from the gcc function call ABI to the 6c function call ABI. // It is called from gcc to call 6c functions. In this case it calls // _cgoexp_GoF(frame, framesize), still running on m->g0's stack @@ -82,43 +83,55 @@ // _cgoexp_GoF immediately returns to crosscall2, which restores the // callee-save registers for gcc and returns to GoF, which returns to f. -void *initcgo; /* filled in by dynamic linker when Cgo is available */ +void *_cgo_init; /* filled in by dynamic linker when Cgo is available */ +static int64 cgosync; /* represents possible synchronization in C code */ + +// These two are only used by the architecture where TLS based storage isn't +// the default for g and m (e.g., ARM) +void *_cgo_load_gm; /* filled in by dynamic linker when Cgo is available */ +void *_cgo_save_gm; /* filled in by dynamic linker when Cgo is available */ -static void unlockm(void); static void unwindm(void); // Call from Go to C. +static FuncVal unlockOSThread = { runtime·unlockOSThread }; + void runtime·cgocall(void (*fn)(void*), void *arg) { Defer d; + if(m->racecall) { + runtime·asmcgocall(fn, arg); + return; + } + if(!runtime·iscgo && !Windows) runtime·throw("cgocall unavailable"); if(fn == 0) runtime·throw("cgocall nil"); + if(raceenabled) + runtime·racereleasemerge(&cgosync); + m->ncgocall++; /* * Lock g to m to ensure we stay on the same stack if we do a - * cgo callback. + * cgo callback. Add entry to defer stack in case of panic. */ - d.nofree = false; - if(m->lockedg == nil) { - m->lockedg = g; - g->lockedm = m; - - // Add entry to defer stack in case of panic. - d.fn = (byte*)unlockm; - d.siz = 0; - d.link = g->defer; - d.argp = (void*)-1; // unused because unlockm never recovers - d.nofree = true; - g->defer = &d; - } + runtime·lockOSThread(); + d.fn = &unlockOSThread; + d.siz = 0; + d.link = g->defer; + d.argp = (void*)-1; // unused because unlockm never recovers + d.special = true; + d.free = false; + g->defer = &d; + + m->ncgo++; /* * Announce we are entering a system call @@ -135,29 +148,31 @@ runtime·cgocall(void (*fn)(void*), void *arg) runtime·asmcgocall(fn, arg); runtime·exitsyscall(); - if(d.nofree) { - if(g->defer != &d || d.fn != (byte*)unlockm) - runtime·throw("runtime: bad defer entry in cgocallback"); - g->defer = d.link; - unlockm(); + m->ncgo--; + if(m->ncgo == 0) { + // We are going back to Go and are not in a recursive + // call. Let the GC collect any memory allocated via + // _cgo_allocate that is no longer referenced. 
+ m->cgomal = nil; } -} -static void -unlockm(void) -{ - m->lockedg = nil; - g->lockedm = nil; + if(g->defer != &d || d.fn != &unlockOSThread) + runtime·throw("runtime: bad defer entry in cgocallback"); + g->defer = d.link; + runtime·unlockOSThread(); + + if(raceenabled) + runtime·raceacquire(&cgosync); } void runtime·NumCgoCall(int64 ret) { - M *m; + M *mp; ret = 0; - for(m=runtime·atomicloadp(&runtime·allm); m; m=m->alllink) - ret += m->ncgocall; + for(mp=runtime·atomicloadp(&runtime·allm); mp; mp=mp->alllink) + ret += mp->ncgocall; FLUSH(&ret); } @@ -188,31 +203,50 @@ runtime·cfree(void *p) // Call from C back to Go. +static FuncVal unwindmf = {unwindm}; + void -runtime·cgocallbackg(void (*fn)(void), void *arg, uintptr argsize) +runtime·cgocallbackg(FuncVal *fn, void *arg, uintptr argsize) { Defer d; + if(m->racecall) { + reflect·call(fn, arg, argsize); + return; + } + if(g != m->curg) runtime·throw("runtime: bad g in cgocallback"); runtime·exitsyscall(); // coming out of cgo call + if(m->needextram) { + m->needextram = 0; + runtime·newextram(); + } + // Add entry to defer stack in case of panic. - d.fn = (byte*)unwindm; + d.fn = &unwindmf; d.siz = 0; d.link = g->defer; d.argp = (void*)-1; // unused because unwindm never recovers - d.nofree = true; + d.special = true; + d.free = false; g->defer = &d; + if(raceenabled) + runtime·raceacquire(&cgosync); + // Invoke callback. - reflect·call((byte*)fn, arg, argsize); + reflect·call(fn, arg, argsize); + + if(raceenabled) + runtime·racereleasemerge(&cgosync); // Pop defer. // Do not unwind m->g0->sched.sp. // Our caller, cgocallback, will do that. - if(g->defer != &d || d.fn != (byte*)unwindm) + if(g->defer != &d || d.fn != &unwindmf) runtime·throw("runtime: bad defer entry in cgocallback"); g->defer = d.link; @@ -229,7 +263,8 @@ unwindm(void) runtime·throw("runtime: unwindm not implemented"); case '8': case '6': - m->g0->sched.sp = *(void**)m->g0->sched.sp; + case '5': + m->g0->sched.sp = *(uintptr*)m->g0->sched.sp; break; } } diff --git a/src/pkg/runtime/chan.c b/src/pkg/runtime/chan.c index ef27144ef..32995c6dd 100644 --- a/src/pkg/runtime/chan.c +++ b/src/pkg/runtime/chan.c @@ -3,7 +3,10 @@ // license that can be found in the LICENSE file. #include "runtime.h" +#include "arch_GOARCH.h" #include "type.h" +#include "race.h" +#include "malloc.h" #define MAXALIGN 7 #define NOSELGEN 1 @@ -20,6 +23,7 @@ struct SudoG G* g; // g and selgen constitute uint32 selgen; // a weak pointer to g SudoG* link; + int64 releasetime; byte* elem; // data element }; @@ -29,21 +33,25 @@ struct WaitQ SudoG* last; }; +// The garbage collector is assuming that Hchan can only contain pointers into the stack +// and cannot contain pointers into the heap. struct Hchan { - uint32 qcount; // total data in the q - uint32 dataqsiz; // size of the circular q + uintgo qcount; // total data in the q + uintgo dataqsiz; // size of the circular q uint16 elemsize; bool closed; uint8 elemalign; Alg* elemalg; // interface for element type - uint32 sendx; // send index - uint32 recvx; // receive index + uintgo sendx; // send index + uintgo recvx; // receive index WaitQ recvq; // list of recv waiters WaitQ sendq; // list of send waiters Lock; }; +uint32 runtime·Hchansize = sizeof(Hchan); + // Buffer follows Hchan immediately in memory. // chanbuf(c, i) is pointer to the i'th slot in the buffer. 
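A worked example of that layout: for a channel whose header occupies H bytes with element size E, slot i sits at (byte*)c + H + E*i; the chanbuf macro below encodes exactly this. A standalone sketch, with the header size passed explicitly so it stays self-contained:

	#include <stddef.h>

	/* Sketch of chanbuf's pointer arithmetic: the circular buffer
	 * starts immediately after the Hchan header. */
	static void *
	chan_slot(void *c, size_t hdrsize, size_t elemsize, size_t i)
	{
		return (char *)c + hdrsize + elemsize * i;
	}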
#define chanbuf(c, i) ((byte*)((c)+1)+(uintptr)(c)->elemsize*(i)) @@ -79,17 +87,22 @@ static void dequeueg(WaitQ*); static SudoG* dequeue(WaitQ*); static void enqueue(WaitQ*, SudoG*); static void destroychan(Hchan*); +static void racesync(Hchan*, SudoG*); Hchan* runtime·makechan_c(ChanType *t, int64 hint) { Hchan *c; - int32 n; + uintptr n; Type *elem; - + elem = t->elem; - if(hint < 0 || (int32)hint != hint || (elem->size > 0 && hint > ((uintptr)-1) / elem->size)) + // compiler checks this but be safe. + if(elem->size >= (1<<16)) + runtime·throw("makechan: invalid channel element type"); + + if(hint < 0 || (intgo)hint != hint || (elem->size > 0 && hint > MaxMem / elem->size)) runtime·panicstring("makechan: size out of range"); // calculate rounded size of Hchan @@ -103,18 +116,19 @@ runtime·makechan_c(ChanType *t, int64 hint) c->elemalg = elem->alg; c->elemalign = elem->align; c->dataqsiz = hint; + runtime·settype(c, (uintptr)t | TypeInfo_Chan); if(debug) - runtime·printf("makechan: chan=%p; elemsize=%D; elemalg=%p; elemalign=%d; dataqsiz=%d\n", - c, (int64)elem->size, elem->alg, elem->align, c->dataqsiz); + runtime·printf("makechan: chan=%p; elemsize=%D; elemalg=%p; elemalign=%d; dataqsiz=%D\n", + c, (int64)elem->size, elem->alg, elem->align, (int64)c->dataqsiz); return c; } // For reflect -// func makechan(typ *ChanType, size uint32) (chan) +// func makechan(typ *ChanType, size uint64) (chan) void -reflect·makechan(ChanType *t, uint32 size, Hchan *c) +reflect·makechan(ChanType *t, uint64 size, Hchan *c) { c = runtime·makechan_c(t, size); FLUSH(&c); @@ -143,11 +157,12 @@ runtime·makechan(ChanType *t, int64 hint, Hchan *ret) * the operation; we'll see that it's now closed. */ void -runtime·chansend(ChanType *t, Hchan *c, byte *ep, bool *pres) +runtime·chansend(ChanType *t, Hchan *c, byte *ep, bool *pres, void *pc) { SudoG *sg; SudoG mysg; G* gp; + int64 t0; if(c == nil) { USED(t); @@ -155,9 +170,7 @@ runtime·chansend(ChanType *t, Hchan *c, byte *ep, bool *pres) *pres = false; return; } - g->status = Gwaiting; - g->waitreason = "chan send (nil chan)"; - runtime·gosched(); + runtime·park(nil, nil, "chan send (nil chan)"); return; // not reached } @@ -170,7 +183,17 @@ runtime·chansend(ChanType *t, Hchan *c, byte *ep, bool *pres) runtime·prints("\n"); } + t0 = 0; + mysg.releasetime = 0; + if(runtime·blockprofilerate > 0) { + t0 = runtime·cputicks(); + mysg.releasetime = -1; + } + runtime·lock(c); + // TODO(dvyukov): add similar instrumentation to select. 
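The t0/releasetime bookkeeping introduced above is the channel half of block profiling: a goroutine about to block records a start tick and marks releasetime = -1; whichever goroutine wakes it stamps the release tick; the difference feeds runtime·blockevent. A sketch of the handshake (illustrative types and externs, not the runtime's):

	/* Sketch of the block-profiling handshake. cputicks() is assumed
	 * to be a monotonic cycle counter, as in the runtime. */
	typedef struct Waiter {
		long long releasetime;   /* -1: please stamp me; >0: stamped */
	} Waiter;

	extern long long cputicks(void);
	extern void park(Waiter *w);   /* blocks until another thread wakes w */
	extern void blockevent(long long cycles, int skip);

	static void
	block_and_account(Waiter *w)
	{
		long long t0 = cputicks();
		w->releasetime = -1;
		/* waker does: if(w->releasetime) w->releasetime = cputicks(); */
		park(w);
		if(w->releasetime > 0)
			blockevent(w->releasetime - t0, 2);
	}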
+ if(raceenabled) + runtime·racereadpc(c, pc, runtime·chansend); if(c->closed) goto closed; @@ -179,12 +202,16 @@ runtime·chansend(ChanType *t, Hchan *c, byte *ep, bool *pres) sg = dequeue(&c->recvq); if(sg != nil) { + if(raceenabled) + racesync(c, sg); runtime·unlock(c); - + gp = sg->g; gp->param = sg; if(sg->elem != nil) c->elemalg->copy(c->elemsize, sg->elem, ep); + if(sg->releasetime) + sg->releasetime = runtime·cputicks(); runtime·ready(gp); if(pres != nil) @@ -202,11 +229,8 @@ runtime·chansend(ChanType *t, Hchan *c, byte *ep, bool *pres) mysg.g = g; mysg.selgen = NOSELGEN; g->param = nil; - g->status = Gwaiting; - g->waitreason = "chan send"; enqueue(&c->sendq, &mysg); - runtime·unlock(c); - runtime·gosched(); + runtime·park(runtime·unlock, c, "chan send"); if(g->param == nil) { runtime·lock(c); @@ -215,6 +239,9 @@ runtime·chansend(ChanType *t, Hchan *c, byte *ep, bool *pres) goto closed; } + if(mysg.releasetime > 0) + runtime·blockevent(mysg.releasetime - t0, 2); + return; asynch: @@ -230,15 +257,16 @@ asynch: mysg.g = g; mysg.elem = nil; mysg.selgen = NOSELGEN; - g->status = Gwaiting; - g->waitreason = "chan send"; enqueue(&c->sendq, &mysg); - runtime·unlock(c); - runtime·gosched(); + runtime·park(runtime·unlock, c, "chan send"); runtime·lock(c); goto asynch; } + + if(raceenabled) + runtime·racerelease(chanbuf(c, c->sendx)); + c->elemalg->copy(c->elemsize, chanbuf(c, c->sendx), ep); if(++c->sendx == c->dataqsiz) c->sendx = 0; @@ -248,11 +276,15 @@ asynch: if(sg != nil) { gp = sg->g; runtime·unlock(c); + if(sg->releasetime) + sg->releasetime = runtime·cputicks(); runtime·ready(gp); } else runtime·unlock(c); if(pres != nil) *pres = true; + if(mysg.releasetime > 0) + runtime·blockevent(mysg.releasetime - t0, 2); return; closed: @@ -267,6 +299,7 @@ runtime·chanrecv(ChanType *t, Hchan* c, byte *ep, bool *selected, bool *receive SudoG *sg; SudoG mysg; G *gp; + int64 t0; if(runtime·gcwaiting) runtime·gosched(); @@ -280,12 +313,17 @@ runtime·chanrecv(ChanType *t, Hchan* c, byte *ep, bool *selected, bool *receive *selected = false; return; } - g->status = Gwaiting; - g->waitreason = "chan receive (nil chan)"; - runtime·gosched(); + runtime·park(nil, nil, "chan receive (nil chan)"); return; // not reached } + t0 = 0; + mysg.releasetime = 0; + if(runtime·blockprofilerate > 0) { + t0 = runtime·cputicks(); + mysg.releasetime = -1; + } + runtime·lock(c); if(c->dataqsiz > 0) goto asynch; @@ -295,12 +333,16 @@ runtime·chanrecv(ChanType *t, Hchan* c, byte *ep, bool *selected, bool *receive sg = dequeue(&c->sendq); if(sg != nil) { + if(raceenabled) + racesync(c, sg); runtime·unlock(c); if(ep != nil) c->elemalg->copy(c->elemsize, ep, sg->elem); gp = sg->g; gp->param = sg; + if(sg->releasetime) + sg->releasetime = runtime·cputicks(); runtime·ready(gp); if(selected != nil) @@ -320,11 +362,8 @@ runtime·chanrecv(ChanType *t, Hchan* c, byte *ep, bool *selected, bool *receive mysg.g = g; mysg.selgen = NOSELGEN; g->param = nil; - g->status = Gwaiting; - g->waitreason = "chan receive"; enqueue(&c->recvq, &mysg); - runtime·unlock(c); - runtime·gosched(); + runtime·park(runtime·unlock, c, "chan receive"); if(g->param == nil) { runtime·lock(c); @@ -335,6 +374,8 @@ runtime·chanrecv(ChanType *t, Hchan* c, byte *ep, bool *selected, bool *receive if(received != nil) *received = true; + if(mysg.releasetime > 0) + runtime·blockevent(mysg.releasetime - t0, 2); return; asynch: @@ -352,15 +393,16 @@ asynch: mysg.g = g; mysg.elem = nil; mysg.selgen = NOSELGEN; - g->status = Gwaiting; - g->waitreason = "chan receive"; 
enqueue(&c->recvq, &mysg); - runtime·unlock(c); - runtime·gosched(); + runtime·park(runtime·unlock, c, "chan receive"); runtime·lock(c); goto asynch; } + + if(raceenabled) + runtime·raceacquire(chanbuf(c, c->recvx)); + if(ep != nil) c->elemalg->copy(c->elemsize, ep, chanbuf(c, c->recvx)); c->elemalg->copy(c->elemsize, chanbuf(c, c->recvx), nil); @@ -372,6 +414,8 @@ asynch: if(sg != nil) { gp = sg->g; runtime·unlock(c); + if(sg->releasetime) + sg->releasetime = runtime·cputicks(); runtime·ready(gp); } else runtime·unlock(c); @@ -380,6 +424,8 @@ asynch: *selected = true; if(received != nil) *received = true; + if(mysg.releasetime > 0) + runtime·blockevent(mysg.releasetime - t0, 2); return; closed: @@ -389,7 +435,11 @@ closed: *selected = true; if(received != nil) *received = false; + if(raceenabled) + runtime·raceacquire(c); runtime·unlock(c); + if(mysg.releasetime > 0) + runtime·blockevent(mysg.releasetime - t0, 2); } // chansend1(hchan *chan any, elem any); @@ -397,7 +447,7 @@ closed: void runtime·chansend1(ChanType *t, Hchan* c, ...) { - runtime·chansend(t, c, (byte*)(&c+1), nil); + runtime·chansend(t, c, (byte*)(&c+1), nil, runtime·getcallerpc(&t)); } // chanrecv1(hchan *chan any) (elem any); @@ -446,8 +496,8 @@ runtime·selectnbsend(ChanType *t, Hchan *c, ...) byte *ae, *ap; ae = (byte*)(&c + 1); - ap = ae + runtime·rnd(t->elem->size, Structrnd); - runtime·chansend(t, c, ae, ap); + ap = ae + ROUND(t->elem->size, Structrnd); + runtime·chansend(t, c, ae, ap, runtime·getcallerpc(&t)); } // func selectnbrecv(elem *any, c chan any) bool @@ -474,7 +524,7 @@ void runtime·selectnbrecv(ChanType *t, byte *v, Hchan *c, bool selected) { runtime·chanrecv(t, c, v, &selected, nil); -} +} // func selectnbrecv2(elem *any, ok *bool, c chan any) bool // @@ -500,7 +550,7 @@ void runtime·selectnbrecv2(ChanType *t, byte *v, bool *received, Hchan *c, bool selected) { runtime·chanrecv(t, c, v, &selected, received); -} +} // For reflect: // func chansend(c chan, val iword, nb bool) (selected bool) @@ -509,12 +559,13 @@ runtime·selectnbrecv2(ChanType *t, byte *v, bool *received, Hchan *c, bool sele // // The "uintptr selected" is really "bool selected" but saying // uintptr gets us the right alignment for the output parameter block. +#pragma textflag 7 void reflect·chansend(ChanType *t, Hchan *c, uintptr val, bool nb, uintptr selected) { bool *sp; byte *vp; - + if(nb) { selected = false; sp = (bool*)&selected; @@ -527,7 +578,7 @@ reflect·chansend(ChanType *t, Hchan *c, uintptr val, bool nb, uintptr selected) vp = (byte*)&val; else vp = (byte*)val; - runtime·chansend(t, c, vp, sp); + runtime·chansend(t, c, vp, sp, runtime·getcallerpc(&t)); } // For reflect: @@ -571,7 +622,7 @@ runtime·newselect(int32 size, ...) 
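A few hunks below, selectgo's lock-ordering sort is rewritten from an insertion sort into a heap sort, guaranteeing n log n time and constant stack footprint no matter how many cases a select has. A standalone sketch of the same two phases (sift-up build, then pop-max), operating on a plain pointer array; illustrative, not runtime code:

	#include <stddef.h>

	static void
	sort_addrs(void **a, size_t n)
	{
		size_t i, j, k;
		void *c;

		for(i = 0; i < n; i++) {       /* phase 1: sift up into a max-heap */
			j = i;
			c = a[j];
			while(j > 0 && a[k = (j-1)/2] < c) {
				a[j] = a[k];
				j = k;
			}
			a[j] = c;
		}
		for(i = n; i-- > 0; ) {        /* phase 2: pop max to the end */
			c = a[i];
			a[i] = a[0];
			j = 0;
			for(;;) {                  /* sift c down within a[0..i) */
				k = 2*j + 1;
				if(k >= i)
					break;
				if(k+1 < i && a[k] < a[k+1])
					k++;
				if(c < a[k]) {
					a[j] = a[k];
					j = k;
					continue;
				}
				break;
			}
			a[j] = c;
		}
	}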
int32 o; Select **selp; - o = runtime·rnd(sizeof(size), Structrnd); + o = ROUND(sizeof(size), Structrnd); selp = (Select**)((byte*)&size + o); newselect(size, selp); } @@ -619,7 +670,7 @@ runtime·selectsend(Select *sel, Hchan *c, void *elem, bool selected) // nil cases do not compete if(c == nil) return; - + selectsend(sel, c, runtime·getcallerpc(&sel), elem, (byte*)&selected - (byte*)&sel); } @@ -628,7 +679,7 @@ selectsend(Select *sel, Hchan *c, void *pc, void *elem, int32 so) { int32 i; Scase *cas; - + i = sel->ncase; if(i >= sel->tcase) runtime·throw("selectsend: too many cases"); @@ -774,9 +825,7 @@ selunlock(Select *sel) void runtime·block(void) { - g->status = Gwaiting; // forever - g->waitreason = "select (no cases)"; - runtime·gosched(); + runtime·park(nil, nil, "select (no cases)"); // forever } static void* selectgo(Select**); @@ -796,7 +845,7 @@ static void* selectgo(Select **selp) { Select *sel; - uint32 o, i, j; + uint32 o, i, j, k; Scase *cas, *dfl; Hchan *c; SudoG *sg; @@ -830,12 +879,42 @@ selectgo(Select **selp) } // sort the cases by Hchan address to get the locking order. + // simple heap sort, to guarantee n log n time and constant stack footprint. for(i=0; i<sel->ncase; i++) { - c = sel->scase[i].chan; - for(j=i; j>0 && sel->lockorder[j-1] >= c; j--) - sel->lockorder[j] = sel->lockorder[j-1]; + j = i; + c = sel->scase[j].chan; + while(j > 0 && sel->lockorder[k=(j-1)/2] < c) { + sel->lockorder[j] = sel->lockorder[k]; + j = k; + } + sel->lockorder[j] = c; + } + for(i=sel->ncase; i-->0; ) { + c = sel->lockorder[i]; + sel->lockorder[i] = sel->lockorder[0]; + j = 0; + for(;;) { + k = j*2+1; + if(k >= i) + break; + if(k+1 < i && sel->lockorder[k] < sel->lockorder[k+1]) + k++; + if(c < sel->lockorder[k]) { + sel->lockorder[j] = sel->lockorder[k]; + j = k; + continue; + } + break; + } sel->lockorder[j] = c; } + /* + for(i=0; i+1<sel->ncase; i++) + if(sel->lockorder[i] > sel->lockorder[i+1]) { + runtime·printf("i=%d %p %p\n", i, sel->lockorder[i], sel->lockorder[i+1]); + runtime·throw("select: broken sort"); + } + */ sellock(sel); loop: @@ -899,7 +978,7 @@ loop: case CaseRecv: enqueue(&c->recvq, sg); break; - + case CaseSend: enqueue(&c->sendq, sg); break; @@ -907,10 +986,7 @@ loop: } g->param = nil; - g->status = Gwaiting; - g->waitreason = "select"; - selunlock(sel); - runtime·gosched(); + runtime·park((void(*)(Lock*))selunlock, (Lock*)sel, "select"); sellock(sel); sg = g->param; @@ -951,6 +1027,8 @@ loop: asyncrecv: // can receive from buffer + if(raceenabled) + runtime·raceacquire(chanbuf(c, c->recvx)); if(cas->receivedp != nil) *cas->receivedp = true; if(cas->sg.elem != nil) @@ -971,6 +1049,8 @@ asyncrecv: asyncsend: // can send to buffer + if(raceenabled) + runtime·racerelease(chanbuf(c, c->sendx)); c->elemalg->copy(c->elemsize, chanbuf(c, c->sendx), cas->sg.elem); if(++c->sendx == c->dataqsiz) c->sendx = 0; @@ -987,6 +1067,8 @@ asyncsend: syncrecv: // can receive from sleeping sender (sg) + if(raceenabled) + racesync(c, sg); selunlock(sel); if(debug) runtime·printf("syncrecv: sel=%p c=%p o=%d\n", sel, c, o); @@ -1006,10 +1088,14 @@ rclose: *cas->receivedp = false; if(cas->sg.elem != nil) c->elemalg->copy(c->elemsize, cas->sg.elem, nil); + if(raceenabled) + runtime·raceacquire(c); goto retc; syncsend: // can send to sleeping receiver (sg) + if(raceenabled) + racesync(c, sg); selunlock(sel); if(debug) runtime·printf("syncsend: sel=%p c=%p o=%d\n", sel, c, o); @@ -1020,11 +1106,17 @@ syncsend: runtime·ready(gp); retc: - // return to pc corresponding to chosen case + // 
return pc corresponding to chosen case. + // Set boolean passed during select creation + // (at offset selp + cas->so) to true. + // If cas->so == 0, this is a reflect-driven select and we + // don't need to update the boolean. pc = cas->pc; - as = (byte*)selp + cas->so; + if(cas->so > 0) { + as = (byte*)selp + cas->so; + *as = true; + } runtime·free(sel); - *as = true; return pc; sclose: @@ -1034,7 +1126,89 @@ sclose: return nil; // not reached } +// This struct must match ../reflect/value.go:/runtimeSelect. +typedef struct runtimeSelect runtimeSelect; +struct runtimeSelect +{ + uintptr dir; + ChanType *typ; + Hchan *ch; + uintptr val; +}; + +// This enum must match ../reflect/value.go:/SelectDir. +enum SelectDir { + SelectSend = 1, + SelectRecv, + SelectDefault, +}; + +// func rselect(cases []runtimeSelect) (chosen int, word uintptr, recvOK bool) +void +reflect·rselect(Slice cases, intgo chosen, uintptr word, bool recvOK) +{ + int32 i; + Select *sel; + runtimeSelect* rcase, *rc; + void *elem; + void *recvptr; + uintptr maxsize; + + chosen = -1; + word = 0; + recvOK = false; + + maxsize = 0; + rcase = (runtimeSelect*)cases.array; + for(i=0; i<cases.len; i++) { + rc = &rcase[i]; + if(rc->dir == SelectRecv && rc->ch != nil && maxsize < rc->typ->elem->size) + maxsize = rc->typ->elem->size; + } + + recvptr = nil; + if(maxsize > sizeof(void*)) + recvptr = runtime·mal(maxsize); + + newselect(cases.len, &sel); + for(i=0; i<cases.len; i++) { + rc = &rcase[i]; + switch(rc->dir) { + case SelectDefault: + selectdefault(sel, (void*)i, 0); + break; + case SelectSend: + if(rc->ch == nil) + break; + if(rc->typ->elem->size > sizeof(void*)) + elem = (void*)rc->val; + else + elem = (void*)&rc->val; + selectsend(sel, rc->ch, (void*)i, elem, 0); + break; + case SelectRecv: + if(rc->ch == nil) + break; + if(rc->typ->elem->size > sizeof(void*)) + elem = recvptr; + else + elem = &word; + selectrecv(sel, rc->ch, (void*)i, elem, &recvOK, 0); + break; + } + } + + chosen = (intgo)(uintptr)selectgo(&sel); + if(rcase[chosen].dir == SelectRecv && rcase[chosen].typ->elem->size > sizeof(void*)) + word = (uintptr)recvptr; + + FLUSH(&chosen); + FLUSH(&word); + FLUSH(&recvOK); +} + // closechan(sel *byte); +#pragma textflag 7 void runtime·closechan(Hchan *c) { @@ -1053,6 +1227,11 @@ runtime·closechan(Hchan *c) runtime·panicstring("close of closed channel"); } + if(raceenabled) { + runtime·racewritepc(c, runtime·getcallerpc(&c), runtime·closechan); + runtime·racerelease(c); + } + c->closed = true; // release all readers @@ -1087,9 +1266,9 @@ reflect·chanclose(Hchan *c) } // For reflect -// func chanlen(c chan) (len int32) +// func chanlen(c chan) (len int) void -reflect·chanlen(Hchan *c, int32 len) +reflect·chanlen(Hchan *c, intgo len) { if(c == nil) len = 0; @@ -1099,9 +1278,9 @@ reflect·chanlen(Hchan *c, int32 len) } // For reflect -// func chancap(c chan) (cap int32) +// func chancap(c chan) int void -reflect·chancap(Hchan *c, int32 cap) +reflect·chancap(Hchan *c, intgo cap) { if(c == nil) cap = 0; @@ -1160,3 +1339,12 @@ enqueue(WaitQ *q, SudoG *sgp) q->last->link = sgp; q->last = sgp; } + +static void +racesync(Hchan *c, SudoG *sg) +{ + runtime·racerelease(chanbuf(c, 0)); + runtime·raceacquireg(sg->g, chanbuf(c, 0)); + runtime·racereleaseg(sg->g, chanbuf(c, 0)); + runtime·raceacquire(chanbuf(c, 0)); +} diff --git a/src/pkg/runtime/closure_386.c b/src/pkg/runtime/closure_386.c deleted file mode 100644 index b4d867711..000000000 --- a/src/pkg/runtime/closure_386.c +++ /dev/null @@ -1,105 +0,0 @@ -// Copyright 2009 The Go 
Authors. All rights reserved. -// Use of this source code is governed by a BSD-style -// license that can be found in the LICENSE file. - -#include "runtime.h" - -#pragma textflag 7 -// func closure(siz int32, -// fn func(arg0, arg1, arg2 *ptr, callerpc uintptr, xxx) yyy, -// arg0, arg1, arg2 *ptr) (func(xxx) yyy) -void -runtime·closure(int32 siz, byte *fn, byte *arg0) -{ - byte *p, *q, **ret; - int32 i, n; - int32 pcrel; - - if(siz < 0 || siz%4 != 0) - runtime·throw("bad closure size"); - - ret = (byte**)((byte*)&arg0 + siz); - - if(siz > 100) { - // TODO(rsc): implement stack growth preamble? - runtime·throw("closure too big"); - } - - // compute size of new fn. - // must match code laid out below. - n = 6+5+2+1; // SUBL MOVL MOVL CLD - if(siz <= 4*4) - n += 1*siz/4; // MOVSL MOVSL... - else - n += 6+2; // MOVL REP MOVSL - n += 5; // CALL - n += 6+1; // ADDL RET - - // store args aligned after code, so gc can find them. - n += siz; - if(n%4) - n += 4 - n%4; - - p = runtime·mal(n); - *ret = p; - q = p + n - siz; - - if(siz > 0) { - runtime·memmove(q, (byte*)&arg0, siz); - - // SUBL $siz, SP - *p++ = 0x81; - *p++ = 0xec; - *(uint32*)p = siz; - p += 4; - - // MOVL $q, SI - *p++ = 0xbe; - *(byte**)p = q; - p += 4; - - // MOVL SP, DI - *p++ = 0x89; - *p++ = 0xe7; - - // CLD - *p++ = 0xfc; - - if(siz <= 4*4) { - for(i=0; i<siz; i+=4) { - // MOVSL - *p++ = 0xa5; - } - } else { - // MOVL $(siz/4), CX [32-bit immediate siz/4] - *p++ = 0xc7; - *p++ = 0xc1; - *(uint32*)p = siz/4; - p += 4; - - // REP; MOVSL - *p++ = 0xf3; - *p++ = 0xa5; - } - } - - // call fn - pcrel = fn - (p+5); - // direct call with pc-relative offset - // CALL fn - *p++ = 0xe8; - *(int32*)p = pcrel; - p += 4; - - // ADDL $siz, SP - *p++ = 0x81; - *p++ = 0xc4; - *(uint32*)p = siz; - p += 4; - - // RET - *p++ = 0xc3; - - if(p > q) - runtime·throw("bad math in sys.closure"); -} diff --git a/src/pkg/runtime/closure_amd64.c b/src/pkg/runtime/closure_amd64.c deleted file mode 100644 index 481b4a888..000000000 --- a/src/pkg/runtime/closure_amd64.c +++ /dev/null @@ -1,123 +0,0 @@ -// Copyright 2009 The Go Authors. All rights reserved. -// Use of this source code is governed by a BSD-style -// license that can be found in the LICENSE file. - -#include "runtime.h" - -#pragma textflag 7 -// func closure(siz int32, -// fn func(arg0, arg1, arg2 *ptr, callerpc uintptr, xxx) yyy, -// arg0, arg1, arg2 *ptr) (func(xxx) yyy) -void -runtime·closure(int32 siz, byte *fn, byte *arg0) -{ - byte *p, *q, **ret; - int32 i, n; - int64 pcrel; - - if(siz < 0 || siz%8 != 0) - runtime·throw("bad closure size"); - - ret = (byte**)((byte*)&arg0 + siz); - - if(siz > 100) { - // TODO(rsc): implement stack growth preamble? - runtime·throw("closure too big"); - } - - // compute size of new fn. - // must match code laid out below. - n = 7+10+3; // SUBQ MOVQ MOVQ - if(siz <= 4*8) - n += 2*siz/8; // MOVSQ MOVSQ... - else - n += 7+3; // MOVQ REP MOVSQ - n += 12; // CALL worst case; sometimes only 5 - n += 7+1; // ADDQ RET - - // store args aligned after code, so gc can find them. 
- n += siz; - if(n%8) - n += 8 - n%8; - - p = runtime·mal(n); - *ret = p; - q = p + n - siz; - - if(siz > 0) { - runtime·memmove(q, (byte*)&arg0, siz); - - // SUBQ $siz, SP - *p++ = 0x48; - *p++ = 0x81; - *p++ = 0xec; - *(uint32*)p = siz; - p += 4; - - // MOVQ $q, SI - *p++ = 0x48; - *p++ = 0xbe; - *(byte**)p = q; - p += 8; - - // MOVQ SP, DI - *p++ = 0x48; - *p++ = 0x89; - *p++ = 0xe7; - - if(siz <= 4*8) { - for(i=0; i<siz; i+=8) { - // MOVSQ - *p++ = 0x48; - *p++ = 0xa5; - } - } else { - // MOVQ $(siz/8), CX [32-bit immediate siz/8] - *p++ = 0x48; - *p++ = 0xc7; - *p++ = 0xc1; - *(uint32*)p = siz/8; - p += 4; - - // REP; MOVSQ - *p++ = 0xf3; - *p++ = 0x48; - *p++ = 0xa5; - } - } - - // call fn - pcrel = fn - (p+5); - if((int32)pcrel == pcrel) { - // can use direct call with pc-relative offset - // CALL fn - *p++ = 0xe8; - *(int32*)p = pcrel; - p += 4; - } else { - // MOVQ $fn, CX [64-bit immediate fn] - *p++ = 0x48; - *p++ = 0xb9; - *(byte**)p = fn; - p += 8; - - // CALL *CX - *p++ = 0xff; - *p++ = 0xd1; - } - - // ADDQ $siz, SP - *p++ = 0x48; - *p++ = 0x81; - *p++ = 0xc4; - *(uint32*)p = siz; - p += 4; - - // RET - *p++ = 0xc3; - - if(p > q) - runtime·throw("bad math in sys.closure"); -} - - diff --git a/src/pkg/runtime/closure_arm.c b/src/pkg/runtime/closure_arm.c deleted file mode 100644 index 119e91b61..000000000 --- a/src/pkg/runtime/closure_arm.c +++ /dev/null @@ -1,129 +0,0 @@ -// Copyright 2009 The Go Authors. All rights reserved. -// Use of this source code is governed by a BSD-style -// license that can be found in the LICENSE file. - -#include "runtime.h" - -/* - There are two bits of magic: - - The signature of the compiler generated function uses two stack frames - as arguments (callerpc separates these frames) - - size determines how many arguments runtime.closure actually has - starting at arg0. - - Example closure with 3 captured variables: - func closure(siz int32, - fn func(arg0, arg1, arg2 *ptr, callerpc uintptr, xxx) yyy, - arg0, arg1, arg2 *ptr) (func(xxx) yyy) - - Code generated: - src R0 - dst R1 - end R3 - tmp R4 - frame = siz+4 - -//skip loop for 0 size closures - MOVW.W R14,-frame(R13) - - MOVW $vars(PC), R0 - MOVW $4(SP), R1 - MOVW $siz(R0), R3 -loop: MOVW.P 4(R0), R4 - MOVW.P R4, 4(R1) - CMP R0, R3 - BNE loop - - MOVW 8(PC), R0 - BL (R0) // 2 words - MOVW.P frame(R13),R15 -fptr: WORD *fn -vars: WORD arg0 - WORD arg1 - WORD arg2 -*/ - -extern void runtime·cacheflush(byte* start, byte* end); - -#pragma textflag 7 -void -runtime·closure(int32 siz, byte *fn, byte *arg0) -{ - byte *p, *q, **ret; - uint32 *pc; - int32 n; - - if(siz < 0 || siz%4 != 0) - runtime·throw("bad closure size"); - - ret = (byte**)((byte*)&arg0 + siz); - - if(siz > 100) { - // TODO(kaib): implement stack growth preamble? - runtime·throw("closure too big"); - } - - // size of new fn. - // must match code laid out below. - if (siz > 0) - n = 6 * 4 + 7 * 4; - else - n = 6 * 4; - - // store args aligned after code, so gc can find them. 
- n += siz; - - p = runtime·mal(n); - *ret = p; - q = p + n - siz; - - pc = (uint32*)p; - - // MOVW.W R14,-frame(R13) - *pc++ = 0xe52de000 | (siz + 4); - - if(siz > 0) { - runtime·memmove(q, (byte*)&arg0, siz); - - // MOVW $vars(PC), R0 - *pc = 0xe28f0000 | (int32)(q - (byte*)pc - 8); - pc++; - - // MOVW $4(SP), R1 - *pc++ = 0xe28d1004; - - // MOVW $siz(R0), R3 - *pc++ = 0xe2803000 | siz; - - // MOVW.P 4(R0), R4 - *pc++ = 0xe4904004; - // MOVW.P R4, 4(R1) - *pc++ = 0xe4814004; - // CMP R0, R3 - *pc++ = 0xe1530000; - // BNE loop - *pc++ = 0x1afffffb; - } - - // MOVW fptr(PC), R0 - *pc = 0xe59f0008 | (int32)((q - 4) -(byte*) pc - 8); - pc++; - - // BL (R0) - *pc++ = 0xe28fe000; - *pc++ = 0xe280f000; - - // MOVW.P frame(R13),R15 - *pc++ = 0xe49df000 | (siz + 4); - - // WORD *fn - *pc++ = (uint32)fn; - - p = (byte*)pc; - - if(p > q) - runtime·throw("bad math in sys.closure"); - - runtime·cacheflush(*ret, q+siz); -} - diff --git a/src/pkg/runtime/complex.c b/src/pkg/runtime/complex.c index eeb943940..395e70fe3 100644 --- a/src/pkg/runtime/complex.c +++ b/src/pkg/runtime/complex.c @@ -13,28 +13,30 @@ runtime·complex128div(Complex128 n, Complex128 d, Complex128 q) float64 a, b, ratio, denom; // Special cases as in C99. - ninf = runtime·isInf(n.real, 0) || runtime·isInf(n.imag, 0); - dinf = runtime·isInf(d.real, 0) || runtime·isInf(d.imag, 0); + ninf = n.real == runtime·posinf || n.real == runtime·neginf || + n.imag == runtime·posinf || n.imag == runtime·neginf; + dinf = d.real == runtime·posinf || d.real == runtime·neginf || + d.imag == runtime·posinf || d.imag == runtime·neginf; - nnan = !ninf && (runtime·isNaN(n.real) || runtime·isNaN(n.imag)); - dnan = !dinf && (runtime·isNaN(d.real) || runtime·isNaN(d.imag)); + nnan = !ninf && (ISNAN(n.real) || ISNAN(n.imag)); + dnan = !dinf && (ISNAN(d.real) || ISNAN(d.imag)); if(nnan || dnan) { - q.real = runtime·NaN(); - q.imag = runtime·NaN(); - } else if(ninf && !dinf && !dnan) { - q.real = runtime·Inf(0); - q.imag = runtime·Inf(0); - } else if(!ninf && !nnan && dinf) { + q.real = runtime·nan; + q.imag = runtime·nan; + } else if(ninf && !dinf) { + q.real = runtime·posinf; + q.imag = runtime·posinf; + } else if(!ninf && dinf) { q.real = 0; q.imag = 0; } else if(d.real == 0 && d.imag == 0) { if(n.real == 0 && n.imag == 0) { - q.real = runtime·NaN(); - q.imag = runtime·NaN(); + q.real = runtime·nan; + q.imag = runtime·nan; } else { - q.real = runtime·Inf(0); - q.imag = runtime·Inf(0); + q.real = runtime·posinf; + q.imag = runtime·posinf; } } else { // Standard complex arithmetic, factored to avoid unnecessary overflow. diff --git a/src/pkg/runtime/complex_test.go b/src/pkg/runtime/complex_test.go new file mode 100644 index 000000000..f41e6a357 --- /dev/null +++ b/src/pkg/runtime/complex_test.go @@ -0,0 +1,67 @@ +// Copyright 2012 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. 
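The complex128div rewrite above replaces isInf/isNaN function calls with direct comparisons against precomputed ±Inf/NaN values and simplifies the guards (the NaN branch now runs first, so the infinity branches no longer need to re-check for NaN). A C99-style sketch of the same guard order, using <complex.h> for brevity rather than the runtime's Complex128 (illustrative only):

	#include <math.h>
	#include <complex.h>

	/* Sketch: same special-case order as runtime·complex128div. */
	double complex
	cdiv_sketch(double complex n, double complex d)
	{
		int ninf = isinf(creal(n)) || isinf(cimag(n));
		int dinf = isinf(creal(d)) || isinf(cimag(d));
		int nnan = !ninf && (isnan(creal(n)) || isnan(cimag(n)));
		int dnan = !dinf && (isnan(creal(d)) || isnan(cimag(d)));

		if(nnan || dnan)
			return NAN + NAN*I;            /* any NaN operand -> NaN */
		if(ninf && !dinf)
			return INFINITY + INFINITY*I;  /* Inf / finite -> Inf    */
		if(!ninf && dinf)
			return 0;                      /* finite / Inf -> 0      */
		if(creal(d) == 0 && cimag(d) == 0) /* x / 0                  */
			return (creal(n) == 0 && cimag(n) == 0)
				? NAN + NAN*I : INFINITY + INFINITY*I;
		/* Otherwise: the runtime uses the factored formula to avoid
		 * overflow; here we simply defer to the compiler's division. */
		return n / d;
	}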
+ +package runtime_test + +import ( + "math/cmplx" + "testing" +) + +var result complex128 + +func BenchmarkComplex128DivNormal(b *testing.B) { + d := 15 + 2i + n := 32 + 3i + res := 0i + for i := 0; i < b.N; i++ { + n += 0.1i + res += n / d + } + result = res +} + +func BenchmarkComplex128DivNisNaN(b *testing.B) { + d := cmplx.NaN() + n := 32 + 3i + res := 0i + for i := 0; i < b.N; i++ { + n += 0.1i + res += n / d + } + result = res +} + +func BenchmarkComplex128DivDisNaN(b *testing.B) { + d := 15 + 2i + n := cmplx.NaN() + res := 0i + for i := 0; i < b.N; i++ { + d += 0.1i + res += n / d + } + result = res +} + +func BenchmarkComplex128DivNisInf(b *testing.B) { + d := 15 + 2i + n := cmplx.Inf() + res := 0i + for i := 0; i < b.N; i++ { + d += 0.1i + res += n / d + } + result = res +} + +func BenchmarkComplex128DivDisInf(b *testing.B) { + d := cmplx.Inf() + n := 32 + 3i + res := 0i + for i := 0; i < b.N; i++ { + n += 0.1i + res += n / d + } + result = res +} diff --git a/src/pkg/runtime/cpuprof.c b/src/pkg/runtime/cpuprof.c index 05fa0cf61..9a0606a22 100644 --- a/src/pkg/runtime/cpuprof.c +++ b/src/pkg/runtime/cpuprof.c @@ -99,6 +99,7 @@ struct Profile { uint32 wtoggle; bool wholding; // holding & need to release a log half bool flushing; // flushing hash table - profile is over + bool eod_sent; // special end-of-data record sent; => flushing }; static Lock lk; @@ -109,16 +110,20 @@ static void add(Profile*, uintptr*, int32); static bool evict(Profile*, Entry*); static bool flushlog(Profile*); +static uintptr eod[3] = {0, 1, 0}; + // LostProfileData is a no-op function used in profiles // to mark the number of profiling stack traces that were // discarded due to slow data writers. -static void LostProfileData(void) { +static void +LostProfileData(void) +{ } // SetCPUProfileRate sets the CPU profiling rate. // The user documentation is in debug.go. void -runtime·SetCPUProfileRate(int32 hz) +runtime·SetCPUProfileRate(intgo hz) { uintptr *p; uintptr n; @@ -163,6 +168,7 @@ runtime·SetCPUProfileRate(int32 hz) prof->wholding = false; prof->wtoggle = 0; prof->flushing = false; + prof->eod_sent = false; runtime·noteclear(&prof->wait); runtime·setcpuprofilerate(tick, hz); @@ -356,7 +362,7 @@ getprofile(Profile *p) return ret; // Wait for new log. - runtime·entersyscall(); + runtime·entersyscallblock(); runtime·notesleep(&p->wait); runtime·exitsyscall(); runtime·noteclear(&p->wait); @@ -409,6 +415,16 @@ breakflush: } // Made it through the table without finding anything to log. + if(!p->eod_sent) { + // We may not have space to append this to the partial log buf, + // so we always return a new slice for the end-of-data marker. + p->eod_sent = true; + ret.array = (byte*)eod; + ret.len = sizeof eod; + ret.cap = ret.len; + return ret; + } + // Finally done. Clean up and return nil. p->flushing = false; if(!runtime·cas(&p->handoff, p->handoff, 0)) diff --git a/src/pkg/runtime/crash_cgo_test.go b/src/pkg/runtime/crash_cgo_test.go new file mode 100644 index 000000000..8ccea8f37 --- /dev/null +++ b/src/pkg/runtime/crash_cgo_test.go @@ -0,0 +1,88 @@ +// Copyright 2012 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. 
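The eod record added to cpuprof.c above is a sentinel in the legacy binary profile stream, in which each record consists of a hit count, a frame count, and that many pcs; {0, 1, 0} is therefore a zero-count record containing a single zero pc, which profile readers take as end-of-data. A sketch of that record shape (an assumption inferred from the sentinel's form; names and the write helper are illustrative):

	#include <stdint.h>
	#include <stdio.h>

	/* Sketch: one record of the assumed legacy profile format. */
	static void
	emit_record(FILE *f, uintptr_t count, uintptr_t *pcs, uintptr_t n)
	{
		uintptr_t i;
		fwrite(&count, sizeof count, 1, f);   /* hit count   */
		fwrite(&n, sizeof n, 1, f);           /* stack depth */
		for(i = 0; i < n; i++)
			fwrite(&pcs[i], sizeof *pcs, 1, f);
	}
	/* End of data would then be: emit_record(f, 0, (uintptr_t[]){0}, 1); */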
+ +// +build cgo + +package runtime_test + +import ( + "testing" +) + +func TestCgoCrashHandler(t *testing.T) { + testCrashHandler(t, true) +} + +func TestCgoSignalDeadlock(t *testing.T) { + got := executeTest(t, cgoSignalDeadlockSource, nil) + want := "OK\n" + if got != want { + t.Fatalf("expected %q, but got %q", want, got) + } +} + +const cgoSignalDeadlockSource = ` +package main + +import "C" + +import ( + "fmt" + "runtime" + "time" +) + +func main() { + runtime.GOMAXPROCS(100) + ping := make(chan bool) + go func() { + for i := 0; ; i++ { + runtime.Gosched() + select { + case done := <-ping: + if done { + ping <- true + return + } + ping <- true + default: + } + func() { + defer func() { + recover() + }() + var s *string + *s = "" + }() + } + }() + time.Sleep(time.Millisecond) + for i := 0; i < 64; i++ { + go func() { + runtime.LockOSThread() + select {} + }() + go func() { + runtime.LockOSThread() + select {} + }() + time.Sleep(time.Millisecond) + ping <- false + select { + case <-ping: + case <-time.After(time.Second): + fmt.Printf("HANG\n") + return + } + } + ping <- true + select { + case <-ping: + case <-time.After(time.Second): + fmt.Printf("HANG\n") + return + } + fmt.Printf("OK\n") +} +` diff --git a/src/pkg/runtime/crash_test.go b/src/pkg/runtime/crash_test.go new file mode 100644 index 000000000..5f84cb5a2 --- /dev/null +++ b/src/pkg/runtime/crash_test.go @@ -0,0 +1,177 @@ +// Copyright 2012 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package runtime_test + +import ( + "io/ioutil" + "os" + "os/exec" + "path/filepath" + "strings" + "testing" + "text/template" +) + +func executeTest(t *testing.T, templ string, data interface{}) string { + checkStaleRuntime(t) + + st := template.Must(template.New("crashSource").Parse(templ)) + + dir, err := ioutil.TempDir("", "go-build") + if err != nil { + t.Fatalf("failed to create temp directory: %v", err) + } + defer os.RemoveAll(dir) + + src := filepath.Join(dir, "main.go") + f, err := os.Create(src) + if err != nil { + t.Fatalf("failed to create %v: %v", src, err) + } + err = st.Execute(f, data) + if err != nil { + f.Close() + t.Fatalf("failed to execute template: %v", err) + } + f.Close() + + got, _ := exec.Command("go", "run", src).CombinedOutput() + return string(got) +} + +func checkStaleRuntime(t *testing.T) { + // 'go run' uses the installed copy of runtime.a, which may be out of date. + out, err := exec.Command("go", "list", "-f", "{{.Stale}}", "runtime").CombinedOutput() + if err != nil { + t.Fatalf("failed to execute 'go list': %v\n%v", err, string(out)) + } + if string(out) != "false\n" { + t.Fatalf("Stale runtime.a. 
Run 'go install runtime'.") + } +} + +func testCrashHandler(t *testing.T, cgo bool) { + type crashTest struct { + Cgo bool + } + got := executeTest(t, crashSource, &crashTest{Cgo: cgo}) + want := "main: recovered done\nnew-thread: recovered done\nsecond-new-thread: recovered done\nmain-again: recovered done\n" + if got != want { + t.Fatalf("expected %q, but got %q", want, got) + } +} + +func TestCrashHandler(t *testing.T) { + testCrashHandler(t, false) +} + +func testDeadlock(t *testing.T, source string) { + got := executeTest(t, source, nil) + want := "fatal error: all goroutines are asleep - deadlock!\n" + if !strings.HasPrefix(got, want) { + t.Fatalf("expected %q, but got %q", want, got) + } +} + +func TestSimpleDeadlock(t *testing.T) { + testDeadlock(t, simpleDeadlockSource) +} + +func TestInitDeadlock(t *testing.T) { + testDeadlock(t, initDeadlockSource) +} + +func TestLockedDeadlock(t *testing.T) { + testDeadlock(t, lockedDeadlockSource) +} + +func TestLockedDeadlock2(t *testing.T) { + testDeadlock(t, lockedDeadlockSource2) +} + +const crashSource = ` +package main + +import ( + "fmt" + "runtime" +) + +{{if .Cgo}} +import "C" +{{end}} + +func test(name string) { + defer func() { + if x := recover(); x != nil { + fmt.Printf(" recovered") + } + fmt.Printf(" done\n") + }() + fmt.Printf("%s:", name) + var s *string + _ = *s + fmt.Print("SHOULD NOT BE HERE") +} + +func testInNewThread(name string) { + c := make(chan bool) + go func() { + runtime.LockOSThread() + test(name) + c <- true + }() + <-c +} + +func main() { + runtime.LockOSThread() + test("main") + testInNewThread("new-thread") + testInNewThread("second-new-thread") + test("main-again") +} +` + +const simpleDeadlockSource = ` +package main +func main() { + select {} +} +` + +const initDeadlockSource = ` +package main +func init() { + select {} +} +func main() { +} +` + +const lockedDeadlockSource = ` +package main +import "runtime" +func main() { + runtime.LockOSThread() + select {} +} +` + +const lockedDeadlockSource2 = ` +package main +import ( + "runtime" + "time" +) +func main() { + go func() { + runtime.LockOSThread() + select {} + }() + time.Sleep(time.Millisecond) + select {} +} +` diff --git a/src/pkg/runtime/debug.go b/src/pkg/runtime/debug.go index b802fc63f..d82afb08e 100644 --- a/src/pkg/runtime/debug.go +++ b/src/pkg/runtime/debug.go @@ -4,7 +4,7 @@ package runtime -// Breakpoint() executes a breakpoint trap. +// Breakpoint executes a breakpoint trap. func Breakpoint() // LockOSThread wires the calling goroutine to its current operating system thread. @@ -125,6 +125,7 @@ func GoroutineProfile(p []StackRecord) (n int, ok bool) // blocking until data is available. If profiling is turned off and all the profile // data accumulated while it was on has been returned, CPUProfile returns nil. // The caller must save the returned data before calling CPUProfile again. +// // Most clients should use the runtime/pprof package or // the testing package's -test.cpuprofile flag instead of calling // CPUProfile directly. @@ -133,11 +134,37 @@ func CPUProfile() []byte // SetCPUProfileRate sets the CPU profiling rate to hz samples per second. // If hz <= 0, SetCPUProfileRate turns off profiling. // If the profiler is on, the rate cannot be changed without first turning it off. +// // Most clients should use the runtime/pprof package or // the testing package's -test.cpuprofile flag instead of calling // SetCPUProfileRate directly. 
func SetCPUProfileRate(hz int) +// SetBlockProfileRate controls the fraction of goroutine blocking events +// that are reported in the blocking profile. The profiler aims to sample +// an average of one blocking event per rate nanoseconds spent blocked. +// +// To include every blocking event in the profile, pass rate = 1. +// To turn off profiling entirely, pass rate <= 0. +func SetBlockProfileRate(rate int) + +// BlockProfileRecord describes blocking events originated +// at a particular call sequence (stack trace). +type BlockProfileRecord struct { + Count int64 + Cycles int64 + StackRecord +} + +// BlockProfile returns n, the number of records in the current blocking profile. +// If len(p) >= n, BlockProfile copies the profile into p and returns n, true. +// If len(p) < n, BlockProfile does not change p and returns n, false. +// +// Most clients should use the runtime/pprof package or +// the testing package's -test.blockprofile flag instead +// of calling BlockProfile directly. +func BlockProfile(p []BlockProfileRecord) (n int, ok bool) + // Stack formats a stack trace of the calling goroutine into buf // and returns the number of bytes written to buf. // If all is true, Stack formats stack traces of all other goroutines diff --git a/src/pkg/runtime/debug/debug.c b/src/pkg/runtime/debug/debug.c new file mode 100644 index 000000000..a7292c477 --- /dev/null +++ b/src/pkg/runtime/debug/debug.c @@ -0,0 +1,9 @@ +// Copyright 2013 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// Nothing to see here. +// This file exists so that the go command knows that parts of the +// package are implemented in C, so that it does not instruct the +// Go compiler to complain about extern declarations. +// The actual implementations are in package runtime. diff --git a/src/pkg/runtime/debug/garbage.go b/src/pkg/runtime/debug/garbage.go new file mode 100644 index 000000000..8f3026426 --- /dev/null +++ b/src/pkg/runtime/debug/garbage.go @@ -0,0 +1,101 @@ +// Copyright 2013 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package debug + +import ( + "runtime" + "sort" + "time" +) + +// GCStats collect information about recent garbage collections. +type GCStats struct { + LastGC time.Time // time of last collection + NumGC int64 // number of garbage collections + PauseTotal time.Duration // total pause for all collections + Pause []time.Duration // pause history, most recent first + PauseQuantiles []time.Duration +} + +// Implemented in package runtime. +func readGCStats(*[]time.Duration) +func enableGC(bool) bool +func setGCPercent(int) int +func freeOSMemory() + +// ReadGCStats reads statistics about garbage collection into stats. +// The number of entries in the pause history is system-dependent; +// stats.Pause slice will be reused if large enough, reallocated otherwise. +// ReadGCStats may use the full capacity of the stats.Pause slice. +// If stats.PauseQuantiles is non-empty, ReadGCStats fills it with quantiles +// summarizing the distribution of pause time. For example, if +// len(stats.PauseQuantiles) is 5, it will be filled with the minimum, +// 25%, 50%, 75%, and maximum pause times. +func ReadGCStats(stats *GCStats) { + // Create a buffer with space for at least two copies of the + // pause history tracked by the runtime. 
One will be returned + // to the caller and the other will be used as a temporary buffer + // for computing quantiles. + const maxPause = len(((*runtime.MemStats)(nil)).PauseNs) + if cap(stats.Pause) < 2*maxPause { + stats.Pause = make([]time.Duration, 2*maxPause) + } + + // readGCStats fills in the pause history (up to maxPause entries) + // and then three more: Unix ns time of last GC, number of GC, + // and total pause time in nanoseconds. Here we depend on the + // fact that time.Duration's native unit is nanoseconds, so the + // pauses and the total pause time do not need any conversion. + readGCStats(&stats.Pause) + n := len(stats.Pause) - 3 + stats.LastGC = time.Unix(0, int64(stats.Pause[n])) + stats.NumGC = int64(stats.Pause[n+1]) + stats.PauseTotal = stats.Pause[n+2] + stats.Pause = stats.Pause[:n] + + if len(stats.PauseQuantiles) > 0 { + if n == 0 { + for i := range stats.PauseQuantiles { + stats.PauseQuantiles[i] = 0 + } + } else { + // There's room for a second copy of the data in stats.Pause. + // See the allocation at the top of the function. + sorted := stats.Pause[n : n+n] + copy(sorted, stats.Pause) + sort.Sort(byDuration(sorted)) + nq := len(stats.PauseQuantiles) - 1 + for i := 0; i < nq; i++ { + stats.PauseQuantiles[i] = sorted[len(sorted)*i/nq] + } + stats.PauseQuantiles[nq] = sorted[len(sorted)-1] + } + } +} + +type byDuration []time.Duration + +func (x byDuration) Len() int { return len(x) } +func (x byDuration) Swap(i, j int) { x[i], x[j] = x[j], x[i] } +func (x byDuration) Less(i, j int) bool { return x[i] < x[j] } + +// SetGCPercent sets the garbage collection target percentage: +// a collection is triggered when the ratio of freshly allocated data +// to live data remaining after the previous collection reaches this percentage. +// SetGCPercent returns the previous setting. +// The initial setting is the value of the GOGC environment variable +// at startup, or 100 if the variable is not set. +// A negative percentage disables garbage collection. +func SetGCPercent(percent int) int { + return setGCPercent(percent) +} + +// FreeOSMemory forces a garbage collection followed by an +// attempt to return as much memory to the operating system +// as possible. (Even if this is not called, the runtime gradually +// returns memory to the operating system in a background task.) +func FreeOSMemory() { + freeOSMemory() +} diff --git a/src/pkg/runtime/debug/garbage_test.go b/src/pkg/runtime/debug/garbage_test.go new file mode 100644 index 000000000..b93cfee56 --- /dev/null +++ b/src/pkg/runtime/debug/garbage_test.go @@ -0,0 +1,100 @@ +// Copyright 2013 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package debug + +import ( + "runtime" + "testing" + "time" +) + +func TestReadGCStats(t *testing.T) { + var stats GCStats + var mstats runtime.MemStats + var min, max time.Duration + + // First ReadGCStats will allocate, second should not, + // especially if we follow up with an explicit garbage collection. + stats.PauseQuantiles = make([]time.Duration, 10) + ReadGCStats(&stats) + runtime.GC() + + // Assume these will return same data: no GC during ReadGCStats. 
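
For illustration only, not part of this diff: the quantile support implemented in ReadGCStats above can be driven directly from client code. With five quantiles the slice is filled with the minimum, 25%, 50%, 75%, and maximum pauses, as the doc comment describes:

package main

import (
	"fmt"
	"runtime/debug"
	"time"
)

func main() {
	var stats debug.GCStats
	// Request five quantiles: min, 25%, 50%, 75%, max.
	stats.PauseQuantiles = make([]time.Duration, 5)
	debug.ReadGCStats(&stats)
	fmt.Println("collections:", stats.NumGC)
	fmt.Println("total pause:", stats.PauseTotal)
	fmt.Println("quantiles (min..max):", stats.PauseQuantiles)
}
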
+ ReadGCStats(&stats) + runtime.ReadMemStats(&mstats) + + if stats.NumGC != int64(mstats.NumGC) { + t.Errorf("stats.NumGC = %d, but mstats.NumGC = %d", stats.NumGC, mstats.NumGC) + } + if stats.PauseTotal != time.Duration(mstats.PauseTotalNs) { + t.Errorf("stats.PauseTotal = %d, but mstats.PauseTotalNs = %d", stats.PauseTotal, mstats.PauseTotalNs) + } + if stats.LastGC.UnixNano() != int64(mstats.LastGC) { + t.Errorf("stats.LastGC.UnixNano = %d, but mstats.LastGC = %d", stats.LastGC.UnixNano(), mstats.LastGC) + } + n := int(mstats.NumGC) + if n > len(mstats.PauseNs) { + n = len(mstats.PauseNs) + } + if len(stats.Pause) != n { + t.Errorf("len(stats.Pause) = %d, want %d", len(stats.Pause), n) + } else { + off := (int(mstats.NumGC) + len(mstats.PauseNs) - 1) % len(mstats.PauseNs) + for i := 0; i < n; i++ { + dt := stats.Pause[i] + if dt != time.Duration(mstats.PauseNs[off]) { + t.Errorf("stats.Pause[%d] = %d, want %d", i, dt, mstats.PauseNs[off]) + } + if max < dt { + max = dt + } + if min > dt || i == 0 { + min = dt + } + off = (off + len(mstats.PauseNs) - 1) % len(mstats.PauseNs) + } + } + + q := stats.PauseQuantiles + nq := len(q) + if q[0] != min || q[nq-1] != max { + t.Errorf("stats.PauseQuantiles = [%d, ..., %d], want [%d, ..., %d]", q[0], q[nq-1], min, max) + } + + for i := 0; i < nq-1; i++ { + if q[i] > q[i+1] { + t.Errorf("stats.PauseQuantiles[%d]=%d > stats.PauseQuantiles[%d]=%d", i, q[i], i+1, q[i+1]) + } + } +} + +var big = make([]byte, 1<<20) + +func TestFreeOSMemory(t *testing.T) { + var ms1, ms2 runtime.MemStats + + if big == nil { + t.Skip("test is not reliable when run multiple times") + } + big = nil + runtime.GC() + runtime.ReadMemStats(&ms1) + FreeOSMemory() + runtime.ReadMemStats(&ms2) + if ms1.HeapReleased >= ms2.HeapReleased { + t.Errorf("released before=%d; released after=%d; did not go up", ms1.HeapReleased, ms2.HeapReleased) + } +} + +func TestSetGCPercent(t *testing.T) { + // Test that the variable is being set and returned correctly. + // Assume the percentage itself is implemented fine during GC, + // which is harder to test. + old := SetGCPercent(123) + new := SetGCPercent(old) + if new != 123 { + t.Errorf("SetGCPercent(123); SetGCPercent(x) = %d, want 123", new) + } +} diff --git a/src/pkg/runtime/debug/stack.go b/src/pkg/runtime/debug/stack.go index a533a5c3b..2896b2141 100644 --- a/src/pkg/runtime/debug/stack.go +++ b/src/pkg/runtime/debug/stack.go @@ -29,6 +29,8 @@ func PrintStack() { // For each routine, it includes the source line information and PC value, // then attempts to discover, for Go functions, the calling function or // method and the text of the line containing the invocation. +// +// This function is deprecated. Use package runtime's Stack instead. func Stack() []byte { return stack() } diff --git a/src/pkg/runtime/debug/stack_test.go b/src/pkg/runtime/debug/stack_test.go index f33f5072b..bbd662618 100644 --- a/src/pkg/runtime/debug/stack_test.go +++ b/src/pkg/runtime/debug/stack_test.go @@ -36,7 +36,7 @@ func (t T) method() []byte { func TestStack(t *testing.T) { b := T(0).method() lines := strings.Split(string(b), "\n") - if len(lines) <= 6 { + if len(lines) < 6 { t.Fatal("too few lines") } n := 0 diff --git a/src/pkg/runtime/defs_freebsd.go b/src/pkg/runtime/defs_freebsd.go index 306e32197..084022715 100644 --- a/src/pkg/runtime/defs_freebsd.go +++ b/src/pkg/runtime/defs_freebsd.go @@ -7,8 +7,9 @@ /* Input to cgo. 
-GOARCH=amd64 cgo -cdefs defs.go >amd64/defs.h -GOARCH=386 cgo -cdefs defs.go >386/defs.h +GOARCH=amd64 go tool cgo -cdefs defs_freebsd.go >defs_freebsd_amd64.h +GOARCH=386 go tool cgo -cdefs defs_freebsd.go >defs_freebsd_386.h +GOARCH=arm go tool cgo -cdefs defs_freebsd.go >defs_freebsd_arm.h */ package runtime @@ -38,6 +39,8 @@ const ( MAP_PRIVATE = C.MAP_PRIVATE MAP_FIXED = C.MAP_FIXED + MADV_FREE = C.MADV_FREE + SA_SIGINFO = C.SA_SIGINFO SA_RESTART = C.SA_RESTART SA_ONSTACK = C.SA_ONSTACK @@ -104,7 +107,6 @@ type Rtprio C.struct_rtprio type ThrParam C.struct_thr_param type Sigaltstack C.struct_sigaltstack type Sigset C.struct___sigset -type Sigval C.union_sigval type StackT C.stack_t type Siginfo C.siginfo_t diff --git a/src/pkg/runtime/defs_freebsd_386.h b/src/pkg/runtime/defs_freebsd_386.h index 29fcb8b57..d00c852c6 100644 --- a/src/pkg/runtime/defs_freebsd_386.h +++ b/src/pkg/runtime/defs_freebsd_386.h @@ -1,193 +1,190 @@ -// godefs -f -m32 defs.c +// Created by cgo -cdefs - DO NOT EDIT +// cgo -cdefs defs_freebsd.go -// MACHINE GENERATED - DO NOT EDIT. -// Constants enum { - PROT_NONE = 0, - PROT_READ = 0x1, - PROT_WRITE = 0x2, - PROT_EXEC = 0x4, - MAP_ANON = 0x1000, - MAP_PRIVATE = 0x2, - MAP_FIXED = 0x10, - SA_SIGINFO = 0x40, - SA_RESTART = 0x2, - SA_ONSTACK = 0x1, - UMTX_OP_WAIT = 0x2, - UMTX_OP_WAKE = 0x3, - EINTR = 0x4, - SIGHUP = 0x1, - SIGINT = 0x2, - SIGQUIT = 0x3, - SIGILL = 0x4, - SIGTRAP = 0x5, - SIGABRT = 0x6, - SIGEMT = 0x7, - SIGFPE = 0x8, - SIGKILL = 0x9, - SIGBUS = 0xa, - SIGSEGV = 0xb, - SIGSYS = 0xc, - SIGPIPE = 0xd, - SIGALRM = 0xe, - SIGTERM = 0xf, - SIGURG = 0x10, - SIGSTOP = 0x11, - SIGTSTP = 0x12, - SIGCONT = 0x13, - SIGCHLD = 0x14, - SIGTTIN = 0x15, - SIGTTOU = 0x16, - SIGIO = 0x17, - SIGXCPU = 0x18, - SIGXFSZ = 0x19, - SIGVTALRM = 0x1a, - SIGPROF = 0x1b, - SIGWINCH = 0x1c, - SIGINFO = 0x1d, - SIGUSR1 = 0x1e, - SIGUSR2 = 0x1f, - FPE_INTDIV = 0x2, - FPE_INTOVF = 0x1, - FPE_FLTDIV = 0x3, - FPE_FLTOVF = 0x4, - FPE_FLTUND = 0x5, - FPE_FLTRES = 0x6, - FPE_FLTINV = 0x7, - FPE_FLTSUB = 0x8, - BUS_ADRALN = 0x1, - BUS_ADRERR = 0x2, - BUS_OBJERR = 0x3, - SEGV_MAPERR = 0x1, - SEGV_ACCERR = 0x2, - ITIMER_REAL = 0, - ITIMER_VIRTUAL = 0x1, - ITIMER_PROF = 0x2, + PROT_NONE = 0x0, + PROT_READ = 0x1, + PROT_WRITE = 0x2, + PROT_EXEC = 0x4, + + MAP_ANON = 0x1000, + MAP_PRIVATE = 0x2, + MAP_FIXED = 0x10, + + MADV_FREE = 0x5, + + SA_SIGINFO = 0x40, + SA_RESTART = 0x2, + SA_ONSTACK = 0x1, + + UMTX_OP_WAIT = 0x2, + UMTX_OP_WAKE = 0x3, + + EINTR = 0x4, + + SIGHUP = 0x1, + SIGINT = 0x2, + SIGQUIT = 0x3, + SIGILL = 0x4, + SIGTRAP = 0x5, + SIGABRT = 0x6, + SIGEMT = 0x7, + SIGFPE = 0x8, + SIGKILL = 0x9, + SIGBUS = 0xa, + SIGSEGV = 0xb, + SIGSYS = 0xc, + SIGPIPE = 0xd, + SIGALRM = 0xe, + SIGTERM = 0xf, + SIGURG = 0x10, + SIGSTOP = 0x11, + SIGTSTP = 0x12, + SIGCONT = 0x13, + SIGCHLD = 0x14, + SIGTTIN = 0x15, + SIGTTOU = 0x16, + SIGIO = 0x17, + SIGXCPU = 0x18, + SIGXFSZ = 0x19, + SIGVTALRM = 0x1a, + SIGPROF = 0x1b, + SIGWINCH = 0x1c, + SIGINFO = 0x1d, + SIGUSR1 = 0x1e, + SIGUSR2 = 0x1f, + + FPE_INTDIV = 0x2, + FPE_INTOVF = 0x1, + FPE_FLTDIV = 0x3, + FPE_FLTOVF = 0x4, + FPE_FLTUND = 0x5, + FPE_FLTRES = 0x6, + FPE_FLTINV = 0x7, + FPE_FLTSUB = 0x8, + + BUS_ADRALN = 0x1, + BUS_ADRERR = 0x2, + BUS_OBJERR = 0x3, + + SEGV_MAPERR = 0x1, + SEGV_ACCERR = 0x2, + + ITIMER_REAL = 0x0, + ITIMER_VIRTUAL = 0x1, + ITIMER_PROF = 0x2, }; -// Types +typedef struct Rtprio Rtprio; +typedef struct ThrParam ThrParam; +typedef struct Sigaltstack Sigaltstack; +typedef struct Sigset Sigset; +typedef struct StackT 
StackT; +typedef struct Siginfo Siginfo; +typedef struct Mcontext Mcontext; +typedef struct Ucontext Ucontext; +typedef struct Timespec Timespec; +typedef struct Timeval Timeval; +typedef struct Itimerval Itimerval; + #pragma pack on -typedef struct Rtprio Rtprio; struct Rtprio { - uint16 type; - uint16 prio; + uint16 type; + uint16 prio; }; - -typedef struct ThrParam ThrParam; struct ThrParam { - void *start_func; - void *arg; - int8 *stack_base; - uint32 stack_size; - int8 *tls_base; - uint32 tls_size; - int32 *child_tid; - int32 *parent_tid; - int32 flags; - Rtprio *rtp; - void* spare[3]; + void *start_func; + byte *arg; + int8 *stack_base; + uint32 stack_size; + int8 *tls_base; + uint32 tls_size; + int32 *child_tid; + int32 *parent_tid; + int32 flags; + Rtprio *rtp; + void *spare[3]; }; - -typedef struct Sigaltstack Sigaltstack; struct Sigaltstack { - int8 *ss_sp; - uint32 ss_size; - int32 ss_flags; + int8 *ss_sp; + uint32 ss_size; + int32 ss_flags; }; - -typedef struct Sigset Sigset; struct Sigset { - uint32 __bits[4]; -}; - -typedef union Sigval Sigval; -union Sigval { - int32 sival_int; - void *sival_ptr; - int32 sigval_int; - void *sigval_ptr; + uint32 __bits[4]; }; - -typedef struct StackT StackT; struct StackT { - int8 *ss_sp; - uint32 ss_size; - int32 ss_flags; + int8 *ss_sp; + uint32 ss_size; + int32 ss_flags; }; -typedef struct Siginfo Siginfo; struct Siginfo { - int32 si_signo; - int32 si_errno; - int32 si_code; - int32 si_pid; - uint32 si_uid; - int32 si_status; - void *si_addr; - Sigval si_value; - byte _reason[32]; + int32 si_signo; + int32 si_errno; + int32 si_code; + int32 si_pid; + uint32 si_uid; + int32 si_status; + byte *si_addr; + byte si_value[4]; + byte _reason[32]; }; -typedef struct Mcontext Mcontext; struct Mcontext { - int32 mc_onstack; - int32 mc_gs; - int32 mc_fs; - int32 mc_es; - int32 mc_ds; - int32 mc_edi; - int32 mc_esi; - int32 mc_ebp; - int32 mc_isp; - int32 mc_ebx; - int32 mc_edx; - int32 mc_ecx; - int32 mc_eax; - int32 mc_trapno; - int32 mc_err; - int32 mc_eip; - int32 mc_cs; - int32 mc_eflags; - int32 mc_esp; - int32 mc_ss; - int32 mc_len; - int32 mc_fpformat; - int32 mc_ownedfp; - int32 mc_spare1[1]; - int32 mc_fpstate[128]; - int32 mc_fsbase; - int32 mc_gsbase; - int32 mc_spare2[6]; + int32 mc_onstack; + int32 mc_gs; + int32 mc_fs; + int32 mc_es; + int32 mc_ds; + int32 mc_edi; + int32 mc_esi; + int32 mc_ebp; + int32 mc_isp; + int32 mc_ebx; + int32 mc_edx; + int32 mc_ecx; + int32 mc_eax; + int32 mc_trapno; + int32 mc_err; + int32 mc_eip; + int32 mc_cs; + int32 mc_eflags; + int32 mc_esp; + int32 mc_ss; + int32 mc_len; + int32 mc_fpformat; + int32 mc_ownedfp; + int32 mc_spare1[1]; + int32 mc_fpstate[128]; + int32 mc_fsbase; + int32 mc_gsbase; + int32 mc_spare2[6]; }; - -typedef struct Ucontext Ucontext; struct Ucontext { - Sigset uc_sigmask; - Mcontext uc_mcontext; - Ucontext *uc_link; - StackT uc_stack; - int32 uc_flags; - int32 __spare__[4]; - byte pad_godefs_0[12]; + Sigset uc_sigmask; + Mcontext uc_mcontext; + Ucontext *uc_link; + StackT uc_stack; + int32 uc_flags; + int32 __spare__[4]; + byte Pad_cgo_0[12]; }; -typedef struct Timespec Timespec; struct Timespec { - int32 tv_sec; - int32 tv_nsec; + int32 tv_sec; + int32 tv_nsec; }; - -typedef struct Timeval Timeval; struct Timeval { - int32 tv_sec; - int32 tv_usec; + int32 tv_sec; + int32 tv_usec; }; - -typedef struct Itimerval Itimerval; struct Itimerval { - Timeval it_interval; - Timeval it_value; + Timeval it_interval; + Timeval it_value; }; + + #pragma pack off diff --git 
a/src/pkg/runtime/defs_freebsd_amd64.h b/src/pkg/runtime/defs_freebsd_amd64.h index 8a222dca4..6348c0482 100644 --- a/src/pkg/runtime/defs_freebsd_amd64.h +++ b/src/pkg/runtime/defs_freebsd_amd64.h @@ -1,204 +1,201 @@ -// godefs -f -m64 defs.c +// Created by cgo -cdefs - DO NOT EDIT +// cgo -cdefs defs_freebsd.go -// MACHINE GENERATED - DO NOT EDIT. -// Constants enum { - PROT_NONE = 0, - PROT_READ = 0x1, - PROT_WRITE = 0x2, - PROT_EXEC = 0x4, - MAP_ANON = 0x1000, - MAP_PRIVATE = 0x2, - MAP_FIXED = 0x10, - SA_SIGINFO = 0x40, - SA_RESTART = 0x2, - SA_ONSTACK = 0x1, - UMTX_OP_WAIT = 0x2, - UMTX_OP_WAKE = 0x3, - EINTR = 0x4, - SIGHUP = 0x1, - SIGINT = 0x2, - SIGQUIT = 0x3, - SIGILL = 0x4, - SIGTRAP = 0x5, - SIGABRT = 0x6, - SIGEMT = 0x7, - SIGFPE = 0x8, - SIGKILL = 0x9, - SIGBUS = 0xa, - SIGSEGV = 0xb, - SIGSYS = 0xc, - SIGPIPE = 0xd, - SIGALRM = 0xe, - SIGTERM = 0xf, - SIGURG = 0x10, - SIGSTOP = 0x11, - SIGTSTP = 0x12, - SIGCONT = 0x13, - SIGCHLD = 0x14, - SIGTTIN = 0x15, - SIGTTOU = 0x16, - SIGIO = 0x17, - SIGXCPU = 0x18, - SIGXFSZ = 0x19, - SIGVTALRM = 0x1a, - SIGPROF = 0x1b, - SIGWINCH = 0x1c, - SIGINFO = 0x1d, - SIGUSR1 = 0x1e, - SIGUSR2 = 0x1f, - FPE_INTDIV = 0x2, - FPE_INTOVF = 0x1, - FPE_FLTDIV = 0x3, - FPE_FLTOVF = 0x4, - FPE_FLTUND = 0x5, - FPE_FLTRES = 0x6, - FPE_FLTINV = 0x7, - FPE_FLTSUB = 0x8, - BUS_ADRALN = 0x1, - BUS_ADRERR = 0x2, - BUS_OBJERR = 0x3, - SEGV_MAPERR = 0x1, - SEGV_ACCERR = 0x2, - ITIMER_REAL = 0, - ITIMER_VIRTUAL = 0x1, - ITIMER_PROF = 0x2, + PROT_NONE = 0x0, + PROT_READ = 0x1, + PROT_WRITE = 0x2, + PROT_EXEC = 0x4, + + MAP_ANON = 0x1000, + MAP_PRIVATE = 0x2, + MAP_FIXED = 0x10, + + MADV_FREE = 0x5, + + SA_SIGINFO = 0x40, + SA_RESTART = 0x2, + SA_ONSTACK = 0x1, + + UMTX_OP_WAIT = 0x2, + UMTX_OP_WAKE = 0x3, + + EINTR = 0x4, + + SIGHUP = 0x1, + SIGINT = 0x2, + SIGQUIT = 0x3, + SIGILL = 0x4, + SIGTRAP = 0x5, + SIGABRT = 0x6, + SIGEMT = 0x7, + SIGFPE = 0x8, + SIGKILL = 0x9, + SIGBUS = 0xa, + SIGSEGV = 0xb, + SIGSYS = 0xc, + SIGPIPE = 0xd, + SIGALRM = 0xe, + SIGTERM = 0xf, + SIGURG = 0x10, + SIGSTOP = 0x11, + SIGTSTP = 0x12, + SIGCONT = 0x13, + SIGCHLD = 0x14, + SIGTTIN = 0x15, + SIGTTOU = 0x16, + SIGIO = 0x17, + SIGXCPU = 0x18, + SIGXFSZ = 0x19, + SIGVTALRM = 0x1a, + SIGPROF = 0x1b, + SIGWINCH = 0x1c, + SIGINFO = 0x1d, + SIGUSR1 = 0x1e, + SIGUSR2 = 0x1f, + + FPE_INTDIV = 0x2, + FPE_INTOVF = 0x1, + FPE_FLTDIV = 0x3, + FPE_FLTOVF = 0x4, + FPE_FLTUND = 0x5, + FPE_FLTRES = 0x6, + FPE_FLTINV = 0x7, + FPE_FLTSUB = 0x8, + + BUS_ADRALN = 0x1, + BUS_ADRERR = 0x2, + BUS_OBJERR = 0x3, + + SEGV_MAPERR = 0x1, + SEGV_ACCERR = 0x2, + + ITIMER_REAL = 0x0, + ITIMER_VIRTUAL = 0x1, + ITIMER_PROF = 0x2, }; -// Types +typedef struct Rtprio Rtprio; +typedef struct ThrParam ThrParam; +typedef struct Sigaltstack Sigaltstack; +typedef struct Sigset Sigset; +typedef struct StackT StackT; +typedef struct Siginfo Siginfo; +typedef struct Mcontext Mcontext; +typedef struct Ucontext Ucontext; +typedef struct Timespec Timespec; +typedef struct Timeval Timeval; +typedef struct Itimerval Itimerval; + #pragma pack on -typedef struct Rtprio Rtprio; struct Rtprio { - uint16 type; - uint16 prio; + uint16 type; + uint16 prio; }; - -typedef struct ThrParam ThrParam; struct ThrParam { - void *start_func; - void *arg; - int8 *stack_base; - uint64 stack_size; - int8 *tls_base; - uint64 tls_size; - int64 *child_tid; - int64 *parent_tid; - int32 flags; - byte pad_godefs_0[4]; - Rtprio *rtp; - void* spare[3]; + void *start_func; + byte *arg; + int8 *stack_base; + uint64 stack_size; + int8 *tls_base; + uint64 
tls_size; + int64 *child_tid; + int64 *parent_tid; + int32 flags; + byte Pad_cgo_0[4]; + Rtprio *rtp; + void *spare[3]; }; - -typedef struct Sigaltstack Sigaltstack; struct Sigaltstack { - int8 *ss_sp; - uint64 ss_size; - int32 ss_flags; - byte pad_godefs_0[4]; + int8 *ss_sp; + uint64 ss_size; + int32 ss_flags; + byte Pad_cgo_0[4]; }; - -typedef struct Sigset Sigset; struct Sigset { - uint32 __bits[4]; -}; - -typedef union Sigval Sigval; -union Sigval { - int32 sival_int; - void *sival_ptr; - int32 sigval_int; - void *sigval_ptr; + uint32 __bits[4]; }; - -typedef struct StackT StackT; struct StackT { - int8 *ss_sp; - uint64 ss_size; - int32 ss_flags; - byte pad_godefs_0[4]; + int8 *ss_sp; + uint64 ss_size; + int32 ss_flags; + byte Pad_cgo_0[4]; }; -typedef struct Siginfo Siginfo; struct Siginfo { - int32 si_signo; - int32 si_errno; - int32 si_code; - int32 si_pid; - uint32 si_uid; - int32 si_status; - void *si_addr; - Sigval si_value; - byte _reason[40]; + int32 si_signo; + int32 si_errno; + int32 si_code; + int32 si_pid; + uint32 si_uid; + int32 si_status; + byte *si_addr; + byte si_value[8]; + byte _reason[40]; }; -typedef struct Mcontext Mcontext; struct Mcontext { - int64 mc_onstack; - int64 mc_rdi; - int64 mc_rsi; - int64 mc_rdx; - int64 mc_rcx; - int64 mc_r8; - int64 mc_r9; - int64 mc_rax; - int64 mc_rbx; - int64 mc_rbp; - int64 mc_r10; - int64 mc_r11; - int64 mc_r12; - int64 mc_r13; - int64 mc_r14; - int64 mc_r15; - uint32 mc_trapno; - uint16 mc_fs; - uint16 mc_gs; - int64 mc_addr; - uint32 mc_flags; - uint16 mc_es; - uint16 mc_ds; - int64 mc_err; - int64 mc_rip; - int64 mc_cs; - int64 mc_rflags; - int64 mc_rsp; - int64 mc_ss; - int64 mc_len; - int64 mc_fpformat; - int64 mc_ownedfp; - int64 mc_fpstate[64]; - int64 mc_fsbase; - int64 mc_gsbase; - int64 mc_spare[6]; + int64 mc_onstack; + int64 mc_rdi; + int64 mc_rsi; + int64 mc_rdx; + int64 mc_rcx; + int64 mc_r8; + int64 mc_r9; + int64 mc_rax; + int64 mc_rbx; + int64 mc_rbp; + int64 mc_r10; + int64 mc_r11; + int64 mc_r12; + int64 mc_r13; + int64 mc_r14; + int64 mc_r15; + uint32 mc_trapno; + uint16 mc_fs; + uint16 mc_gs; + int64 mc_addr; + uint32 mc_flags; + uint16 mc_es; + uint16 mc_ds; + int64 mc_err; + int64 mc_rip; + int64 mc_cs; + int64 mc_rflags; + int64 mc_rsp; + int64 mc_ss; + int64 mc_len; + int64 mc_fpformat; + int64 mc_ownedfp; + int64 mc_fpstate[64]; + int64 mc_fsbase; + int64 mc_gsbase; + int64 mc_spare[6]; }; - -typedef struct Ucontext Ucontext; struct Ucontext { - Sigset uc_sigmask; - Mcontext uc_mcontext; - Ucontext *uc_link; - StackT uc_stack; - int32 uc_flags; - int32 __spare__[4]; - byte pad_godefs_0[12]; + Sigset uc_sigmask; + Mcontext uc_mcontext; + Ucontext *uc_link; + StackT uc_stack; + int32 uc_flags; + int32 __spare__[4]; + byte Pad_cgo_0[12]; }; -typedef struct Timespec Timespec; struct Timespec { - int64 tv_sec; - int64 tv_nsec; + int64 tv_sec; + int64 tv_nsec; }; - -typedef struct Timeval Timeval; struct Timeval { - int64 tv_sec; - int64 tv_usec; + int64 tv_sec; + int64 tv_usec; }; - -typedef struct Itimerval Itimerval; struct Itimerval { - Timeval it_interval; - Timeval it_value; + Timeval it_interval; + Timeval it_value; }; + + #pragma pack off diff --git a/src/pkg/runtime/defs_freebsd_arm.h b/src/pkg/runtime/defs_freebsd_arm.h new file mode 100644 index 000000000..a744b808f --- /dev/null +++ b/src/pkg/runtime/defs_freebsd_arm.h @@ -0,0 +1,163 @@ +// Created by cgo -cdefs - DO NOT EDIT +// cgo -cdefs defs_freebsd.go + + +enum { + PROT_NONE = 0x0, + PROT_READ = 0x1, + PROT_WRITE = 0x2, + PROT_EXEC = 0x4, + 
+ MAP_ANON = 0x1000, + MAP_PRIVATE = 0x2, + MAP_FIXED = 0x10, + + MADV_FREE = 0x5, + + SA_SIGINFO = 0x40, + SA_RESTART = 0x2, + SA_ONSTACK = 0x1, + + UMTX_OP_WAIT = 0x2, + UMTX_OP_WAKE = 0x3, + + EINTR = 0x4, + + SIGHUP = 0x1, + SIGINT = 0x2, + SIGQUIT = 0x3, + SIGILL = 0x4, + SIGTRAP = 0x5, + SIGABRT = 0x6, + SIGEMT = 0x7, + SIGFPE = 0x8, + SIGKILL = 0x9, + SIGBUS = 0xa, + SIGSEGV = 0xb, + SIGSYS = 0xc, + SIGPIPE = 0xd, + SIGALRM = 0xe, + SIGTERM = 0xf, + SIGURG = 0x10, + SIGSTOP = 0x11, + SIGTSTP = 0x12, + SIGCONT = 0x13, + SIGCHLD = 0x14, + SIGTTIN = 0x15, + SIGTTOU = 0x16, + SIGIO = 0x17, + SIGXCPU = 0x18, + SIGXFSZ = 0x19, + SIGVTALRM = 0x1a, + SIGPROF = 0x1b, + SIGWINCH = 0x1c, + SIGINFO = 0x1d, + SIGUSR1 = 0x1e, + SIGUSR2 = 0x1f, + + FPE_INTDIV = 0x2, + FPE_INTOVF = 0x1, + FPE_FLTDIV = 0x3, + FPE_FLTOVF = 0x4, + FPE_FLTUND = 0x5, + FPE_FLTRES = 0x6, + FPE_FLTINV = 0x7, + FPE_FLTSUB = 0x8, + + BUS_ADRALN = 0x1, + BUS_ADRERR = 0x2, + BUS_OBJERR = 0x3, + + SEGV_MAPERR = 0x1, + SEGV_ACCERR = 0x2, + + ITIMER_REAL = 0x0, + ITIMER_VIRTUAL = 0x1, + ITIMER_PROF = 0x2, +}; + +typedef struct Rtprio Rtprio; +typedef struct ThrParam ThrParam; +typedef struct Sigaltstack Sigaltstack; +typedef struct Sigset Sigset; +typedef struct StackT StackT; +typedef struct Siginfo Siginfo; +typedef struct Mcontext Mcontext; +typedef struct Ucontext Ucontext; +typedef struct Timespec Timespec; +typedef struct Timeval Timeval; +typedef struct Itimerval Itimerval; + +#pragma pack on + +struct Rtprio { + uint16 type; + uint16 prio; +}; +struct ThrParam { + void *start_func; + byte *arg; + uint8 *stack_base; + uint32 stack_size; + uint8 *tls_base; + uint32 tls_size; + int32 *child_tid; + int32 *parent_tid; + int32 flags; + Rtprio *rtp; + void *spare[3]; +}; +struct Sigaltstack { + uint8 *ss_sp; + uint32 ss_size; + int32 ss_flags; +}; +struct Sigset { + uint32 __bits[4]; +}; +struct StackT { + uint8 *ss_sp; + uint32 ss_size; + int32 ss_flags; +}; + +struct Siginfo { + int32 si_signo; + int32 si_errno; + int32 si_code; + int32 si_pid; + uint32 si_uid; + int32 si_status; + byte *si_addr; + byte si_value[4]; + byte _reason[32]; +}; + +struct Mcontext { + uint32 __gregs[17]; + byte __fpu[140]; +}; +struct Ucontext { + Sigset uc_sigmask; + Mcontext uc_mcontext; + Ucontext *uc_link; + StackT uc_stack; + int32 uc_flags; + int32 __spare__[4]; +}; + +struct Timespec { + int64 tv_sec; + int32 tv_nsec; +}; +struct Timeval { + int64 tv_sec; + int32 tv_usec; +}; +struct Itimerval { + Timeval it_interval; + Timeval it_value; +}; + + +#pragma pack off diff --git a/src/pkg/runtime/defs_linux_386.h b/src/pkg/runtime/defs_linux_386.h index 02760f987..e257a6f85 100644 --- a/src/pkg/runtime/defs_linux_386.h +++ b/src/pkg/runtime/defs_linux_386.h @@ -132,7 +132,7 @@ struct Sigaction { void *k_sa_handler; uint32 sa_flags; void *sa_restorer; - uint32 sa_mask; + uint64 sa_mask; }; struct Siginfo { int32 si_signo; diff --git a/src/pkg/runtime/defs_linux_arm.h b/src/pkg/runtime/defs_linux_arm.h index 9e5c83a07..f72ec3d1b 100644 --- a/src/pkg/runtime/defs_linux_arm.h +++ b/src/pkg/runtime/defs_linux_arm.h @@ -143,6 +143,6 @@ struct Sigaction { void *sa_handler; uint32 sa_flags; void *sa_restorer; - uint32 sa_mask; + uint64 sa_mask; }; #pragma pack off diff --git a/src/pkg/runtime/defs_netbsd.go b/src/pkg/runtime/defs_netbsd.go index 47c30cf10..53e061041 100644 --- a/src/pkg/runtime/defs_netbsd.go +++ b/src/pkg/runtime/defs_netbsd.go @@ -7,18 +7,21 @@ /* Input to cgo. 
-GOARCH=amd64 cgo -cdefs defs.go >amd64/defs.h -GOARCH=386 cgo -cdefs defs.go >386/defs.h +GOARCH=amd64 go tool cgo -cdefs defs_netbsd.go defs_netbsd_amd64.go >defs_netbsd_amd64.h +GOARCH=386 go tool cgo -cdefs defs_netbsd.go defs_netbsd_386.go >defs_netbsd_386.h */ +// +godefs map __fpregset_t [644]byte + package runtime /* #include <sys/types.h> #include <sys/mman.h> +#include <sys/signal.h> #include <sys/time.h> +#include <sys/ucontext.h> #include <sys/unistd.h> -#include <sys/signal.h> #include <errno.h> #include <signal.h> */ @@ -34,6 +37,8 @@ const ( MAP_PRIVATE = C.MAP_PRIVATE MAP_FIXED = C.MAP_FIXED + MADV_FREE = C.MADV_FREE + SA_SIGINFO = C.SA_SIGINFO SA_RESTART = C.SA_RESTART SA_ONSTACK = C.SA_ONSTACK @@ -95,8 +100,7 @@ const ( type Sigaltstack C.struct_sigaltstack type Sigset C.sigset_t -type Siginfo C.siginfo_t -type Sigval C.union_sigval +type Siginfo C.struct__ksiginfo type StackT C.stack_t @@ -104,8 +108,5 @@ type Timespec C.struct_timespec type Timeval C.struct_timeval type Itimerval C.struct_itimerval -// This is a hack to avoid pulling in machine/fpu.h. -type sfxsave64 struct{} -type usavefpu struct{} - -type Sigcontext C.struct_sigcontext +type McontextT C.mcontext_t +type UcontextT C.ucontext_t diff --git a/src/pkg/runtime/defs_netbsd_386.go b/src/pkg/runtime/defs_netbsd_386.go new file mode 100644 index 000000000..e9e36608e --- /dev/null +++ b/src/pkg/runtime/defs_netbsd_386.go @@ -0,0 +1,42 @@ +// Copyright 2009 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// +build ignore + +/* +Input to cgo. + +GOARCH=amd64 go tool cgo -cdefs defs_netbsd.go defs_netbsd_amd64.go >defs_netbsd_amd64.h +GOARCH=386 go tool cgo -cdefs defs_netbsd.go defs_netbsd_386.go >defs_netbsd_386.h +*/ + +package runtime + +/* +#include <sys/types.h> +#include <machine/mcontext.h> +*/ +import "C" + +const ( + REG_GS = C._REG_GS + REG_FS = C._REG_FS + REG_ES = C._REG_ES + REG_DS = C._REG_DS + REG_EDI = C._REG_EDI + REG_ESI = C._REG_ESI + REG_EBP = C._REG_EBP + REG_ESP = C._REG_ESP + REG_EBX = C._REG_EBX + REG_EDX = C._REG_EDX + REG_ECX = C._REG_ECX + REG_EAX = C._REG_EAX + REG_TRAPNO = C._REG_TRAPNO + REG_ERR = C._REG_ERR + REG_EIP = C._REG_EIP + REG_CS = C._REG_CS + REG_EFL = C._REG_EFL + REG_UESP = C._REG_UESP + REG_SS = C._REG_SS +) diff --git a/src/pkg/runtime/defs_netbsd_386.h b/src/pkg/runtime/defs_netbsd_386.h index aff87fb3b..04c380e3f 100644 --- a/src/pkg/runtime/defs_netbsd_386.h +++ b/src/pkg/runtime/defs_netbsd_386.h @@ -1,146 +1,163 @@ -// godefs -f -m32 defs.c +// Created by cgo -cdefs - DO NOT EDIT +// cgo -cdefs defs_netbsd.go defs_netbsd_386.go -// MACHINE GENERATED - DO NOT EDIT. 
-// Constants enum { - PROT_NONE = 0, - PROT_READ = 0x1, - PROT_WRITE = 0x2, - PROT_EXEC = 0x4, - MAP_ANON = 0x1000, - MAP_PRIVATE = 0x2, - MAP_FIXED = 0x10, - SA_SIGINFO = 0x40, - SA_RESTART = 0x2, - SA_ONSTACK = 0x1, - EINTR = 0x4, - SIGHUP = 0x1, - SIGINT = 0x2, - SIGQUIT = 0x3, - SIGILL = 0x4, - SIGTRAP = 0x5, - SIGABRT = 0x6, - SIGEMT = 0x7, - SIGFPE = 0x8, - SIGKILL = 0x9, - SIGBUS = 0xa, - SIGSEGV = 0xb, - SIGSYS = 0xc, - SIGPIPE = 0xd, - SIGALRM = 0xe, - SIGTERM = 0xf, - SIGURG = 0x10, - SIGSTOP = 0x11, - SIGTSTP = 0x12, - SIGCONT = 0x13, - SIGCHLD = 0x14, - SIGTTIN = 0x15, - SIGTTOU = 0x16, - SIGIO = 0x17, - SIGXCPU = 0x18, - SIGXFSZ = 0x19, - SIGVTALRM = 0x1a, - SIGPROF = 0x1b, - SIGWINCH = 0x1c, - SIGINFO = 0x1d, - SIGUSR1 = 0x1e, - SIGUSR2 = 0x1f, - FPE_INTDIV = 0x1, - FPE_INTOVF = 0x2, - FPE_FLTDIV = 0x3, - FPE_FLTOVF = 0x4, - FPE_FLTUND = 0x5, - FPE_FLTRES = 0x6, - FPE_FLTINV = 0x7, - FPE_FLTSUB = 0x8, - BUS_ADRALN = 0x1, - BUS_ADRERR = 0x2, - BUS_OBJERR = 0x3, - SEGV_MAPERR = 0x1, - SEGV_ACCERR = 0x2, - ITIMER_REAL = 0, - ITIMER_VIRTUAL = 0x1, - ITIMER_PROF = 0x2, + PROT_NONE = 0x0, + PROT_READ = 0x1, + PROT_WRITE = 0x2, + PROT_EXEC = 0x4, + + MAP_ANON = 0x1000, + MAP_PRIVATE = 0x2, + MAP_FIXED = 0x10, + + MADV_FREE = 0x6, + + SA_SIGINFO = 0x40, + SA_RESTART = 0x2, + SA_ONSTACK = 0x1, + + EINTR = 0x4, + + SIGHUP = 0x1, + SIGINT = 0x2, + SIGQUIT = 0x3, + SIGILL = 0x4, + SIGTRAP = 0x5, + SIGABRT = 0x6, + SIGEMT = 0x7, + SIGFPE = 0x8, + SIGKILL = 0x9, + SIGBUS = 0xa, + SIGSEGV = 0xb, + SIGSYS = 0xc, + SIGPIPE = 0xd, + SIGALRM = 0xe, + SIGTERM = 0xf, + SIGURG = 0x10, + SIGSTOP = 0x11, + SIGTSTP = 0x12, + SIGCONT = 0x13, + SIGCHLD = 0x14, + SIGTTIN = 0x15, + SIGTTOU = 0x16, + SIGIO = 0x17, + SIGXCPU = 0x18, + SIGXFSZ = 0x19, + SIGVTALRM = 0x1a, + SIGPROF = 0x1b, + SIGWINCH = 0x1c, + SIGINFO = 0x1d, + SIGUSR1 = 0x1e, + SIGUSR2 = 0x1f, + + FPE_INTDIV = 0x1, + FPE_INTOVF = 0x2, + FPE_FLTDIV = 0x3, + FPE_FLTOVF = 0x4, + FPE_FLTUND = 0x5, + FPE_FLTRES = 0x6, + FPE_FLTINV = 0x7, + FPE_FLTSUB = 0x8, + + BUS_ADRALN = 0x1, + BUS_ADRERR = 0x2, + BUS_OBJERR = 0x3, + + SEGV_MAPERR = 0x1, + SEGV_ACCERR = 0x2, + + ITIMER_REAL = 0x0, + ITIMER_VIRTUAL = 0x1, + ITIMER_PROF = 0x2, }; -// Types +typedef struct Sigaltstack Sigaltstack; +typedef struct Sigset Sigset; +typedef struct Siginfo Siginfo; +typedef struct StackT StackT; +typedef struct Timespec Timespec; +typedef struct Timeval Timeval; +typedef struct Itimerval Itimerval; +typedef struct McontextT McontextT; +typedef struct UcontextT UcontextT; + #pragma pack on -typedef struct Sigaltstack Sigaltstack; struct Sigaltstack { - void *ss_sp; - uint32 ss_size; - int32 ss_flags; + byte *ss_sp; + uint32 ss_size; + int32 ss_flags; }; - -typedef uint32 Sigset; - -typedef struct Siginfo Siginfo; -struct Siginfo { - int32 si_signo; - int32 si_code; - int32 si_errno; - byte _data[116]; +struct Sigset { + uint32 __bits[4]; }; - -typedef union Sigval Sigval; -union Sigval { - int32 sival_int; - void *sival_ptr; +struct Siginfo { + int32 _signo; + int32 _code; + int32 _errno; + byte _reason[20]; }; -typedef struct StackT StackT; struct StackT { - void *ss_sp; - uint32 ss_size; - int32 ss_flags; + byte *ss_sp; + uint32 ss_size; + int32 ss_flags; }; -typedef struct Timespec Timespec; struct Timespec { - int32 tv_sec; - int32 tv_nsec; + int64 tv_sec; + int32 tv_nsec; }; - -typedef struct Timeval Timeval; struct Timeval { - int32 tv_sec; - int32 tv_usec; + int64 tv_sec; + int32 tv_usec; }; - -typedef struct Itimerval Itimerval; struct Itimerval { - Timeval 
it_interval; - Timeval it_value; + Timeval it_interval; + Timeval it_value; }; -typedef void sfxsave64; - -typedef void usavefpu; - -typedef struct Sigcontext Sigcontext; -struct Sigcontext { - int32 sc_gs; - int32 sc_fs; - int32 sc_es; - int32 sc_ds; - int32 sc_edi; - int32 sc_esi; - int32 sc_ebp; - int32 sc_ebx; - int32 sc_edx; - int32 sc_ecx; - int32 sc_eax; - int32 sc_eip; - int32 sc_cs; - int32 sc_eflags; - int32 sc_esp; - int32 sc_ss; - int32 sc_onstack; - int32 sc_mask; - int32 sc_trapno; - int32 sc_err; - usavefpu *sc_fpstate; +struct McontextT { + int32 __gregs[19]; + byte __fpregs[644]; + int32 _mc_tlsbase; +}; +struct UcontextT { + uint32 uc_flags; + UcontextT *uc_link; + Sigset uc_sigmask; + StackT uc_stack; + McontextT uc_mcontext; + int32 __uc_pad[4]; }; + + #pragma pack off +// Created by cgo -cdefs - DO NOT EDIT +// cgo -cdefs defs_netbsd.go defs_netbsd_386.go + + +enum { + REG_GS = 0x0, + REG_FS = 0x1, + REG_ES = 0x2, + REG_DS = 0x3, + REG_EDI = 0x4, + REG_ESI = 0x5, + REG_EBP = 0x6, + REG_ESP = 0x7, + REG_EBX = 0x8, + REG_EDX = 0x9, + REG_ECX = 0xa, + REG_EAX = 0xb, + REG_TRAPNO = 0xc, + REG_ERR = 0xd, + REG_EIP = 0xe, + REG_CS = 0xf, + REG_EFL = 0x10, + REG_UESP = 0x11, + REG_SS = 0x12, +}; + diff --git a/src/pkg/runtime/defs_netbsd_amd64.go b/src/pkg/runtime/defs_netbsd_amd64.go new file mode 100644 index 000000000..68f586b2f --- /dev/null +++ b/src/pkg/runtime/defs_netbsd_amd64.go @@ -0,0 +1,49 @@ +// Copyright 2009 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// +build ignore + +/* +Input to cgo. + +GOARCH=amd64 go tool cgo -cdefs defs_netbsd.go defs_netbsd_amd64.go >defs_netbsd_amd64.h +GOARCH=386 go tool cgo -cdefs defs_netbsd.go defs_netbsd_386.go >defs_netbsd_386.h +*/ + +package runtime + +/* +#include <sys/types.h> +#include <machine/mcontext.h> +*/ +import "C" + +const ( + REG_RDI = C._REG_RDI + REG_RSI = C._REG_RSI + REG_RDX = C._REG_RDX + REG_RCX = C._REG_RCX + REG_R8 = C._REG_R8 + REG_R9 = C._REG_R9 + REG_R10 = C._REG_R10 + REG_R11 = C._REG_R11 + REG_R12 = C._REG_R12 + REG_R13 = C._REG_R13 + REG_R14 = C._REG_R14 + REG_R15 = C._REG_R15 + REG_RBP = C._REG_RBP + REG_RBX = C._REG_RBX + REG_RAX = C._REG_RAX + REG_GS = C._REG_GS + REG_FS = C._REG_FS + REG_ES = C._REG_ES + REG_DS = C._REG_DS + REG_TRAPNO = C._REG_TRAPNO + REG_ERR = C._REG_ERR + REG_RIP = C._REG_RIP + REG_CS = C._REG_CS + REG_RFLAGS = C._REG_RFLAGS + REG_RSP = C._REG_RSP + REG_SS = C._REG_SS +) diff --git a/src/pkg/runtime/defs_netbsd_amd64.h b/src/pkg/runtime/defs_netbsd_amd64.h index 27bf4b9d6..3d3f576d3 100644 --- a/src/pkg/runtime/defs_netbsd_amd64.h +++ b/src/pkg/runtime/defs_netbsd_amd64.h @@ -1,158 +1,174 @@ -// godefs -f -m64 defs.c +// Created by cgo -cdefs - DO NOT EDIT +// cgo -cdefs defs_netbsd.go defs_netbsd_amd64.go -// MACHINE GENERATED - DO NOT EDIT. 
-// Constants enum { - PROT_NONE = 0, - PROT_READ = 0x1, - PROT_WRITE = 0x2, - PROT_EXEC = 0x4, - MAP_ANON = 0x1000, - MAP_PRIVATE = 0x2, - MAP_FIXED = 0x10, - SA_SIGINFO = 0x40, - SA_RESTART = 0x2, - SA_ONSTACK = 0x1, - EINTR = 0x4, - SIGHUP = 0x1, - SIGINT = 0x2, - SIGQUIT = 0x3, - SIGILL = 0x4, - SIGTRAP = 0x5, - SIGABRT = 0x6, - SIGEMT = 0x7, - SIGFPE = 0x8, - SIGKILL = 0x9, - SIGBUS = 0xa, - SIGSEGV = 0xb, - SIGSYS = 0xc, - SIGPIPE = 0xd, - SIGALRM = 0xe, - SIGTERM = 0xf, - SIGURG = 0x10, - SIGSTOP = 0x11, - SIGTSTP = 0x12, - SIGCONT = 0x13, - SIGCHLD = 0x14, - SIGTTIN = 0x15, - SIGTTOU = 0x16, - SIGIO = 0x17, - SIGXCPU = 0x18, - SIGXFSZ = 0x19, - SIGVTALRM = 0x1a, - SIGPROF = 0x1b, - SIGWINCH = 0x1c, - SIGINFO = 0x1d, - SIGUSR1 = 0x1e, - SIGUSR2 = 0x1f, - FPE_INTDIV = 0x1, - FPE_INTOVF = 0x2, - FPE_FLTDIV = 0x3, - FPE_FLTOVF = 0x4, - FPE_FLTUND = 0x5, - FPE_FLTRES = 0x6, - FPE_FLTINV = 0x7, - FPE_FLTSUB = 0x8, - BUS_ADRALN = 0x1, - BUS_ADRERR = 0x2, - BUS_OBJERR = 0x3, - SEGV_MAPERR = 0x1, - SEGV_ACCERR = 0x2, - ITIMER_REAL = 0, - ITIMER_VIRTUAL = 0x1, - ITIMER_PROF = 0x2, + PROT_NONE = 0x0, + PROT_READ = 0x1, + PROT_WRITE = 0x2, + PROT_EXEC = 0x4, + + MAP_ANON = 0x1000, + MAP_PRIVATE = 0x2, + MAP_FIXED = 0x10, + + MADV_FREE = 0x6, + + SA_SIGINFO = 0x40, + SA_RESTART = 0x2, + SA_ONSTACK = 0x1, + + EINTR = 0x4, + + SIGHUP = 0x1, + SIGINT = 0x2, + SIGQUIT = 0x3, + SIGILL = 0x4, + SIGTRAP = 0x5, + SIGABRT = 0x6, + SIGEMT = 0x7, + SIGFPE = 0x8, + SIGKILL = 0x9, + SIGBUS = 0xa, + SIGSEGV = 0xb, + SIGSYS = 0xc, + SIGPIPE = 0xd, + SIGALRM = 0xe, + SIGTERM = 0xf, + SIGURG = 0x10, + SIGSTOP = 0x11, + SIGTSTP = 0x12, + SIGCONT = 0x13, + SIGCHLD = 0x14, + SIGTTIN = 0x15, + SIGTTOU = 0x16, + SIGIO = 0x17, + SIGXCPU = 0x18, + SIGXFSZ = 0x19, + SIGVTALRM = 0x1a, + SIGPROF = 0x1b, + SIGWINCH = 0x1c, + SIGINFO = 0x1d, + SIGUSR1 = 0x1e, + SIGUSR2 = 0x1f, + + FPE_INTDIV = 0x1, + FPE_INTOVF = 0x2, + FPE_FLTDIV = 0x3, + FPE_FLTOVF = 0x4, + FPE_FLTUND = 0x5, + FPE_FLTRES = 0x6, + FPE_FLTINV = 0x7, + FPE_FLTSUB = 0x8, + + BUS_ADRALN = 0x1, + BUS_ADRERR = 0x2, + BUS_OBJERR = 0x3, + + SEGV_MAPERR = 0x1, + SEGV_ACCERR = 0x2, + + ITIMER_REAL = 0x0, + ITIMER_VIRTUAL = 0x1, + ITIMER_PROF = 0x2, }; -// Types +typedef struct Sigaltstack Sigaltstack; +typedef struct Sigset Sigset; +typedef struct Siginfo Siginfo; +typedef struct StackT StackT; +typedef struct Timespec Timespec; +typedef struct Timeval Timeval; +typedef struct Itimerval Itimerval; +typedef struct McontextT McontextT; +typedef struct UcontextT UcontextT; + #pragma pack on -typedef struct Sigaltstack Sigaltstack; struct Sigaltstack { - void *ss_sp; - uint64 ss_size; - int32 ss_flags; - byte pad_godefs_0[4]; + byte *ss_sp; + uint64 ss_size; + int32 ss_flags; + byte Pad_cgo_0[4]; }; - -typedef uint32 Sigset; - -typedef struct Siginfo Siginfo; -struct Siginfo { - int32 si_signo; - int32 si_code; - int32 si_errno; - byte pad_godefs_0[4]; - byte _data[120]; +struct Sigset { + uint32 __bits[4]; }; - -typedef union Sigval Sigval; -union Sigval { - int32 sival_int; - void *sival_ptr; +struct Siginfo { + int32 _signo; + int32 _code; + int32 _errno; + int32 _pad; + byte _reason[24]; }; -typedef struct StackT StackT; struct StackT { - void *ss_sp; - uint64 ss_size; - int32 ss_flags; - byte pad_godefs_0[4]; + byte *ss_sp; + uint64 ss_size; + int32 ss_flags; + byte Pad_cgo_0[4]; }; -typedef struct Timespec Timespec; struct Timespec { - int32 tv_sec; - byte pad_godefs_0[4]; - int64 tv_nsec; + int64 tv_sec; + int64 tv_nsec; }; - -typedef struct Timeval Timeval; 
struct Timeval { - int64 tv_sec; - int64 tv_usec; + int64 tv_sec; + int32 tv_usec; + byte Pad_cgo_0[4]; }; - -typedef struct Itimerval Itimerval; struct Itimerval { - Timeval it_interval; - Timeval it_value; + Timeval it_interval; + Timeval it_value; }; -typedef void sfxsave64; - -typedef void usavefpu; - -typedef struct Sigcontext Sigcontext; -struct Sigcontext { - int64 sc_rdi; - int64 sc_rsi; - int64 sc_rdx; - int64 sc_rcx; - int64 sc_r8; - int64 sc_r9; - int64 sc_r10; - int64 sc_r11; - int64 sc_r12; - int64 sc_r13; - int64 sc_r14; - int64 sc_r15; - int64 sc_rbp; - int64 sc_rbx; - int64 sc_rax; - int64 sc_gs; - int64 sc_fs; - int64 sc_es; - int64 sc_ds; - int64 sc_trapno; - int64 sc_err; - int64 sc_rip; - int64 sc_cs; - int64 sc_rflags; - int64 sc_rsp; - int64 sc_ss; - sfxsave64 *sc_fpstate; - int32 sc_onstack; - int32 sc_mask; +struct McontextT { + uint64 __gregs[26]; + uint64 _mc_tlsbase; + int8 __fpregs[512]; +}; +struct UcontextT { + uint32 uc_flags; + byte Pad_cgo_0[4]; + UcontextT *uc_link; + Sigset uc_sigmask; + StackT uc_stack; + McontextT uc_mcontext; }; + + #pragma pack off +// Created by cgo -cdefs - DO NOT EDIT +// cgo -cdefs defs_netbsd.go defs_netbsd_amd64.go + + +enum { + REG_RDI = 0x0, + REG_RSI = 0x1, + REG_RDX = 0x2, + REG_RCX = 0x3, + REG_R8 = 0x4, + REG_R9 = 0x5, + REG_R10 = 0x6, + REG_R11 = 0x7, + REG_R12 = 0x8, + REG_R13 = 0x9, + REG_R14 = 0xa, + REG_R15 = 0xb, + REG_RBP = 0xc, + REG_RBX = 0xd, + REG_RAX = 0xe, + REG_GS = 0xf, + REG_FS = 0x10, + REG_ES = 0x11, + REG_DS = 0x12, + REG_TRAPNO = 0x13, + REG_ERR = 0x14, + REG_RIP = 0x15, + REG_CS = 0x16, + REG_RFLAGS = 0x17, + REG_RSP = 0x18, + REG_SS = 0x19, +}; + diff --git a/src/pkg/runtime/defs_netbsd_arm.h b/src/pkg/runtime/defs_netbsd_arm.h new file mode 100644 index 000000000..f67475c76 --- /dev/null +++ b/src/pkg/runtime/defs_netbsd_arm.h @@ -0,0 +1,140 @@ +// Created by cgo -cdefs - DO NOT EDIT +// cgo -cdefs defs_netbsd.go + + +enum { + PROT_NONE = 0x0, + PROT_READ = 0x1, + PROT_WRITE = 0x2, + PROT_EXEC = 0x4, + + MAP_ANON = 0x1000, + MAP_PRIVATE = 0x2, + MAP_FIXED = 0x10, + + MADV_FREE = 0x6, + + SA_SIGINFO = 0x40, + SA_RESTART = 0x2, + SA_ONSTACK = 0x1, + + EINTR = 0x4, + + SIGHUP = 0x1, + SIGINT = 0x2, + SIGQUIT = 0x3, + SIGILL = 0x4, + SIGTRAP = 0x5, + SIGABRT = 0x6, + SIGEMT = 0x7, + SIGFPE = 0x8, + SIGKILL = 0x9, + SIGBUS = 0xa, + SIGSEGV = 0xb, + SIGSYS = 0xc, + SIGPIPE = 0xd, + SIGALRM = 0xe, + SIGTERM = 0xf, + SIGURG = 0x10, + SIGSTOP = 0x11, + SIGTSTP = 0x12, + SIGCONT = 0x13, + SIGCHLD = 0x14, + SIGTTIN = 0x15, + SIGTTOU = 0x16, + SIGIO = 0x17, + SIGXCPU = 0x18, + SIGXFSZ = 0x19, + SIGVTALRM = 0x1a, + SIGPROF = 0x1b, + SIGWINCH = 0x1c, + SIGINFO = 0x1d, + SIGUSR1 = 0x1e, + SIGUSR2 = 0x1f, + + FPE_INTDIV = 0x1, + FPE_INTOVF = 0x2, + FPE_FLTDIV = 0x3, + FPE_FLTOVF = 0x4, + FPE_FLTUND = 0x5, + FPE_FLTRES = 0x6, + FPE_FLTINV = 0x7, + FPE_FLTSUB = 0x8, + + BUS_ADRALN = 0x1, + BUS_ADRERR = 0x2, + BUS_OBJERR = 0x3, + + SEGV_MAPERR = 0x1, + SEGV_ACCERR = 0x2, + + ITIMER_REAL = 0x0, + ITIMER_VIRTUAL = 0x1, + ITIMER_PROF = 0x2, +}; + +typedef struct Sigaltstack Sigaltstack; +typedef struct Sigset Sigset; +typedef struct Siginfo Siginfo; +typedef struct StackT StackT; +typedef struct Timespec Timespec; +typedef struct Timeval Timeval; +typedef struct Itimerval Itimerval; +typedef struct McontextT McontextT; +typedef struct UcontextT UcontextT; + +#pragma pack on + +struct Sigaltstack { + byte *ss_sp; + uint32 ss_size; + int32 ss_flags; +}; +struct Sigset { + uint32 __bits[4]; +}; +struct Siginfo { + int32 
_signo; + int32 _code; + int32 _errno; + byte _reason[20]; +}; + +struct StackT { + byte *ss_sp; + uint32 ss_size; + int32 ss_flags; +}; + +struct Timespec { + int64 tv_sec; + int32 tv_nsec; +}; +struct Timeval { + int64 tv_sec; + int32 tv_usec; +}; +struct Itimerval { + Timeval it_interval; + Timeval it_value; +}; + +struct McontextT { + uint32 __gregs[17]; +#ifdef __ARM_EABI__ + byte __fpu[4+8*32+4]; +#else + byte __fpu[4+4*33+4]; +#endif + uint32 _mc_tlsbase; +}; +struct UcontextT { + uint32 uc_flags; + UcontextT *uc_link; + Sigset uc_sigmask; + StackT uc_stack; + McontextT uc_mcontext; + int32 __uc_pad[2]; +}; + +#pragma pack off diff --git a/src/pkg/runtime/defs_openbsd.go b/src/pkg/runtime/defs_openbsd.go index 47c30cf10..ff94b9405 100644 --- a/src/pkg/runtime/defs_openbsd.go +++ b/src/pkg/runtime/defs_openbsd.go @@ -7,8 +7,8 @@ /* Input to cgo. -GOARCH=amd64 cgo -cdefs defs.go >amd64/defs.h -GOARCH=386 cgo -cdefs defs.go >386/defs.h +GOARCH=amd64 go tool cgo -cdefs defs_openbsd.go >defs_openbsd_amd64.h +GOARCH=386 go tool cgo -cdefs defs_openbsd.go >defs_openbsd_386.h */ package runtime @@ -34,6 +34,8 @@ const ( MAP_PRIVATE = C.MAP_PRIVATE MAP_FIXED = C.MAP_FIXED + MADV_FREE = C.MADV_FREE + SA_SIGINFO = C.SA_SIGINFO SA_RESTART = C.SA_RESTART SA_ONSTACK = C.SA_ONSTACK @@ -93,9 +95,12 @@ const ( ITIMER_PROF = C.ITIMER_PROF ) +type Tfork C.struct___tfork + type Sigaltstack C.struct_sigaltstack -type Sigset C.sigset_t +type Sigcontext C.struct_sigcontext type Siginfo C.siginfo_t +type Sigset C.sigset_t type Sigval C.union_sigval type StackT C.stack_t @@ -103,9 +108,3 @@ type StackT C.stack_t type Timespec C.struct_timespec type Timeval C.struct_timeval type Itimerval C.struct_itimerval - -// This is a hack to avoid pulling in machine/fpu.h. -type sfxsave64 struct{} -type usavefpu struct{} - -type Sigcontext C.struct_sigcontext diff --git a/src/pkg/runtime/defs_openbsd_386.h b/src/pkg/runtime/defs_openbsd_386.h index aff87fb3b..323bb084a 100644 --- a/src/pkg/runtime/defs_openbsd_386.h +++ b/src/pkg/runtime/defs_openbsd_386.h @@ -1,146 +1,150 @@ -// godefs -f -m32 defs.c +// Created by cgo -cdefs - DO NOT EDIT +// cgo -cdefs defs_openbsd.go -// MACHINE GENERATED - DO NOT EDIT. 
-// Constants enum { - PROT_NONE = 0, - PROT_READ = 0x1, - PROT_WRITE = 0x2, - PROT_EXEC = 0x4, - MAP_ANON = 0x1000, - MAP_PRIVATE = 0x2, - MAP_FIXED = 0x10, - SA_SIGINFO = 0x40, - SA_RESTART = 0x2, - SA_ONSTACK = 0x1, - EINTR = 0x4, - SIGHUP = 0x1, - SIGINT = 0x2, - SIGQUIT = 0x3, - SIGILL = 0x4, - SIGTRAP = 0x5, - SIGABRT = 0x6, - SIGEMT = 0x7, - SIGFPE = 0x8, - SIGKILL = 0x9, - SIGBUS = 0xa, - SIGSEGV = 0xb, - SIGSYS = 0xc, - SIGPIPE = 0xd, - SIGALRM = 0xe, - SIGTERM = 0xf, - SIGURG = 0x10, - SIGSTOP = 0x11, - SIGTSTP = 0x12, - SIGCONT = 0x13, - SIGCHLD = 0x14, - SIGTTIN = 0x15, - SIGTTOU = 0x16, - SIGIO = 0x17, - SIGXCPU = 0x18, - SIGXFSZ = 0x19, - SIGVTALRM = 0x1a, - SIGPROF = 0x1b, - SIGWINCH = 0x1c, - SIGINFO = 0x1d, - SIGUSR1 = 0x1e, - SIGUSR2 = 0x1f, - FPE_INTDIV = 0x1, - FPE_INTOVF = 0x2, - FPE_FLTDIV = 0x3, - FPE_FLTOVF = 0x4, - FPE_FLTUND = 0x5, - FPE_FLTRES = 0x6, - FPE_FLTINV = 0x7, - FPE_FLTSUB = 0x8, - BUS_ADRALN = 0x1, - BUS_ADRERR = 0x2, - BUS_OBJERR = 0x3, - SEGV_MAPERR = 0x1, - SEGV_ACCERR = 0x2, - ITIMER_REAL = 0, - ITIMER_VIRTUAL = 0x1, - ITIMER_PROF = 0x2, -}; + PROT_NONE = 0x0, + PROT_READ = 0x1, + PROT_WRITE = 0x2, + PROT_EXEC = 0x4, -// Types -#pragma pack on + MAP_ANON = 0x1000, + MAP_PRIVATE = 0x2, + MAP_FIXED = 0x10, -typedef struct Sigaltstack Sigaltstack; -struct Sigaltstack { - void *ss_sp; - uint32 ss_size; - int32 ss_flags; -}; + MADV_FREE = 0x6, + + SA_SIGINFO = 0x40, + SA_RESTART = 0x2, + SA_ONSTACK = 0x1, + + EINTR = 0x4, + + SIGHUP = 0x1, + SIGINT = 0x2, + SIGQUIT = 0x3, + SIGILL = 0x4, + SIGTRAP = 0x5, + SIGABRT = 0x6, + SIGEMT = 0x7, + SIGFPE = 0x8, + SIGKILL = 0x9, + SIGBUS = 0xa, + SIGSEGV = 0xb, + SIGSYS = 0xc, + SIGPIPE = 0xd, + SIGALRM = 0xe, + SIGTERM = 0xf, + SIGURG = 0x10, + SIGSTOP = 0x11, + SIGTSTP = 0x12, + SIGCONT = 0x13, + SIGCHLD = 0x14, + SIGTTIN = 0x15, + SIGTTOU = 0x16, + SIGIO = 0x17, + SIGXCPU = 0x18, + SIGXFSZ = 0x19, + SIGVTALRM = 0x1a, + SIGPROF = 0x1b, + SIGWINCH = 0x1c, + SIGINFO = 0x1d, + SIGUSR1 = 0x1e, + SIGUSR2 = 0x1f, + + FPE_INTDIV = 0x1, + FPE_INTOVF = 0x2, + FPE_FLTDIV = 0x3, + FPE_FLTOVF = 0x4, + FPE_FLTUND = 0x5, + FPE_FLTRES = 0x6, + FPE_FLTINV = 0x7, + FPE_FLTSUB = 0x8, + + BUS_ADRALN = 0x1, + BUS_ADRERR = 0x2, + BUS_OBJERR = 0x3, -typedef uint32 Sigset; + SEGV_MAPERR = 0x1, + SEGV_ACCERR = 0x2, + ITIMER_REAL = 0x0, + ITIMER_VIRTUAL = 0x1, + ITIMER_PROF = 0x2, +}; + +typedef struct Tfork Tfork; +typedef struct Sigaltstack Sigaltstack; +typedef struct Sigcontext Sigcontext; typedef struct Siginfo Siginfo; -struct Siginfo { - int32 si_signo; - int32 si_code; - int32 si_errno; - byte _data[116]; +typedef struct StackT StackT; +typedef struct Timespec Timespec; +typedef struct Timeval Timeval; +typedef struct Itimerval Itimerval; + +#pragma pack on + +struct Tfork { + byte *tf_tcb; + int32 *tf_tid; + byte *tf_stack; }; -typedef union Sigval Sigval; -union Sigval { - int32 sival_int; - void *sival_ptr; +struct Sigaltstack { + byte *ss_sp; + uint32 ss_size; + int32 ss_flags; }; +struct Sigcontext { + int32 sc_gs; + int32 sc_fs; + int32 sc_es; + int32 sc_ds; + int32 sc_edi; + int32 sc_esi; + int32 sc_ebp; + int32 sc_ebx; + int32 sc_edx; + int32 sc_ecx; + int32 sc_eax; + int32 sc_eip; + int32 sc_cs; + int32 sc_eflags; + int32 sc_esp; + int32 sc_ss; + int32 sc_onstack; + int32 sc_mask; + int32 sc_trapno; + int32 sc_err; + void *sc_fpstate; +}; +struct Siginfo { + int32 si_signo; + int32 si_code; + int32 si_errno; + byte _data[116]; +}; +typedef uint32 Sigset; +typedef byte Sigval[4]; -typedef struct StackT StackT; struct 
StackT { - void *ss_sp; - uint32 ss_size; - int32 ss_flags; + byte *ss_sp; + uint32 ss_size; + int32 ss_flags; }; -typedef struct Timespec Timespec; struct Timespec { - int32 tv_sec; - int32 tv_nsec; + int32 tv_sec; + int32 tv_nsec; }; - -typedef struct Timeval Timeval; struct Timeval { - int32 tv_sec; - int32 tv_usec; + int32 tv_sec; + int32 tv_usec; }; - -typedef struct Itimerval Itimerval; struct Itimerval { - Timeval it_interval; - Timeval it_value; + Timeval it_interval; + Timeval it_value; }; -typedef void sfxsave64; -typedef void usavefpu; - -typedef struct Sigcontext Sigcontext; -struct Sigcontext { - int32 sc_gs; - int32 sc_fs; - int32 sc_es; - int32 sc_ds; - int32 sc_edi; - int32 sc_esi; - int32 sc_ebp; - int32 sc_ebx; - int32 sc_edx; - int32 sc_ecx; - int32 sc_eax; - int32 sc_eip; - int32 sc_cs; - int32 sc_eflags; - int32 sc_esp; - int32 sc_ss; - int32 sc_onstack; - int32 sc_mask; - int32 sc_trapno; - int32 sc_err; - usavefpu *sc_fpstate; -}; #pragma pack off diff --git a/src/pkg/runtime/defs_openbsd_amd64.h b/src/pkg/runtime/defs_openbsd_amd64.h index 27bf4b9d6..429cc99f0 100644 --- a/src/pkg/runtime/defs_openbsd_amd64.h +++ b/src/pkg/runtime/defs_openbsd_amd64.h @@ -1,158 +1,162 @@ -// godefs -f -m64 defs.c +// Created by cgo -cdefs - DO NOT EDIT +// cgo -cdefs defs_openbsd.go -// MACHINE GENERATED - DO NOT EDIT. -// Constants enum { - PROT_NONE = 0, - PROT_READ = 0x1, - PROT_WRITE = 0x2, - PROT_EXEC = 0x4, - MAP_ANON = 0x1000, - MAP_PRIVATE = 0x2, - MAP_FIXED = 0x10, - SA_SIGINFO = 0x40, - SA_RESTART = 0x2, - SA_ONSTACK = 0x1, - EINTR = 0x4, - SIGHUP = 0x1, - SIGINT = 0x2, - SIGQUIT = 0x3, - SIGILL = 0x4, - SIGTRAP = 0x5, - SIGABRT = 0x6, - SIGEMT = 0x7, - SIGFPE = 0x8, - SIGKILL = 0x9, - SIGBUS = 0xa, - SIGSEGV = 0xb, - SIGSYS = 0xc, - SIGPIPE = 0xd, - SIGALRM = 0xe, - SIGTERM = 0xf, - SIGURG = 0x10, - SIGSTOP = 0x11, - SIGTSTP = 0x12, - SIGCONT = 0x13, - SIGCHLD = 0x14, - SIGTTIN = 0x15, - SIGTTOU = 0x16, - SIGIO = 0x17, - SIGXCPU = 0x18, - SIGXFSZ = 0x19, - SIGVTALRM = 0x1a, - SIGPROF = 0x1b, - SIGWINCH = 0x1c, - SIGINFO = 0x1d, - SIGUSR1 = 0x1e, - SIGUSR2 = 0x1f, - FPE_INTDIV = 0x1, - FPE_INTOVF = 0x2, - FPE_FLTDIV = 0x3, - FPE_FLTOVF = 0x4, - FPE_FLTUND = 0x5, - FPE_FLTRES = 0x6, - FPE_FLTINV = 0x7, - FPE_FLTSUB = 0x8, - BUS_ADRALN = 0x1, - BUS_ADRERR = 0x2, - BUS_OBJERR = 0x3, - SEGV_MAPERR = 0x1, - SEGV_ACCERR = 0x2, - ITIMER_REAL = 0, - ITIMER_VIRTUAL = 0x1, - ITIMER_PROF = 0x2, -}; + PROT_NONE = 0x0, + PROT_READ = 0x1, + PROT_WRITE = 0x2, + PROT_EXEC = 0x4, -// Types -#pragma pack on + MAP_ANON = 0x1000, + MAP_PRIVATE = 0x2, + MAP_FIXED = 0x10, -typedef struct Sigaltstack Sigaltstack; -struct Sigaltstack { - void *ss_sp; - uint64 ss_size; - int32 ss_flags; - byte pad_godefs_0[4]; -}; + MADV_FREE = 0x6, + + SA_SIGINFO = 0x40, + SA_RESTART = 0x2, + SA_ONSTACK = 0x1, + + EINTR = 0x4, + + SIGHUP = 0x1, + SIGINT = 0x2, + SIGQUIT = 0x3, + SIGILL = 0x4, + SIGTRAP = 0x5, + SIGABRT = 0x6, + SIGEMT = 0x7, + SIGFPE = 0x8, + SIGKILL = 0x9, + SIGBUS = 0xa, + SIGSEGV = 0xb, + SIGSYS = 0xc, + SIGPIPE = 0xd, + SIGALRM = 0xe, + SIGTERM = 0xf, + SIGURG = 0x10, + SIGSTOP = 0x11, + SIGTSTP = 0x12, + SIGCONT = 0x13, + SIGCHLD = 0x14, + SIGTTIN = 0x15, + SIGTTOU = 0x16, + SIGIO = 0x17, + SIGXCPU = 0x18, + SIGXFSZ = 0x19, + SIGVTALRM = 0x1a, + SIGPROF = 0x1b, + SIGWINCH = 0x1c, + SIGINFO = 0x1d, + SIGUSR1 = 0x1e, + SIGUSR2 = 0x1f, + + FPE_INTDIV = 0x1, + FPE_INTOVF = 0x2, + FPE_FLTDIV = 0x3, + FPE_FLTOVF = 0x4, + FPE_FLTUND = 0x5, + FPE_FLTRES = 0x6, + FPE_FLTINV = 0x7, + FPE_FLTSUB = 0x8, 
+ + BUS_ADRALN = 0x1, + BUS_ADRERR = 0x2, + BUS_OBJERR = 0x3, -typedef uint32 Sigset; + SEGV_MAPERR = 0x1, + SEGV_ACCERR = 0x2, + ITIMER_REAL = 0x0, + ITIMER_VIRTUAL = 0x1, + ITIMER_PROF = 0x2, +}; + +typedef struct Tfork Tfork; +typedef struct Sigaltstack Sigaltstack; +typedef struct Sigcontext Sigcontext; typedef struct Siginfo Siginfo; -struct Siginfo { - int32 si_signo; - int32 si_code; - int32 si_errno; - byte pad_godefs_0[4]; - byte _data[120]; +typedef struct StackT StackT; +typedef struct Timespec Timespec; +typedef struct Timeval Timeval; +typedef struct Itimerval Itimerval; + +#pragma pack on + +struct Tfork { + byte *tf_tcb; + int32 *tf_tid; + byte *tf_stack; }; -typedef union Sigval Sigval; -union Sigval { - int32 sival_int; - void *sival_ptr; +struct Sigaltstack { + byte *ss_sp; + uint64 ss_size; + int32 ss_flags; + byte Pad_cgo_0[4]; }; +struct Sigcontext { + int64 sc_rdi; + int64 sc_rsi; + int64 sc_rdx; + int64 sc_rcx; + int64 sc_r8; + int64 sc_r9; + int64 sc_r10; + int64 sc_r11; + int64 sc_r12; + int64 sc_r13; + int64 sc_r14; + int64 sc_r15; + int64 sc_rbp; + int64 sc_rbx; + int64 sc_rax; + int64 sc_gs; + int64 sc_fs; + int64 sc_es; + int64 sc_ds; + int64 sc_trapno; + int64 sc_err; + int64 sc_rip; + int64 sc_cs; + int64 sc_rflags; + int64 sc_rsp; + int64 sc_ss; + void *sc_fpstate; + int32 sc_onstack; + int32 sc_mask; +}; +struct Siginfo { + int32 si_signo; + int32 si_code; + int32 si_errno; + byte Pad_cgo_0[4]; + byte _data[120]; +}; +typedef uint32 Sigset; +typedef byte Sigval[8]; -typedef struct StackT StackT; struct StackT { - void *ss_sp; - uint64 ss_size; - int32 ss_flags; - byte pad_godefs_0[4]; + byte *ss_sp; + uint64 ss_size; + int32 ss_flags; + byte Pad_cgo_0[4]; }; -typedef struct Timespec Timespec; struct Timespec { - int32 tv_sec; - byte pad_godefs_0[4]; - int64 tv_nsec; + int32 tv_sec; + byte Pad_cgo_0[4]; + int64 tv_nsec; }; - -typedef struct Timeval Timeval; struct Timeval { - int64 tv_sec; - int64 tv_usec; + int64 tv_sec; + int64 tv_usec; }; - -typedef struct Itimerval Itimerval; struct Itimerval { - Timeval it_interval; - Timeval it_value; + Timeval it_interval; + Timeval it_value; }; -typedef void sfxsave64; -typedef void usavefpu; - -typedef struct Sigcontext Sigcontext; -struct Sigcontext { - int64 sc_rdi; - int64 sc_rsi; - int64 sc_rdx; - int64 sc_rcx; - int64 sc_r8; - int64 sc_r9; - int64 sc_r10; - int64 sc_r11; - int64 sc_r12; - int64 sc_r13; - int64 sc_r14; - int64 sc_r15; - int64 sc_rbp; - int64 sc_rbx; - int64 sc_rax; - int64 sc_gs; - int64 sc_fs; - int64 sc_es; - int64 sc_ds; - int64 sc_trapno; - int64 sc_err; - int64 sc_rip; - int64 sc_cs; - int64 sc_rflags; - int64 sc_rsp; - int64 sc_ss; - sfxsave64 *sc_fpstate; - int32 sc_onstack; - int32 sc_mask; -}; #pragma pack off diff --git a/src/pkg/runtime/defs_plan9_386.h b/src/pkg/runtime/defs_plan9_386.h index 58fd9d94d..bde299dee 100644 --- a/src/pkg/runtime/defs_plan9_386.h +++ b/src/pkg/runtime/defs_plan9_386.h @@ -1,2 +1,29 @@ -// nothing to see here -#define tos_pid 48 +#define PAGESIZE 0x1000 + +typedef struct Ureg Ureg; + +struct Ureg +{ + uint32 di; /* general registers */ + uint32 si; /* ... */ + uint32 bp; /* ... */ + uint32 nsp; + uint32 bx; /* ... */ + uint32 dx; /* ... */ + uint32 cx; /* ... */ + uint32 ax; /* ... */ + uint32 gs; /* data segments */ + uint32 fs; /* ... */ + uint32 es; /* ... */ + uint32 ds; /* ... 
*/ + uint32 trap; /* trap type */ + uint32 ecode; /* error code (or zero) */ + uint32 pc; /* pc */ + uint32 cs; /* old context */ + uint32 flags; /* old flags */ + union { + uint32 usp; + uint32 sp; + }; + uint32 ss; /* old stack segment */ +}; diff --git a/src/pkg/runtime/defs_plan9_amd64.h b/src/pkg/runtime/defs_plan9_amd64.h new file mode 100644 index 000000000..d8fec67eb --- /dev/null +++ b/src/pkg/runtime/defs_plan9_amd64.h @@ -0,0 +1,34 @@ +#define PAGESIZE 0x200000ULL + +typedef struct Ureg Ureg; + +struct Ureg { + uint64 ax; + uint64 bx; + uint64 cx; + uint64 dx; + uint64 si; + uint64 di; + uint64 bp; + uint64 r8; + uint64 r9; + uint64 r10; + uint64 r11; + uint64 r12; + uint64 r13; + uint64 r14; + uint64 r15; + + uint16 ds; + uint16 es; + uint16 fs; + uint16 gs; + + uint64 type; + uint64 error; /* error code (or zero) */ + uint64 ip; /* pc */ + uint64 cs; /* old context */ + uint64 flags; /* old flags */ + uint64 sp; /* sp */ + uint64 ss; /* old stack segment */ +}; diff --git a/src/pkg/runtime/env_plan9.c b/src/pkg/runtime/env_plan9.c new file mode 100644 index 000000000..848d73303 --- /dev/null +++ b/src/pkg/runtime/env_plan9.c @@ -0,0 +1,33 @@ +// Copyright 2012 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#include "runtime.h" +#include "os_GOOS.h" + +byte* +runtime·getenv(int8 *s) +{ + int32 fd, len, n, r; + byte file[128]; + byte *p; + + len = runtime·findnull((byte*)s); + if(len > sizeof file-6) + return nil; + + runtime·memclr(file, sizeof file); + runtime·memmove((void*)file, (void*)"/env/", 5); + runtime·memmove((void*)(file+5), (void*)s, len); + + fd = runtime·open(file, OREAD); + if(fd < 0) + return nil; + n = runtime·seek(fd, 0, 2); + p = runtime·malloc(n+1); + r = runtime·pread(fd, p, n, 0); + runtime·close(fd); + if(r < 0) + return nil; + return p; +} diff --git a/src/pkg/runtime/env_posix.c b/src/pkg/runtime/env_posix.c new file mode 100644 index 000000000..8333811fb --- /dev/null +++ b/src/pkg/runtime/env_posix.c @@ -0,0 +1,61 @@ +// Copyright 2012 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// +build darwin freebsd linux netbsd openbsd windows + +#include "runtime.h" + +Slice syscall·envs; + +byte* +runtime·getenv(int8 *s) +{ + int32 i, j, len; + byte *v, *bs; + String* envv; + int32 envc; + + bs = (byte*)s; + len = runtime·findnull(bs); + envv = (String*)syscall·envs.array; + envc = syscall·envs.len; + for(i=0; i<envc; i++){ + if(envv[i].len <= len) + continue; + v = envv[i].str; + for(j=0; j<len; j++) + if(bs[j] != v[j]) + goto nomatch; + if(v[len] != '=') + goto nomatch; + return v+len+1; + nomatch:; + } + return nil; +} + +void (*_cgo_setenv)(byte**); + +// Update the C environment if cgo is loaded. +// Called from syscall.Setenv. 
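
For illustration only, not part of this diff: once this hook is wired up, a cgo binary that calls os.Setenv (which goes through syscall.Setenv) should see the update from the C side as well. A minimal sketch — the variable name is arbitrary and the C string allocation is leaked for brevity:

package main

/*
#include <stdlib.h>
*/
import "C"

import (
	"fmt"
	"os"
)

func main() {
	os.Setenv("GREETING", "hello") // forwarded to C via the _cgo_setenv hook
	fmt.Println(C.GoString(C.getenv(C.CString("GREETING"))))
}
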
+void +syscall·setenv_c(String k, String v) +{ + byte *arg[2]; + + if(_cgo_setenv == nil) + return; + + arg[0] = runtime·malloc(k.len + 1); + runtime·memmove(arg[0], k.str, k.len); + arg[0][k.len] = 0; + + arg[1] = runtime·malloc(v.len + 1); + runtime·memmove(arg[1], v.str, v.len); + arg[1][v.len] = 0; + + runtime·asmcgocall((void*)_cgo_setenv, arg); + runtime·free(arg[0]); + runtime·free(arg[1]); +} diff --git a/src/pkg/runtime/export_test.go b/src/pkg/runtime/export_test.go index 51921135b..062aea248 100644 --- a/src/pkg/runtime/export_test.go +++ b/src/pkg/runtime/export_test.go @@ -25,3 +25,45 @@ var Entersyscall = entersyscall var Exitsyscall = exitsyscall var LockedOSThread = golockedOSThread var Stackguard = stackguard + +type LFNode struct { + Next *LFNode + Pushcnt uintptr +} + +func lfstackpush(head *uint64, node *LFNode) +func lfstackpop2(head *uint64) *LFNode + +var LFStackPush = lfstackpush +var LFStackPop = lfstackpop2 + +type ParFor struct { + body *byte + done uint32 + Nthr uint32 + nthrmax uint32 + thrseq uint32 + Cnt uint32 + Ctx *byte + wait bool +} + +func parforalloc2(nthrmax uint32) *ParFor +func parforsetup2(desc *ParFor, nthr, n uint32, ctx *byte, wait bool, body func(*ParFor, uint32)) +func parfordo(desc *ParFor) +func parforiters(desc *ParFor, tid uintptr) (uintptr, uintptr) + +var NewParFor = parforalloc2 +var ParForSetup = parforsetup2 +var ParForDo = parfordo + +func ParForIters(desc *ParFor, tid uint32) (uint32, uint32) { + begin, end := parforiters(desc, uintptr(tid)) + return uint32(begin), uint32(end) +} + +func testSchedLocalQueue() +func testSchedLocalQueueSteal() + +var TestSchedLocalQueue1 = testSchedLocalQueue +var TestSchedLocalQueueSteal1 = testSchedLocalQueueSteal diff --git a/src/pkg/runtime/extern.go b/src/pkg/runtime/extern.go index d93259d7b..fbaffd1d5 100644 --- a/src/pkg/runtime/extern.go +++ b/src/pkg/runtime/extern.go @@ -42,8 +42,8 @@ type Func struct { // Keep in sync with runtime.h:struct Func pc0 uintptr // starting pc, ln for table ln0 int32 frame int32 // stack frame size - args int32 // number of 32-bit in/out args - locals int32 // number of 32-bit locals + args int32 // in/out args size + locals int32 // locals size } // FuncForPC returns a *Func describing the function that contains the @@ -67,7 +67,7 @@ func (f *Func) FileLine(pc uintptr) (file string, line int) { // implemented in symtab.c func funcline_go(*Func, uintptr) (string, int) -// mid returns the current os thread (m) id. +// mid returns the current OS thread (m) id. func mid() uint32 // SetFinalizer sets the finalizer associated with x to f. diff --git a/src/pkg/runtime/float.c b/src/pkg/runtime/float.c index f481519f6..42082e434 100644 --- a/src/pkg/runtime/float.c +++ b/src/pkg/runtime/float.c @@ -4,170 +4,7 @@ #include "runtime.h" -static uint64 uvnan = 0x7FF0000000000001ULL; -static uint64 uvinf = 0x7FF0000000000000ULL; -static uint64 uvneginf = 0xFFF0000000000000ULL; - -uint32 -runtime·float32tobits(float32 f) -{ - // The obvious cast-and-pointer code is technically - // not valid, and gcc miscompiles it. Use a union instead. - union { - float32 f; - uint32 i; - } u; - u.f = f; - return u.i; -} - -uint64 -runtime·float64tobits(float64 f) -{ - // The obvious cast-and-pointer code is technically - // not valid, and gcc miscompiles it. Use a union instead. - union { - float64 f; - uint64 i; - } u; - u.f = f; - return u.i; -} - -float64 -runtime·float64frombits(uint64 i) -{ - // The obvious cast-and-pointer code is technically - // not valid, and gcc miscompiles it. 
Use a union instead. - union { - float64 f; - uint64 i; - } u; - u.i = i; - return u.f; -} - -float32 -runtime·float32frombits(uint32 i) -{ - // The obvious cast-and-pointer code is technically - // not valid, and gcc miscompiles it. Use a union instead. - union { - float32 f; - uint32 i; - } u; - u.i = i; - return u.f; -} - -bool -runtime·isInf(float64 f, int32 sign) -{ - uint64 x; - - x = runtime·float64tobits(f); - if(sign == 0) - return x == uvinf || x == uvneginf; - if(sign > 0) - return x == uvinf; - return x == uvneginf; -} - -float64 -runtime·NaN(void) -{ - return runtime·float64frombits(uvnan); -} - -bool -runtime·isNaN(float64 f) -{ - uint64 x; - - x = runtime·float64tobits(f); - return ((uint32)(x>>52) & 0x7FF) == 0x7FF && !runtime·isInf(f, 0); -} - -float64 -runtime·Inf(int32 sign) -{ - if(sign >= 0) - return runtime·float64frombits(uvinf); - else - return runtime·float64frombits(uvneginf); -} - -enum -{ - MASK = 0x7ffL, - SHIFT = 64-11-1, - BIAS = 1022L, -}; - -float64 -runtime·frexp(float64 d, int32 *ep) -{ - uint64 x; - - if(d == 0) { - *ep = 0; - return 0; - } - x = runtime·float64tobits(d); - *ep = (int32)((x >> SHIFT) & MASK) - BIAS; - x &= ~((uint64)MASK << SHIFT); - x |= (uint64)BIAS << SHIFT; - return runtime·float64frombits(x); -} - -float64 -runtime·ldexp(float64 d, int32 e) -{ - uint64 x; - - if(d == 0) - return 0; - x = runtime·float64tobits(d); - e += (int32)(x >> SHIFT) & MASK; - if(e <= 0) - return 0; /* underflow */ - if(e >= MASK){ /* overflow */ - if(d < 0) - return runtime·Inf(-1); - return runtime·Inf(1); - } - x &= ~((uint64)MASK << SHIFT); - x |= (uint64)e << SHIFT; - return runtime·float64frombits(x); -} - -float64 -runtime·modf(float64 d, float64 *ip) -{ - float64 dd; - uint64 x; - int32 e; - - if(d < 1) { - if(d < 0) { - d = runtime·modf(-d, ip); - *ip = -*ip; - return -d; - } - *ip = 0; - return d; - } - - x = runtime·float64tobits(d); - e = (int32)((x >> SHIFT) & MASK) - BIAS; - - /* - * Keep the top 11+e bits; clear the rest. - */ - if(e <= 64-11) - x &= ~(((uint64)1 << (64LL-11LL-e))-1); - dd = runtime·float64frombits(x); - *ip = dd; - return d - dd; -} - +// used as float64 via runtime· names +uint64 ·nan = 0x7FF8000000000001ULL; +uint64 ·posinf = 0x7FF0000000000000ULL; +uint64 ·neginf = 0xFFF0000000000000ULL; diff --git a/src/pkg/runtime/gc_test.go b/src/pkg/runtime/gc_test.go index 65894a6fd..e1e1b1d01 100644 --- a/src/pkg/runtime/gc_test.go +++ b/src/pkg/runtime/gc_test.go @@ -5,37 +5,80 @@ package runtime_test import ( + "os" "runtime" "testing" ) func TestGcSys(t *testing.T) { + if os.Getenv("GOGC") == "off" { + t.Fatalf("GOGC=off in environment; test cannot pass") + } + data := struct{ Short bool }{testing.Short()} + got := executeTest(t, testGCSysSource, &data) + want := "OK\n" + if got != want { + t.Fatalf("expected %q, but got %q", want, got) + } +} + +const testGCSysSource = ` +package main + +import ( + "fmt" + "runtime" +) + +func main() { + runtime.GOMAXPROCS(1) memstats := new(runtime.MemStats) runtime.GC() runtime.ReadMemStats(memstats) sys := memstats.Sys + runtime.MemProfileRate = 0 // disable profiler + itercount := 1000000 - if testing.Short() { - itercount = 100000 - } +{{if .Short}} + itercount = 100000 +{{end}} for i := 0; i < itercount; i++ { workthegc() } // Should only be using a few MB. + // We allocated 100 MB or (if not short) 1 GB. 
runtime.ReadMemStats(memstats) if sys > memstats.Sys { sys = 0 } else { sys = memstats.Sys - sys } - t.Logf("used %d extra bytes", sys) - if sys > 4<<20 { - t.Fatalf("using too much memory: %d bytes", sys) + if sys > 16<<20 { + fmt.Printf("using too much memory: %d bytes\n", sys) + return } + fmt.Printf("OK\n") } func workthegc() []byte { return make([]byte, 1029) } +` + +func TestGcDeepNesting(t *testing.T) { + type T [2][2][2][2][2][2][2][2][2][2]*int + a := new(T) + + // Prevent the compiler from applying escape analysis. + // This makes sure new(T) is allocated on heap, not on the stack. + t.Logf("%p", a) + + a[0][0][0][0][0][0][0][0][0][0] = new(int) + *a[0][0][0][0][0][0][0][0][0][0] = 13 + runtime.GC() + if *a[0][0][0][0][0][0][0][0][0][0] != 13 { + t.Fail() + } +} diff --git a/src/pkg/runtime/hashmap.c b/src/pkg/runtime/hashmap.c index 63ed4e2a3..37111daa9 100644 --- a/src/pkg/runtime/hashmap.c +++ b/src/pkg/runtime/hashmap.c @@ -3,8 +3,11 @@ // license that can be found in the LICENSE file. #include "runtime.h" +#include "arch_GOARCH.h" +#include "malloc.h" #include "hashmap.h" #include "type.h" +#include "race.h" /* Hmap flag values */ #define IndirectVal (1<<0) /* storing pointers to values */ @@ -13,7 +16,7 @@ #define CanFreeKey (1<<3) /* okay to free pointers to keys */ struct Hmap { /* a hash table; initialize with hash_init() */ - uint32 count; /* elements in table - must be first */ + uintgo count; /* elements in table - must be first */ uint8 datasize; /* amount of data to store in entry */ uint8 flag; uint8 valoff; /* offset of value in key+value data block */ @@ -78,7 +81,7 @@ hash_subtable_new (Hmap *h, int32 power, int32 used) max_probes = 1 << power; } bytes += limit_bytes - elemsize; - st = malloc (offsetof (struct hash_subtable, entry[0]) + bytes); + st = runtime·mallocgc(offsetof (struct hash_subtable, entry[0]) + bytes, UseSpanType ? 
FlagNoPointers : 0, 1, 1); st->power = power; st->used = used; st->datasize = h->datasize; @@ -115,7 +118,7 @@ hash_init (Hmap *h, int32 datasize, int64 hint) if(datasize < sizeof (void *)) datasize = sizeof (void *); - datasize = runtime·rnd(datasize, sizeof (void *)); + datasize = ROUND(datasize, sizeof (void *)); init_sizes (hint, &init_power); h->datasize = datasize; assert (h->datasize == datasize); @@ -273,7 +276,7 @@ hash_lookup (MapType *t, Hmap *h, void *data, void **pres) struct hash_entry *end_e; void *key; bool eq; - + hash = h->hash0; (*t->key->alg->hash) (&hash, t->key->size, data); hash &= ~HASH_MASK; @@ -416,8 +419,12 @@ hash_insert_internal (MapType *t, struct hash_subtable **pst, int32 flags, hash_ *pres = ins_e->data; return (1); } - assert (e_hash != hash || (flags & HASH_REHASH) == 0); - hash += (e_hash == hash); /* adjust hash if it collides */ + if (e_hash == hash) { /* adjust hash if it collides */ + assert ((flags & HASH_REHASH) == 0); + hash++; + if ((hash & HASH_MASK) == HASH_SUBHASH) + runtime·throw("runtime: map hash collision overflow"); + } ins_e = HASH_OFFSET (ins_e, elemsize); ins_i++; if (e_hash <= hash) { /* set e to insertion point */ @@ -462,7 +469,7 @@ hash_insert (MapType *t, Hmap *h, void *data, void **pres) { uintptr hash; int32 rc; - + hash = h->hash0; (*t->key->alg->hash) (&hash, t->key->size, data); rc = hash_insert_internal (t, &h->st, 0, hash, h, data, pres); @@ -618,7 +625,7 @@ hash_iter_init (MapType *t, Hmap *h, struct hash_iter *it) it->subtable_state[0].e = h->st->entry; it->subtable_state[0].start = h->st->entry; it->subtable_state[0].last = h->st->last; - + // fastrand1 returns 31 useful bits. // We don't care about not having a bottom bit but we // do want top bits. @@ -700,6 +707,82 @@ hash_visit (Hmap *h, void (*data_visit) (void *arg, int32 level, void *data), vo hash_visit_internal (h->st, 0, 0, data_visit, arg); } +// Initialize the iterator. +// Returns false if Hmap contains no pointers (in which case the iterator is not initialized). +bool +hash_gciter_init (Hmap *h, struct hash_gciter *it) +{ + // GC during map initialization + if(h->st == nil) + return false; + + it->elemsize = h->datasize + offsetof (struct hash_entry, data[0]); + it->flag = h->flag; + it->valoff = h->valoff; + it->i = 0; + it->st = h->st; + it->subtable_state[it->i].e = h->st->entry; + it->subtable_state[it->i].last = h->st->last; + return true; +} + +// Returns true and fills *data with subtable/key/value data, +// or returns false if the iterator has terminated. 
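
hash_gciter_next, whose doc comment appears just above and whose body follows, walks the nested subtables iteratively: it keeps its position in the small fixed subtable_state array (four frames, per hashmap.h) and pushes or pops frames rather than recursing, which keeps the collector's stack usage bounded. A sketch of the same explicit-stack traversal in Go, over an invented nested-table shape (types and names here are for illustration only):

	package main

	import "fmt"

	// table is a stand-in for a hash subtable: each slot holds either a
	// value or a pointer to a nested table.
	type table struct {
		slots []interface{} // int values or *table subtables
	}

	// visit reaches every value without recursion, using an explicit stack
	// of (table, next-slot) frames, like hash_gciter's subtable_state.
	func visit(root *table, f func(int)) {
		type frame struct {
			t *table
			i int
		}
		stack := []frame{{root, 0}}
		for len(stack) > 0 {
			fr := &stack[len(stack)-1]
			if fr.i == len(fr.t.slots) {
				stack = stack[:len(stack)-1] // pop: this table is done
				continue
			}
			s := fr.t.slots[fr.i]
			fr.i++
			switch v := s.(type) {
			case *table:
				stack = append(stack, frame{v, 0}) // push the nested table
			case int:
				f(v)
			}
		}
	}

	func main() {
		inner := &table{slots: []interface{}{2, 3}}
		root := &table{slots: []interface{}{1, inner, 4}}
		visit(root, func(v int) { fmt.Println(v) }) // prints 1, 2, 3, 4
	}
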
+bool +hash_gciter_next (struct hash_gciter *it, struct hash_gciter_data *data) +{ + struct hash_entry *e; + struct hash_gciter_sub *sub; + + data->st = nil; + data->key_data = nil; + data->val_data = nil; + + // pointer to the first-level table + if(it->st != nil) { + data->st = it->st; + it->st = nil; + return true; + } + +popped: + sub = &it->subtable_state[it->i]; + e = sub->e; + while (e <= sub->last) { + if ((e->hash & HASH_MASK) == HASH_SUBHASH) { + struct hash_subtable *st = *(struct hash_subtable **)e->data; + data->st = st; + sub->e = HASH_OFFSET (e, it->elemsize); + + // push + it->i++; + assert (it->i < nelem(it->subtable_state)); + sub++; + sub->e = st->entry; + sub->last = st->last; + + return true; + } + if(e->hash != HASH_NIL) { + void *key_data = e->data; + void *val_data = (byte*)e->data + it->valoff; + data->key_data = key_data; + data->val_data = val_data; + data->indirectkey = (it->flag & IndirectKey) != 0; + data->indirectval = (it->flag & IndirectVal) != 0; + sub->e = HASH_OFFSET (e, it->elemsize); + return true; + } + e = HASH_OFFSET (e, it->elemsize); + } + if(it->i != 0) { + // pop + it->i--; + goto popped; + } + return false; +} + // /// interfaces to go runtime // @@ -724,14 +807,13 @@ hash_keyptr(Hmap *h, void *p) static int32 debug = 0; -// makemap(typ *Type, hint uint32) (hmap *map[any]any); Hmap* runtime·makemap_c(MapType *typ, int64 hint) { Hmap *h; Type *key, *val; uintptr ksize, vsize; - + key = typ->key; val = typ->elem; @@ -744,8 +826,15 @@ runtime·makemap_c(MapType *typ, int64 hint) h = runtime·mal(sizeof(*h)); h->flag |= CanFreeTable; /* until reflect gets involved, free is okay */ - ksize = runtime·rnd(key->size, sizeof(void*)); - vsize = runtime·rnd(val->size, sizeof(void*)); + if(UseSpanType) { + if(false) { + runtime·printf("makemap %S: %p\n", *typ->string, h); + } + runtime·settype(h, (uintptr)typ | TypeInfo_Map); + } + + ksize = ROUND(key->size, sizeof(void*)); + vsize = ROUND(val->size, sizeof(void*)); if(ksize > MaxData || vsize > MaxData || ksize+vsize > MaxData) { // Either key is too big, or value is, or combined they are. // Prefer to keep the key if possible, because we look at @@ -828,8 +917,11 @@ runtime·mapaccess1(MapType *t, Hmap *h, ...) byte *ak, *av; bool pres; + if(raceenabled && h != nil) + runtime·racereadpc(h, runtime·getcallerpc(&t), runtime·mapaccess1); + ak = (byte*)(&h + 1); - av = ak + runtime·rnd(t->key->size, Structrnd); + av = ak + ROUND(t->key->size, Structrnd); runtime·mapaccess(t, h, ak, av, &pres); @@ -853,8 +945,11 @@ runtime·mapaccess2(MapType *t, Hmap *h, ...) { byte *ak, *av, *ap; + if(raceenabled && h != nil) + runtime·racereadpc(h, runtime·getcallerpc(&t), runtime·mapaccess2); + ak = (byte*)(&h + 1); - av = ak + runtime·rnd(t->key->size, Structrnd); + av = ak + ROUND(t->key->size, Structrnd); ap = av + t->elem->size; runtime·mapaccess(t, h, ak, av, ap); @@ -881,6 +976,9 @@ reflect·mapaccess(MapType *t, Hmap *h, uintptr key, uintptr val, bool pres) { byte *ak, *av; + if(raceenabled && h != nil) + runtime·racereadpc(h, runtime·getcallerpc(&t), reflect·mapaccess); + if(t->key->size <= sizeof(key)) ak = (byte*)&key; else @@ -951,8 +1049,10 @@ runtime·mapassign1(MapType *t, Hmap *h, ...) 
if(h == nil) runtime·panicstring("assignment to entry in nil map"); + if(raceenabled) + runtime·racewritepc(h, runtime·getcallerpc(&t), runtime·mapassign1); ak = (byte*)(&h + 1); - av = ak + runtime·rnd(t->key->size, t->elem->align); + av = ak + ROUND(t->key->size, t->elem->align); runtime·mapassign(t, h, ak, av); } @@ -965,8 +1065,10 @@ runtime·mapdelete(MapType *t, Hmap *h, ...) byte *ak; if(h == nil) - runtime·panicstring("deletion of entry in nil map"); + return; + if(raceenabled) + runtime·racewritepc(h, runtime·getcallerpc(&t), runtime·mapdelete); ak = (byte*)(&h + 1); runtime·mapassign(t, h, ak, nil); @@ -990,6 +1092,8 @@ reflect·mapassign(MapType *t, Hmap *h, uintptr key, uintptr val, bool pres) if(h == nil) runtime·panicstring("assignment to entry in nil map"); + if(raceenabled) + runtime·racewritepc(h, runtime·getcallerpc(&t), reflect·mapassign); if(t->key->size <= sizeof(key)) ak = (byte*)&key; else @@ -1011,6 +1115,8 @@ runtime·mapiterinit(MapType *t, Hmap *h, struct hash_iter *it) it->data = nil; return; } + if(raceenabled) + runtime·racereadpc(h, runtime·getcallerpc(&t), runtime·mapiterinit); hash_iter_init(t, h, it); it->data = hash_next(it); if(debug) { @@ -1054,6 +1160,8 @@ reflect·mapiterinit(MapType *t, Hmap *h, struct hash_iter *it) void runtime·mapiternext(struct hash_iter *it) { + if(raceenabled) + runtime·racereadpc(it->h, runtime·getcallerpc(&it), runtime·mapiternext); if(runtime·gcwaiting) runtime·gosched(); @@ -1148,15 +1256,18 @@ reflect·mapiterkey(struct hash_iter *it, uintptr key, bool ok) } // For reflect: -// func maplen(h map) (len int32) +// func maplen(h map) (len int) // Like len(m) in the actual language, we treat the nil map as length 0. void -reflect·maplen(Hmap *h, int32 len) +reflect·maplen(Hmap *h, intgo len) { if(h == nil) len = 0; - else + else { len = h->count; + if(raceenabled) + runtime·racereadpc(h, runtime·getcallerpc(&h), reflect·maplen); + } FLUSH(&len); } @@ -1171,7 +1282,7 @@ runtime·mapiter2(struct hash_iter *it, ...) t = it->t; ak = (byte*)(&it + 1); - av = ak + runtime·rnd(t->key->size, t->elem->align); + av = ak + ROUND(t->key->size, t->elem->align); res = it->data; if(res == nil) diff --git a/src/pkg/runtime/hashmap.h b/src/pkg/runtime/hashmap.h index 4c10cf6ef..9b82f299e 100644 --- a/src/pkg/runtime/hashmap.h +++ b/src/pkg/runtime/hashmap.h @@ -63,7 +63,6 @@ } */ -#define malloc runtime·mal #define memset(a,b,c) runtime·memclr((byte*)(a), (uint32)(c)) #define memcpy(a,b,c) runtime·memmove((byte*)(a),(byte*)(b),(uint32)(c)) #define assert(a) if(!(a)) runtime·throw("hashmap assert") @@ -143,7 +142,7 @@ struct hash_iter { Remove all sub-tables associated with *h. This undoes the effects of hash_init(). If other memory pointed to by user data must be freed, the caller is - responsible for doiing do by iterating over *h first; see + responsible for doing so by iterating over *h first; see hash_iter_init()/hash_next(). */ // void hash_destroy (struct hash *h); @@ -152,7 +151,7 @@ struct hash_iter { /* Initialize *it from *h. */ // void hash_iter_init (struct hash *h, struct hash_iter *it); -/* Return the next used entry in the table which which *it was initialized. */ +/* Return the next used entry in the table with which *it was initialized. */ // void *hash_next (struct hash_iter *it); /*---- test interface ----*/ @@ -160,3 +159,27 @@ struct hash_iter { whether used or not. "level" is the subtable level, 0 means first level. 
*/ /* TESTING ONLY: DO NOT USE THIS ROUTINE IN NORMAL CODE */ // void hash_visit (struct hash *h, void (*data_visit) (void *arg, int32 level, void *data), void *arg); + +/* Used by the garbage collector */ +struct hash_gciter +{ + int32 elemsize; + uint8 flag; + uint8 valoff; + uint32 i; /* stack pointer in subtable_state */ + struct hash_subtable *st; + struct hash_gciter_sub { + struct hash_entry *e; /* pointer into subtable */ + struct hash_entry *last; /* last entry in subtable */ + } subtable_state[4]; +}; +struct hash_gciter_data +{ + struct hash_subtable *st; /* subtable pointer, or nil */ + uint8 *key_data; /* key data, or nil */ + uint8 *val_data; /* value data, or nil */ + bool indirectkey; /* storing pointers to keys */ + bool indirectval; /* storing pointers to values */ +}; +bool hash_gciter_init (struct Hmap *h, struct hash_gciter *it); +bool hash_gciter_next (struct hash_gciter *it, struct hash_gciter_data *data); diff --git a/src/pkg/runtime/iface.c b/src/pkg/runtime/iface.c index 2b60c4f23..370edffb8 100644 --- a/src/pkg/runtime/iface.c +++ b/src/pkg/runtime/iface.c @@ -5,6 +5,7 @@ #include "runtime.h" #include "arch_GOARCH.h" #include "type.h" +#include "typekind.h" #include "malloc.h" void @@ -19,19 +20,6 @@ runtime·printeface(Eface e) runtime·printf("(%p,%p)", e.type, e.data); } -/* - * layout of Itab known to compilers - */ -struct Itab -{ - InterfaceType* inter; - Type* type; - Itab* link; - int32 bad; - int32 unused; - void (*fun[])(void); -}; - static Itab* hash[1009]; static Lock ifacelock; @@ -182,19 +170,37 @@ copyout(Type *t, void **src, void *dst) alg->copy(size, dst, *src); } -// func convT2I(typ *byte, typ2 *byte, elem any) (ret any) #pragma textflag 7 void -runtime·convT2I(Type *t, InterfaceType *inter, ...) +runtime·typ2Itab(Type *t, InterfaceType *inter, Itab **cache, Itab *ret) +{ + Itab *tab; + + tab = itab(inter, t, 0); + runtime·atomicstorep(cache, tab); + ret = tab; + FLUSH(&ret); +} + +// func convT2I(typ *byte, typ2 *byte, cache **byte, elem any) (ret any) +#pragma textflag 7 +void +runtime·convT2I(Type *t, InterfaceType *inter, Itab **cache, ...) { byte *elem; Iface *ret; + Itab *tab; int32 wid; - elem = (byte*)(&inter+1); + elem = (byte*)(&cache+1); wid = t->size; - ret = (Iface*)(elem + runtime·rnd(wid, Structrnd)); - ret->tab = itab(inter, t, 0); + ret = (Iface*)(elem + ROUND(wid, Structrnd)); + tab = runtime·atomicloadp(cache); + if(!tab) { + tab = itab(inter, t, 0); + runtime·atomicstorep(cache, tab); + } + ret->tab = tab; copyin(t, elem, &ret->data); } @@ -209,7 +215,7 @@ runtime·convT2E(Type *t, ...) elem = (byte*)(&t+1); wid = t->size; - ret = (Eface*)(elem + runtime·rnd(wid, Structrnd)); + ret = (Eface*)(elem + ROUND(wid, Structrnd)); ret->type = t; copyin(t, elem, &ret->data); } @@ -272,6 +278,13 @@ runtime·assertI2T2(Type *t, Iface i, ...) copyout(t, &i.data, ret); } +void +runtime·assertI2TOK(Type *t, Iface i, bool ok) +{ + ok = i.tab!=nil && i.tab->type==t; + FLUSH(&ok); +} + static void assertE2Tret(Type *t, Eface e, byte *ret); // func ifaceE2T(typ *byte, iface any) (ret any) @@ -328,6 +341,13 @@ runtime·assertE2T2(Type *t, Eface e, ...) 
copyout(t, &e.data, ret); } +void +runtime·assertE2TOK(Type *t, Eface e, bool ok) +{ + ok = t==e.type; + FLUSH(&ok); +} + // func convI2E(elem any) (ret any) void runtime·convI2E(Iface i, Eface ret) @@ -387,7 +407,7 @@ void runtime·convI2I(InterfaceType* inter, Iface i, Iface ret) { Itab *tab; - + ret.data = i.data; if((tab = i.tab) == nil) ret.tab = nil; @@ -526,10 +546,10 @@ runtime·assertE2E2(InterfaceType* inter, Eface e, Eface ret, bool ok) } static uintptr -ifacehash1(void *data, Type *t) +ifacehash1(void *data, Type *t, uintptr h) { Alg *alg; - uintptr size, h; + uintptr size; Eface err; if(t == nil) @@ -543,7 +563,6 @@ ifacehash1(void *data, Type *t) runtime·newErrorString(runtime·catstring(runtime·gostringnocopy((byte*)"hash of unhashable type "), *t->string), &err); runtime·panic(err); } - h = 0; if(size <= sizeof(data)) alg->hash(&h, size, &data); else @@ -552,17 +571,17 @@ ifacehash1(void *data, Type *t) } uintptr -runtime·ifacehash(Iface a) +runtime·ifacehash(Iface a, uintptr h) { if(a.tab == nil) - return 0; - return ifacehash1(a.data, a.tab->type); + return h; + return ifacehash1(a.data, a.tab->type, h); } uintptr -runtime·efacehash(Eface a) +runtime·efacehash(Eface a, uintptr h) { - return ifacehash1(a.data, a.type); + return ifacehash1(a.data, a.type, h); } static bool @@ -666,39 +685,54 @@ reflect·unsafe_Typeof(Eface e, Eface ret) } void -reflect·unsafe_New(Eface typ, void *ret) +reflect·unsafe_New(Type *t, void *ret) { - Type *t; + uint32 flag; - // Reflect library has reinterpreted typ - // as its own kind of type structure. - // We know that the pointer to the original - // type structure sits before the data pointer. - t = (Type*)((Eface*)typ.data-1); + flag = t->kind&KindNoPointers ? FlagNoPointers : 0; + ret = runtime·mallocgc(t->size, flag, 1, 1); + + if(UseSpanType && !flag) { + if(false) { + runtime·printf("unsafe_New %S: %p\n", *t->string, ret); + } + runtime·settype(ret, (uintptr)t | TypeInfo_SingleObject); + } - if(t->kind&KindNoPointers) - ret = runtime·mallocgc(t->size, FlagNoPointers, 1, 1); - else - ret = runtime·mal(t->size); FLUSH(&ret); } void -reflect·unsafe_NewArray(Eface typ, uint32 n, void *ret) +reflect·unsafe_NewArray(Type *t, intgo n, void *ret) { uint64 size; - Type *t; - // Reflect library has reinterpreted typ - // as its own kind of type structure. - // We know that the pointer to the original - // type structure sits before the data pointer. - t = (Type*)((Eface*)typ.data-1); - size = n*t->size; - if(t->kind&KindNoPointers) + if(size == 0) + ret = (byte*)&runtime·zerobase; + else if(t->kind&KindNoPointers) ret = runtime·mallocgc(size, FlagNoPointers, 1, 1); - else - ret = runtime·mal(size); + else { + ret = runtime·mallocgc(size, 0, 1, 1); + + if(UseSpanType) { + if(false) { + runtime·printf("unsafe_NewArray [%D]%S: %p\n", (int64)n, *t->string, ret); + } + runtime·settype(ret, (uintptr)t | TypeInfo_Array); + } + } + + FLUSH(&ret); +} + +void +reflect·typelinks(Slice ret) +{ + extern Type *typelink[], *etypelink[]; + static int32 first = 1; + ret.array = (byte*)typelink; + ret.len = etypelink - typelink; + ret.cap = ret.len; FLUSH(&ret); } diff --git a/src/pkg/runtime/iface_test.go b/src/pkg/runtime/iface_test.go new file mode 100644 index 000000000..bca0ea0ee --- /dev/null +++ b/src/pkg/runtime/iface_test.go @@ -0,0 +1,138 @@ +// Copyright 2012 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. 
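
On the convT2I rework above: the conversion now carries a per-call-site itab cache. It atomically loads a previously published itab and only falls back to the itab() lookup on a miss, publishing the result with an atomic store; losing a race is harmless because the lookup is deterministic, so duplicate stores write the same value. A Go sketch of that publish-once pattern, with a hypothetical expensiveLookup standing in for itab() and atomic.Value standing in for atomicloadp/atomicstorep:

	package main

	import (
		"fmt"
		"sync/atomic"
	)

	// cache holds one lazily computed, immutable result per call site.
	var cache atomic.Value // stores string

	// expensiveLookup stands in for itab(): deterministic, so duplicated
	// work from a lost race produces the same value.
	func expensiveLookup() string {
		return "computed-itab"
	}

	func get() string {
		if v := cache.Load(); v != nil {
			return v.(string) // fast path: already published
		}
		v := expensiveLookup()
		cache.Store(v) // racing stores all write the same value
		return v
	}

	func main() {
		fmt.Println(get())
		fmt.Println(get()) // second call hits the cached value
	}
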
+ +package runtime_test + +import ( + "testing" +) + +type I1 interface { + Method1() +} + +type I2 interface { + Method1() + Method2() +} + +type TS uint16 +type TM uintptr +type TL [2]uintptr + +func (TS) Method1() {} +func (TS) Method2() {} +func (TM) Method1() {} +func (TM) Method2() {} +func (TL) Method1() {} +func (TL) Method2() {} + +var ( + e interface{} + e_ interface{} + i1 I1 + i2 I2 + ts TS + tm TM + tl TL +) + +func BenchmarkConvT2ESmall(b *testing.B) { + for i := 0; i < b.N; i++ { + e = ts + } +} + +func BenchmarkConvT2EUintptr(b *testing.B) { + for i := 0; i < b.N; i++ { + e = tm + } +} + +func BenchmarkConvT2ELarge(b *testing.B) { + for i := 0; i < b.N; i++ { + e = tl + } +} + +func BenchmarkConvT2ISmall(b *testing.B) { + for i := 0; i < b.N; i++ { + i1 = ts + } +} + +func BenchmarkConvT2IUintptr(b *testing.B) { + for i := 0; i < b.N; i++ { + i1 = tm + } +} + +func BenchmarkConvT2ILarge(b *testing.B) { + for i := 0; i < b.N; i++ { + i1 = tl + } +} + +func BenchmarkConvI2E(b *testing.B) { + i2 = tm + for i := 0; i < b.N; i++ { + e = i2 + } +} + +func BenchmarkConvI2I(b *testing.B) { + i2 = tm + for i := 0; i < b.N; i++ { + i1 = i2 + } +} + +func BenchmarkAssertE2T(b *testing.B) { + e = tm + for i := 0; i < b.N; i++ { + tm = e.(TM) + } +} + +func BenchmarkAssertE2TLarge(b *testing.B) { + e = tl + for i := 0; i < b.N; i++ { + tl = e.(TL) + } +} + +func BenchmarkAssertE2I(b *testing.B) { + e = tm + for i := 0; i < b.N; i++ { + i1 = e.(I1) + } +} + +func BenchmarkAssertI2T(b *testing.B) { + i1 = tm + for i := 0; i < b.N; i++ { + tm = i1.(TM) + } +} + +func BenchmarkAssertI2I(b *testing.B) { + i1 = tm + for i := 0; i < b.N; i++ { + i2 = i1.(I2) + } +} + +func BenchmarkAssertI2E(b *testing.B) { + i1 = tm + for i := 0; i < b.N; i++ { + e = i1.(interface{}) + } +} + +func BenchmarkAssertE2E(b *testing.B) { + e = tm + for i := 0; i < b.N; i++ { + e_ = e + } +} diff --git a/src/pkg/runtime/lfstack.c b/src/pkg/runtime/lfstack.c new file mode 100644 index 000000000..1d48491aa --- /dev/null +++ b/src/pkg/runtime/lfstack.c @@ -0,0 +1,65 @@ +// Copyright 2012 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// Lock-free stack. + +#include "runtime.h" +#include "arch_GOARCH.h" + +#ifdef _64BIT +// Amd64 uses 48-bit virtual addresses, 47-th bit is used as kernel/user flag. +// So we use 17msb of pointers as ABA counter. 
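
To make the ABA counter concrete: a 47-bit address leaves the top 17 bits of the packed 64-bit word free, so lfstackpush stores the node's push count there, and every reuse of a node changes the value seen by the CAS. A sketch of the packing arithmetic in Go (the constants mirror the PTR_BITS and mask defines that follow):

	package main

	import "fmt"

	const ptrBits = 47 // amd64 user-space virtual addresses fit in 47 bits
	const ptrMask = 1<<ptrBits - 1

	// pack combines a 47-bit address with a push counter in the top 17 bits.
	func pack(addr, cnt uint64) uint64 {
		if addr != addr&ptrMask {
			panic("address does not fit in 47 bits")
		}
		return addr | cnt<<ptrBits // only the low 17 bits of cnt survive the shift
	}

	// unpack recovers both fields from the packed word.
	func unpack(w uint64) (addr, cnt uint64) {
		return w & ptrMask, w >> ptrBits
	}

	func main() {
		w := pack(0x00c000001000, 3)
		addr, cnt := unpack(w)
		fmt.Printf("%#x %d\n", addr, cnt) // 0xc000001000 3
	}
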
+# define PTR_BITS 47
+#else
+# define PTR_BITS 32
+#endif
+#define PTR_MASK ((1ull<<PTR_BITS)-1)
+#define CNT_MASK (0ull-1)
+
+void
+runtime·lfstackpush(uint64 *head, LFNode *node)
+{
+	uint64 old, new;
+
+	if((uintptr)node != ((uintptr)node&PTR_MASK)) {
+		runtime·printf("p=%p\n", node);
+		runtime·throw("runtime·lfstackpush: invalid pointer");
+	}
+
+	node->pushcnt++;
+	new = (uint64)(uintptr)node|(((uint64)node->pushcnt&CNT_MASK)<<PTR_BITS);
+	old = runtime·atomicload64(head);
+	for(;;) {
+		node->next = (LFNode*)(uintptr)(old&PTR_MASK);
+		if(runtime·cas64(head, &old, new))
+			break;
+	}
+}
+
+LFNode*
+runtime·lfstackpop(uint64 *head)
+{
+	LFNode *node, *node2;
+	uint64 old, new;
+
+	old = runtime·atomicload64(head);
+	for(;;) {
+		if(old == 0)
+			return nil;
+		node = (LFNode*)(uintptr)(old&PTR_MASK);
+		node2 = runtime·atomicloadp(&node->next);
+		new = 0;
+		if(node2 != nil)
+			new = (uint64)(uintptr)node2|(((uint64)node2->pushcnt&CNT_MASK)<<PTR_BITS);
+		if(runtime·cas64(head, &old, new))
+			return node;
+	}
+}
+
+void
+runtime·lfstackpop2(uint64 *head, LFNode *node)
+{
+	node = runtime·lfstackpop(head);
+	FLUSH(&node);
+}
diff --git a/src/pkg/runtime/lfstack_test.go b/src/pkg/runtime/lfstack_test.go
new file mode 100644
index 000000000..505aae605
--- /dev/null
+++ b/src/pkg/runtime/lfstack_test.go
@@ -0,0 +1,130 @@
+// Copyright 2012 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package runtime_test
+
+import (
+	"math/rand"
+	. "runtime"
+	"testing"
+	"unsafe"
+)
+
+type MyNode struct {
+	LFNode
+	data int
+}
+
+func fromMyNode(node *MyNode) *LFNode {
+	return (*LFNode)(unsafe.Pointer(node))
+}
+
+func toMyNode(node *LFNode) *MyNode {
+	return (*MyNode)(unsafe.Pointer(node))
+}
+
+func TestLFStack(t *testing.T) {
+	stack := new(uint64)
+	// Need to keep additional references to nodes, the stack is not all that type-safe.
+	var nodes []*MyNode
+
+	// Check the stack is initially empty.
+	if LFStackPop(stack) != nil {
+		t.Fatalf("stack is not empty")
+	}
+
+	// Push one element.
+	node := &MyNode{data: 42}
+	nodes = append(nodes, node)
+	LFStackPush(stack, fromMyNode(node))
+
+	// Push another.
+	node = &MyNode{data: 43}
+	nodes = append(nodes, node)
+	LFStackPush(stack, fromMyNode(node))
+
+	// Pop one element.
+	node = toMyNode(LFStackPop(stack))
+	if node == nil {
+		t.Fatalf("stack is empty")
+	}
+	if node.data != 43 {
+		t.Fatalf("no lifo")
+	}
+
+	// Pop another.
+	node = toMyNode(LFStackPop(stack))
+	if node == nil {
+		t.Fatalf("stack is empty")
+	}
+	if node.data != 42 {
+		t.Fatalf("no lifo")
+	}
+
+	// Check the stack is empty again.
+	if LFStackPop(stack) != nil {
+		t.Fatalf("stack is not empty")
+	}
+	if *stack != 0 {
+		t.Fatalf("stack is not empty")
+	}
+}
+
+func TestLFStackStress(t *testing.T) {
+	const K = 100
+	P := 4 * GOMAXPROCS(-1)
+	N := 100000
+	if testing.Short() {
+		N /= 10
+	}
+	// Create 2 stacks.
+	stacks := [2]*uint64{new(uint64), new(uint64)}
+	// Need to keep additional references to nodes, the stack is not all that type-safe.
+	var nodes []*MyNode
+	// Push K elements randomly onto the stacks.
+	sum := 0
+	for i := 0; i < K; i++ {
+		sum += i
+		node := &MyNode{data: i}
+		nodes = append(nodes, node)
+		LFStackPush(stacks[i%2], fromMyNode(node))
+	}
+	c := make(chan bool, P)
+	for p := 0; p < P; p++ {
+		go func() {
+			r := rand.New(rand.NewSource(rand.Int63()))
+			// Pop a node from a random stack, then push it onto a random stack.
+ for i := 0; i < N; i++ { + node := toMyNode(LFStackPop(stacks[r.Intn(2)])) + if node != nil { + LFStackPush(stacks[r.Intn(2)], fromMyNode(node)) + } + } + c <- true + }() + } + for i := 0; i < P; i++ { + <-c + } + // Pop all elements from both stacks, and verify that nothing lost. + sum2 := 0 + cnt := 0 + for i := 0; i < 2; i++ { + for { + node := toMyNode(LFStackPop(stacks[i])) + if node == nil { + break + } + cnt++ + sum2 += node.data + node.Next = nil + } + } + if cnt != K { + t.Fatalf("Wrong number of nodes %d/%d", cnt, K) + } + if sum2 != sum { + t.Fatalf("Wrong sum %d/%d", sum2, sum) + } +} diff --git a/src/pkg/runtime/lock_futex.c b/src/pkg/runtime/lock_futex.c index b4465bff1..9b1f5f6db 100644 --- a/src/pkg/runtime/lock_futex.c +++ b/src/pkg/runtime/lock_futex.c @@ -111,7 +111,8 @@ runtime·noteclear(Note *n) void runtime·notewakeup(Note *n) { - runtime·xchg(&n->key, 1); + if(runtime·xchg(&n->key, 1)) + runtime·throw("notewakeup - double wakeup"); runtime·futexwakeup(&n->key, 1); } diff --git a/src/pkg/runtime/malloc.goc b/src/pkg/runtime/malloc.goc index 9ae3a9d61..ac131b3af 100644 --- a/src/pkg/runtime/malloc.goc +++ b/src/pkg/runtime/malloc.goc @@ -9,17 +9,18 @@ package runtime #include "runtime.h" #include "arch_GOARCH.h" -#include "stack.h" #include "malloc.h" -#include "defs_GOOS_GOARCH.h" #include "type.h" +#include "typekind.h" +#include "race.h" -#pragma dataflag 16 /* mark mheap as 'no pointers', hiding from garbage collector */ -MHeap runtime·mheap; +MHeap *runtime·mheap; -extern MStats mstats; // defined in extern.go +int32 runtime·checking; -extern volatile int32 runtime·MemProfileRate; +extern MStats mstats; // defined in zruntime_def_$GOOS_$GOARCH.go + +extern volatile intgo runtime·MemProfileRate; // Allocate an object of at least size bytes. // Small objects are allocated from the per-thread cache's free lists. 
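
Two paths follow from the comment above: requests up to MaxSmallSize are rounded to a size class and served from the per-thread MCache free lists, while larger requests go straight to the page heap in whole pages. The page rounding in the large path is a shift plus a remainder test; a sketch, assuming the 4 KB pages mentioned in malloc.h (a PageShift of 12 is an assumption here, its value is not shown in this hunk):

	package main

	import "fmt"

	const (
		pageShift = 12 // 4 KB pages
		pageMask  = 1<<pageShift - 1
	)

	// pagesFor rounds a byte size up to whole pages, as the large-object
	// path in runtime·mallocgc does with size>>PageShift.
	func pagesFor(size uintptr) uintptr {
		npages := size >> pageShift
		if size&pageMask != 0 {
			npages++ // a partial trailing page still costs a full page
		}
		return npages
	}

	func main() {
		fmt.Println(pagesFor(4096))  // 1
		fmt.Println(pagesFor(4097))  // 2
		fmt.Println(pagesFor(40000)) // 10
	}
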
@@ -27,7 +28,8 @@ extern volatile int32 runtime·MemProfileRate; void* runtime·mallocgc(uintptr size, uint32 flag, int32 dogc, int32 zeroed) { - int32 sizeclass, rate; + int32 sizeclass; + intgo rate; MCache *c; uintptr npages; MSpan *s; @@ -41,6 +43,9 @@ runtime·mallocgc(uintptr size, uint32 flag, int32 dogc, int32 zeroed) if(size == 0) size = 1; + if(DebugTypeAtBlockEnd) + size += sizeof(uintptr); + c = m->mcache; c->local_nmalloc++; if(size <= MaxSmallSize) { @@ -60,7 +65,7 @@ runtime·mallocgc(uintptr size, uint32 flag, int32 dogc, int32 zeroed) npages = size >> PageShift; if((size & PageMask) != 0) npages++; - s = runtime·MHeap_Alloc(&runtime·mheap, npages, 0, 1); + s = runtime·MHeap_Alloc(runtime·mheap, npages, 0, 1, zeroed); if(s == nil) runtime·throw("out of memory"); size = npages<<PageShift; @@ -71,9 +76,20 @@ runtime·mallocgc(uintptr size, uint32 flag, int32 dogc, int32 zeroed) // setup for mark sweep runtime·markspan(v, 0, 0, true); } + + if (sizeof(void*) == 4 && c->local_total_alloc >= (1<<30)) { + // purge cache stats to prevent overflow + runtime·lock(runtime·mheap); + runtime·purgecachedstats(c); + runtime·unlock(runtime·mheap); + } + if(!(flag & FlagNoGC)) runtime·markallocated(v, size, (flag&FlagNoPointers) != 0); + if(DebugTypeAtBlockEnd) + *(uintptr*)((uintptr)v+size-sizeof(uintptr)) = 0; + m->mallocing = 0; if(!(flag & FlagNoProfiling) && (rate = runtime·MemProfileRate) > 0) { @@ -95,6 +111,11 @@ runtime·mallocgc(uintptr size, uint32 flag, int32 dogc, int32 zeroed) if(dogc && mstats.heap_alloc >= mstats.next_gc) runtime·gc(0); + + if(raceenabled) { + runtime·racemalloc(v, size, m->racepc); + m->racepc = nil; + } return v; } @@ -130,6 +151,9 @@ runtime·free(void *v) } prof = runtime·blockspecial(v); + if(raceenabled) + runtime·racefree(v); + // Find size class for v. sizeclass = s->sizeclass; c = m->mcache; @@ -141,7 +165,7 @@ runtime·free(void *v) // they might coalesce v into other spans and change the bitmap further. runtime·markfreed(v, size); runtime·unmarkspan(v, 1<<PageShift); - runtime·MHeap_Free(&runtime·mheap, s, 1); + runtime·MHeap_Free(runtime·mheap, s, 1); } else { // Small object. 
size = runtime·class_to_size[sizeclass]; @@ -169,7 +193,14 @@ runtime·mlookup(void *v, byte **base, uintptr *size, MSpan **sp) MSpan *s; m->mcache->local_nlookup++; - s = runtime·MHeap_LookupMaybe(&runtime·mheap, v); + if (sizeof(void*) == 4 && m->mcache->local_nlookup >= (1<<30)) { + // purge cache stats to prevent overflow + runtime·lock(runtime·mheap); + runtime·purgecachedstats(m->mcache); + runtime·unlock(runtime·mheap); + } + + s = runtime·MHeap_LookupMaybe(runtime·mheap, v); if(sp) *sp = s; if(s == nil) { @@ -196,7 +227,7 @@ runtime·mlookup(void *v, byte **base, uintptr *size, MSpan **sp) return 0; } - n = runtime·class_to_size[s->sizeclass]; + n = s->elemsize; if(base) { i = ((byte*)v - p)/n; *base = p + i*n; @@ -210,14 +241,15 @@ runtime·mlookup(void *v, byte **base, uintptr *size, MSpan **sp) MCache* runtime·allocmcache(void) { - int32 rate; + intgo rate; MCache *c; - runtime·lock(&runtime·mheap); - c = runtime·FixAlloc_Alloc(&runtime·mheap.cachealloc); - mstats.mcache_inuse = runtime·mheap.cachealloc.inuse; - mstats.mcache_sys = runtime·mheap.cachealloc.sys; - runtime·unlock(&runtime·mheap); + runtime·lock(runtime·mheap); + c = runtime·FixAlloc_Alloc(&runtime·mheap->cachealloc); + mstats.mcache_inuse = runtime·mheap->cachealloc.inuse; + mstats.mcache_sys = runtime·mheap->cachealloc.sys; + runtime·unlock(runtime·mheap); + runtime·memclr((byte*)c, sizeof(*c)); // Set first allocation sample size. rate = runtime·MemProfileRate; @@ -230,12 +262,19 @@ runtime·allocmcache(void) } void -runtime·purgecachedstats(M* m) +runtime·freemcache(MCache *c) { - MCache *c; + runtime·MCache_ReleaseAll(c); + runtime·lock(runtime·mheap); + runtime·purgecachedstats(c); + runtime·FixAlloc_Free(&runtime·mheap->cachealloc, c); + runtime·unlock(runtime·mheap); +} +void +runtime·purgecachedstats(MCache *c) +{ // Protected by either heap or GC lock. - c = m->mcache; mstats.heap_alloc += c->local_cachealloc; c->local_cachealloc = 0; mstats.heap_objects += c->local_objects; @@ -274,6 +313,9 @@ runtime·mallocinit(void) USED(arena_size); USED(bitmap_size); + if((runtime·mheap = runtime·SysAlloc(sizeof(*runtime·mheap))) == nil) + runtime·throw("runtime: cannot allocate heap metadata"); + runtime·InitSizes(); limit = runtime·memlimit(); @@ -283,32 +325,30 @@ runtime·mallocinit(void) // enough to hold 4 bits per allocated word. if(sizeof(void*) == 8 && (limit == 0 || limit > (1<<30))) { // On a 64-bit machine, allocate from a single contiguous reservation. - // 16 GB should be big enough for now. + // 128 GB (MaxMem) should be big enough for now. // // The code will work with the reservation at any address, but ask - // SysReserve to use 0x000000f800000000 if possible. - // Allocating a 16 GB region takes away 36 bits, and the amd64 + // SysReserve to use 0x000000c000000000 if possible. + // Allocating a 128 GB region takes away 37 bits, and the amd64 // doesn't let us choose the top 17 bits, so that leaves the 11 bits - // in the middle of 0x00f8 for us to choose. Choosing 0x00f8 means - // that the valid memory addresses will begin 0x00f8, 0x00f9, 0x00fa, 0x00fb. - // None of the bytes f8 f9 fa fb can appear in valid UTF-8, and - // they are otherwise as far from ff (likely a common byte) as possible. - // Choosing 0x00 for the leading 6 bits was more arbitrary, but it - // is not a common ASCII code point either. Using 0x11f8 instead + // in the middle of 0x00c0 for us to choose. Choosing 0x00c0 means + // that the valid memory addresses will begin 0x00c0, 0x00c1, ..., 0x0x00df. 
+ // In little-endian, that's c0 00, c1 00, ..., df 00. None of those are valid + // UTF-8 sequences, and they are otherwise as far away from + // ff (likely a common byte) as possible. An earlier attempt to use 0x11f8 // caused out of memory errors on OS X during thread allocations. // These choices are both for debuggability and to reduce the // odds of the conservative garbage collector not collecting memory // because some non-pointer block of memory had a bit pattern // that matched a memory address. // - // Actually we reserve 17 GB (because the bitmap ends up being 1 GB) - // but it hardly matters: fc is not valid UTF-8 either, and we have to - // allocate 15 GB before we get that far. + // Actually we reserve 136 GB (because the bitmap ends up being 8 GB) + // but it hardly matters: e0 00 is not valid UTF-8 either. // // If this fails we fall back to the 32 bit memory mechanism - arena_size = 16LL<<30; + arena_size = MaxMem; bitmap_size = arena_size / (sizeof(void*)*8/4); - p = runtime·SysReserve((void*)(0x00f8ULL<<32), bitmap_size + arena_size); + p = runtime·SysReserve((void*)(0x00c0ULL<<32), bitmap_size + arena_size); } if (p == nil) { // On a 32-bit machine, we can't typically get away @@ -354,13 +394,13 @@ runtime·mallocinit(void) if((uintptr)p & (((uintptr)1<<PageShift)-1)) runtime·throw("runtime: SysReserve returned unaligned address"); - runtime·mheap.bitmap = p; - runtime·mheap.arena_start = p + bitmap_size; - runtime·mheap.arena_used = runtime·mheap.arena_start; - runtime·mheap.arena_end = runtime·mheap.arena_start + arena_size; + runtime·mheap->bitmap = p; + runtime·mheap->arena_start = p + bitmap_size; + runtime·mheap->arena_used = runtime·mheap->arena_start; + runtime·mheap->arena_end = runtime·mheap->arena_start + arena_size; // Initialize the rest of the allocator. - runtime·MHeap_Init(&runtime·mheap, runtime·SysAlloc); + runtime·MHeap_Init(runtime·mheap, runtime·SysAlloc); m->mcache = runtime·allocmcache(); // See if it works. 
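
A quick check of the reservation arithmetic above: with a 128 GB arena and 4 bitmap bits per 8-byte word, the bitmap is arena_size/16 = 8 GB, which gives the 136 GB total the comment mentions, reserved at the hint address 0x00c0<<32:

	package main

	import "fmt"

	func main() {
		const wordSize = 8                           // sizeof(void*) on amd64
		arenaSize := uint64(128) << 30               // MaxMem: 128 GB
		bitmapSize := arenaSize / (wordSize * 8 / 4) // 4 bitmap bits per word
		fmt.Println(bitmapSize >> 30)                // 8 (GB of bitmap)
		fmt.Println((arenaSize + bitmapSize) >> 30)  // 136 (GB reserved in total)
		fmt.Printf("%#x\n", uint64(0x00c0)<<32)      // 0xc000000000: the reservation hint
	}
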
@@ -394,6 +434,8 @@ runtime·MHeap_SysAlloc(MHeap *h, uintptr n) runtime·SysMap(p, n); h->arena_used += n; runtime·MHeap_MapBits(h); + if(raceenabled) + runtime·racemapshadow(p, n); return p; } @@ -420,11 +462,231 @@ runtime·MHeap_SysAlloc(MHeap *h, uintptr n) if(h->arena_used > h->arena_end) h->arena_end = h->arena_used; runtime·MHeap_MapBits(h); + if(raceenabled) + runtime·racemapshadow(p, n); } return p; } +static Lock settype_lock; + +void +runtime·settype_flush(M *mp, bool sysalloc) +{ + uintptr *buf, *endbuf; + uintptr size, ofs, j, t; + uintptr ntypes, nbytes2, nbytes3; + uintptr *data2; + byte *data3; + bool sysalloc3; + void *v; + uintptr typ, p; + MSpan *s; + + buf = mp->settype_buf; + endbuf = buf + mp->settype_bufsize; + + runtime·lock(&settype_lock); + while(buf < endbuf) { + v = (void*)*buf; + *buf = 0; + buf++; + typ = *buf; + buf++; + + // (Manually inlined copy of runtime·MHeap_Lookup) + p = (uintptr)v>>PageShift; + if(sizeof(void*) == 8) + p -= (uintptr)runtime·mheap->arena_start >> PageShift; + s = runtime·mheap->map[p]; + + if(s->sizeclass == 0) { + s->types.compression = MTypes_Single; + s->types.data = typ; + continue; + } + + size = s->elemsize; + ofs = ((uintptr)v - (s->start<<PageShift)) / size; + + switch(s->types.compression) { + case MTypes_Empty: + ntypes = (s->npages << PageShift) / size; + nbytes3 = 8*sizeof(uintptr) + 1*ntypes; + + if(!sysalloc) { + data3 = runtime·mallocgc(nbytes3, FlagNoPointers, 0, 1); + } else { + data3 = runtime·SysAlloc(nbytes3); + if(data3 == nil) + runtime·throw("runtime: cannot allocate memory"); + if(0) runtime·printf("settype(0->3): SysAlloc(%x) --> %p\n", (uint32)nbytes3, data3); + } + + s->types.compression = MTypes_Bytes; + s->types.sysalloc = sysalloc; + s->types.data = (uintptr)data3; + + ((uintptr*)data3)[1] = typ; + data3[8*sizeof(uintptr) + ofs] = 1; + break; + + case MTypes_Words: + ((uintptr*)s->types.data)[ofs] = typ; + break; + + case MTypes_Bytes: + data3 = (byte*)s->types.data; + for(j=1; j<8; j++) { + if(((uintptr*)data3)[j] == typ) { + break; + } + if(((uintptr*)data3)[j] == 0) { + ((uintptr*)data3)[j] = typ; + break; + } + } + if(j < 8) { + data3[8*sizeof(uintptr) + ofs] = j; + } else { + ntypes = (s->npages << PageShift) / size; + nbytes2 = ntypes * sizeof(uintptr); + + if(!sysalloc) { + data2 = runtime·mallocgc(nbytes2, FlagNoPointers, 0, 1); + } else { + data2 = runtime·SysAlloc(nbytes2); + if(data2 == nil) + runtime·throw("runtime: cannot allocate memory"); + if(0) runtime·printf("settype.(3->2): SysAlloc(%x) --> %p\n", (uint32)nbytes2, data2); + } + + sysalloc3 = s->types.sysalloc; + + s->types.compression = MTypes_Words; + s->types.sysalloc = sysalloc; + s->types.data = (uintptr)data2; + + // Move the contents of data3 to data2. Then deallocate data3. + for(j=0; j<ntypes; j++) { + t = data3[8*sizeof(uintptr) + j]; + t = ((uintptr*)data3)[t]; + data2[j] = t; + } + if(sysalloc3) { + nbytes3 = 8*sizeof(uintptr) + 1*ntypes; + if(0) runtime·printf("settype.(3->2): SysFree(%p,%x)\n", data3, (uint32)nbytes3); + runtime·SysFree(data3, nbytes3); + } + + data2[ofs] = typ; + } + break; + } + } + runtime·unlock(&settype_lock); + + mp->settype_bufsize = 0; +} + +// It is forbidden to use this function if it is possible that +// explicit deallocation via calling runtime·free(v) may happen. 
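
runtime·settype below defers the actual bookkeeping: each call appends a (pointer, type) pair to a fixed per-M buffer, and only a full buffer triggers settype_flush, which takes settype_lock once for the whole batch and updates the per-span type tables. A Go sketch of that buffer-then-flush shape (the recorder type and its tiny capacity are invented for illustration):

	package main

	import "fmt"

	const bufSize = 4 // small for demonstration; the runtime's buffer is larger

	type recorder struct {
		buf [bufSize][2]uintptr // (pointer, type) pairs, like settype_buf
		n   int
	}

	// add buffers one pair and flushes when the buffer is full, mirroring
	// runtime·settype / runtime·settype_flush.
	func (r *recorder) add(p, typ uintptr) {
		r.buf[r.n] = [2]uintptr{p, typ}
		r.n++
		if r.n == len(r.buf) {
			r.flush()
		}
	}

	func (r *recorder) flush() {
		// In the runtime this takes a lock and updates per-span metadata;
		// here we just report the batch size.
		fmt.Printf("flushing %d pairs\n", r.n)
		r.n = 0
	}

	func main() {
		var r recorder
		for i := uintptr(1); i <= 10; i++ {
			r.add(i*16, 0x1000+i)
		}
		r.flush() // drain the final partial batch
	}
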
+void +runtime·settype(void *v, uintptr t) +{ + M *mp; + uintptr *buf; + uintptr i; + MSpan *s; + + if(t == 0) + runtime·throw("settype: zero type"); + + mp = m; + buf = mp->settype_buf; + i = mp->settype_bufsize; + buf[i+0] = (uintptr)v; + buf[i+1] = t; + i += 2; + mp->settype_bufsize = i; + + if(i == nelem(mp->settype_buf)) { + runtime·settype_flush(mp, false); + } + + if(DebugTypeAtBlockEnd) { + s = runtime·MHeap_Lookup(runtime·mheap, v); + *(uintptr*)((uintptr)v+s->elemsize-sizeof(uintptr)) = t; + } +} + +void +runtime·settype_sysfree(MSpan *s) +{ + uintptr ntypes, nbytes; + + if(!s->types.sysalloc) + return; + + nbytes = (uintptr)-1; + + switch (s->types.compression) { + case MTypes_Words: + ntypes = (s->npages << PageShift) / s->elemsize; + nbytes = ntypes * sizeof(uintptr); + break; + case MTypes_Bytes: + ntypes = (s->npages << PageShift) / s->elemsize; + nbytes = 8*sizeof(uintptr) + 1*ntypes; + break; + } + + if(nbytes != (uintptr)-1) { + if(0) runtime·printf("settype: SysFree(%p,%x)\n", (void*)s->types.data, (uint32)nbytes); + runtime·SysFree((void*)s->types.data, nbytes); + } +} + +uintptr +runtime·gettype(void *v) +{ + MSpan *s; + uintptr t, ofs; + byte *data; + + s = runtime·MHeap_LookupMaybe(runtime·mheap, v); + if(s != nil) { + t = 0; + switch(s->types.compression) { + case MTypes_Empty: + break; + case MTypes_Single: + t = s->types.data; + break; + case MTypes_Words: + ofs = (uintptr)v - (s->start<<PageShift); + t = ((uintptr*)s->types.data)[ofs/s->elemsize]; + break; + case MTypes_Bytes: + ofs = (uintptr)v - (s->start<<PageShift); + data = (byte*)s->types.data; + t = data[8*sizeof(uintptr) + ofs/s->elemsize]; + t = ((uintptr*)data)[t]; + break; + default: + runtime·throw("runtime·gettype: invalid compression kind"); + } + if(0) { + runtime·lock(&settype_lock); + runtime·printf("%p -> %d,%X\n", v, (int32)s->types.compression, (int64)t); + runtime·unlock(&settype_lock); + } + return t; + } + return 0; +} + // Runtime stubs. void* @@ -433,46 +695,63 @@ runtime·mal(uintptr n) return runtime·mallocgc(n, 0, 1, 1); } -func new(typ *Type) (ret *uint8) { - uint32 flag = typ->kind&KindNoPointers ? FlagNoPointers : 0; - ret = runtime·mallocgc(typ->size, flag, 1, 1); +#pragma textflag 7 +void +runtime·new(Type *typ, uint8 *ret) +{ + uint32 flag; + + if(raceenabled) + m->racepc = runtime·getcallerpc(&typ); + + if(typ->size == 0) { + // All 0-length allocations use this pointer. + // The language does not require the allocations to + // have distinct values. + ret = (uint8*)&runtime·zerobase; + } else { + flag = typ->kind&KindNoPointers ? FlagNoPointers : 0; + ret = runtime·mallocgc(typ->size, flag, 1, 1); + + if(UseSpanType && !flag) { + if(false) { + runtime·printf("new %S: %p\n", *typ->string, ret); + } + runtime·settype(ret, (uintptr)typ | TypeInfo_SingleObject); + } + } + FLUSH(&ret); } +// same as runtime·new, but callable from C void* -runtime·stackalloc(uint32 n) +runtime·cnew(Type *typ) { - // Stackalloc must be called on scheduler stack, so that we - // never try to grow the stack during the code that stackalloc runs. - // Doing so would cause a deadlock (issue 1547). - if(g != m->g0) - runtime·throw("stackalloc not on scheduler stack"); - - // Stack allocator uses malloc/free most of the time, - // but if we're in the middle of malloc and need stack, - // we have to do something else to avoid deadlock. 
- // In that case, we fall back on a fixed-size free-list - // allocator, assuming that inside malloc all the stack - // frames are small, so that all the stack allocations - // will be a single size, the minimum (right now, 5k). - if(m->mallocing || m->gcing || n == FixedStack) { - if(n != FixedStack) { - runtime·printf("stackalloc: in malloc, size=%d want %d", FixedStack, n); - runtime·throw("stackalloc"); + uint32 flag; + void *ret; + + if(raceenabled) + m->racepc = runtime·getcallerpc(&typ); + + if(typ->size == 0) { + // All 0-length allocations use this pointer. + // The language does not require the allocations to + // have distinct values. + ret = (uint8*)&runtime·zerobase; + } else { + flag = typ->kind&KindNoPointers ? FlagNoPointers : 0; + ret = runtime·mallocgc(typ->size, flag, 1, 1); + + if(UseSpanType && !flag) { + if(false) { + runtime·printf("new %S: %p\n", *typ->string, ret); + } + runtime·settype(ret, (uintptr)typ | TypeInfo_SingleObject); } - return runtime·FixAlloc_Alloc(m->stackalloc); } - return runtime·mallocgc(n, FlagNoProfiling|FlagNoGC, 0, 0); -} -void -runtime·stackfree(void *v, uintptr n) -{ - if(m->mallocing || m->gcing || n == FixedStack) { - runtime·FixAlloc_Free(m->stackalloc, v); - return; - } - runtime·free(v); + return ret; } func GC() { @@ -483,7 +762,8 @@ func SetFinalizer(obj Eface, finalizer Eface) { byte *base; uintptr size; FuncType *ft; - int32 i, nret; + int32 i; + uintptr nret; Type *t; if(obj.type == nil) { diff --git a/src/pkg/runtime/malloc.h b/src/pkg/runtime/malloc.h index d846f6810..38122bf8a 100644 --- a/src/pkg/runtime/malloc.h +++ b/src/pkg/runtime/malloc.h @@ -85,6 +85,8 @@ typedef struct MHeap MHeap; typedef struct MSpan MSpan; typedef struct MStats MStats; typedef struct MLink MLink; +typedef struct MTypes MTypes; +typedef struct GCStats GCStats; enum { @@ -113,21 +115,30 @@ enum HeapAllocChunk = 1<<20, // Chunk size for heap growth // Number of bits in page to span calculations (4k pages). - // On 64-bit, we limit the arena to 16G, so 22 bits suffices. - // On 32-bit, we don't bother limiting anything: 20 bits for 4G. + // On 64-bit, we limit the arena to 128GB, or 37 bits. + // On 32-bit, we don't bother limiting anything, so we use the full 32-bit address. #ifdef _64BIT - MHeapMap_Bits = 22, + MHeapMap_Bits = 37 - PageShift, #else - MHeapMap_Bits = 20, + MHeapMap_Bits = 32 - PageShift, #endif // Max number of threads to run garbage collection. // 2, 3, and 4 are all plausible maximums depending // on the hardware details of the machine. The garbage - // collector scales well to 4 cpus. - MaxGcproc = 4, + // collector scales well to 8 cpus. + MaxGcproc = 8, }; +// Maximum memory allocation size, a hint for callers. +// This must be a #define instead of an enum because it +// is so large. +#ifdef _64BIT +#define MaxMem (1ULL<<(MHeapMap_Bits+PageShift)) /* 128 GB */ +#else +#define MaxMem ((uintptr)-1) +#endif + // A generic linked list of blocks. (Typically the block is bigger than sizeof(MLink).) struct MLink { @@ -188,7 +199,7 @@ void runtime·FixAlloc_Free(FixAlloc *f, void *p); // Statistics. -// Shared with Go: if you edit this structure, also edit extern.go. +// Shared with Go: if you edit this structure, also edit type MemStats in mem.go. struct MStats { // General statistics. @@ -219,7 +230,7 @@ struct MStats uint64 buckhash_sys; // profiling bucket hash table // Statistics about garbage collector. - // Protected by stopping the world during GC. + // Protected by mheap or stopping the world during GC. 
uint64 next_gc; // next GC (in heap_alloc time) uint64 last_gc; // last GC (in absolute time) uint64 pause_total_ns; @@ -239,7 +250,6 @@ struct MStats #define mstats runtime·memStats /* name shared with Go */ extern MStats mstats; - // Size classes. Computed and initialized by InitSizes. // // SizeToClass(0 <= n <= MaxSmallSize) returns the size class, @@ -273,19 +283,19 @@ struct MCacheList struct MCache { MCacheList list[NumSizeClasses]; - uint64 size; - int64 local_cachealloc; // bytes allocated (or freed) from cache since last lock of heap - int64 local_objects; // objects allocated (or freed) from cache since last lock of heap - int64 local_alloc; // bytes allocated (or freed) since last lock of heap - int64 local_total_alloc; // bytes allocated (even if freed) since last lock of heap - int64 local_nmalloc; // number of mallocs since last lock of heap - int64 local_nfree; // number of frees since last lock of heap - int64 local_nlookup; // number of pointer lookups since last lock of heap + uintptr size; + intptr local_cachealloc; // bytes allocated (or freed) from cache since last lock of heap + intptr local_objects; // objects allocated (or freed) from cache since last lock of heap + intptr local_alloc; // bytes allocated (or freed) since last lock of heap + uintptr local_total_alloc; // bytes allocated (even if freed) since last lock of heap + uintptr local_nmalloc; // number of mallocs since last lock of heap + uintptr local_nfree; // number of frees since last lock of heap + uintptr local_nlookup; // number of pointer lookups since last lock of heap int32 next_sample; // trigger heap sample after allocating this many bytes // Statistics about allocation size classes since last lock of heap struct { - int64 nmalloc; - int64 nfree; + uintptr nmalloc; + uintptr nfree; } local_by_size[NumSizeClasses]; }; @@ -294,6 +304,44 @@ void* runtime·MCache_Alloc(MCache *c, int32 sizeclass, uintptr size, int32 zero void runtime·MCache_Free(MCache *c, void *p, int32 sizeclass, uintptr size); void runtime·MCache_ReleaseAll(MCache *c); +// MTypes describes the types of blocks allocated within a span. +// The compression field describes the layout of the data. +// +// MTypes_Empty: +// All blocks are free, or no type information is available for +// allocated blocks. +// The data field has no meaning. +// MTypes_Single: +// The span contains just one block. +// The data field holds the type information. +// The sysalloc field has no meaning. +// MTypes_Words: +// The span contains multiple blocks. +// The data field points to an array of type [NumBlocks]uintptr, +// and each element of the array holds the type of the corresponding +// block. +// MTypes_Bytes: +// The span contains at most seven different types of blocks. +// The data field points to the following structure: +// struct { +// type [8]uintptr // type[0] is always 0 +// index [NumBlocks]byte +// } +// The type of the i-th block is: data.type[data.index[i]] +enum +{ + MTypes_Empty = 0, + MTypes_Single = 1, + MTypes_Words = 2, + MTypes_Bytes = 3, +}; +struct MTypes +{ + byte compression; // one of MTypes_* + bool sysalloc; // whether (void*)data is from runtime·SysAlloc + uintptr data; +}; + // An MSpan is a run of pages. 
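
Decoding the MTypes_Bytes layout described above is a double lookup: index[i] selects one of eight slots in the type table (slot 0 is reserved to mean "no information"), so a span with at most seven distinct block types pays one byte per block instead of one word. A sketch in Go (field names follow the comment above; this is not a real runtime API):

	package main

	import "fmt"

	// bytesTypes mirrors the MTypes_Bytes payload: typeTab[0] is always 0
	// ("no information"), and index[i] selects a slot for block i.
	type bytesTypes struct {
		typeTab [8]uintptr // distinct type pointers; at most 7 real entries
		index   []byte     // one selector byte per block in the span
	}

	// typeOf returns the recorded type of block i (0 means unknown).
	func (b *bytesTypes) typeOf(i int) uintptr {
		return b.typeTab[b.index[i]]
	}

	func main() {
		b := &bytesTypes{index: make([]byte, 4)}
		b.typeTab[1] = 0xdeadbeef // register one type in slot 1
		b.index[2] = 1            // block 2 uses slot 1
		fmt.Printf("%#x %#x\n", b.typeOf(2), b.typeOf(0)) // 0xdeadbeef 0x0
	}
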
enum { @@ -306,16 +354,17 @@ struct MSpan { MSpan *next; // in a span linked list MSpan *prev; // in a span linked list - MSpan *allnext; // in the list of all spans PageID start; // starting page number uintptr npages; // number of pages in span MLink *freelist; // list of free objects uint32 ref; // number of allocated objects in this span - uint32 sizeclass; // size class + int32 sizeclass; // size class + uintptr elemsize; // computed from sizeclass or from npages uint32 state; // MSpanInUse etc int64 unusedsince; // First time spotted by GC in MSpanFree state uintptr npreleased; // number of pages released to the OS byte *limit; // end of data in span + MTypes types; // types of allocated objects in this span }; void runtime·MSpan_Init(MSpan *span, PageID start, uintptr npages); @@ -342,6 +391,7 @@ struct MCentral void runtime·MCentral_Init(MCentral *c, int32 sizeclass); int32 runtime·MCentral_AllocList(MCentral *c, int32 n, MLink **first); void runtime·MCentral_FreeList(MCentral *c, int32 n, MLink *first); +void runtime·MCentral_FreeSpan(MCentral *c, MSpan *s, int32 n, MLink *start, MLink *end); // Main malloc heap. // The heap itself is the "free[]" and "large" arrays, @@ -351,7 +401,9 @@ struct MHeap Lock; MSpan free[MaxMHeapList]; // free lists of given length MSpan large; // free lists length >= MaxMHeapList - MSpan *allspans; + MSpan **allspans; + uint32 nspan; + uint32 nspancap; // span lookup MSpan *map[1<<MHeapMap_Bits]; @@ -375,10 +427,10 @@ struct MHeap FixAlloc spanalloc; // allocator for Span* FixAlloc cachealloc; // allocator for MCache* }; -extern MHeap runtime·mheap; +extern MHeap *runtime·mheap; void runtime·MHeap_Init(MHeap *h, void *(*allocator)(uintptr)); -MSpan* runtime·MHeap_Alloc(MHeap *h, uintptr npage, int32 sizeclass, int32 acct); +MSpan* runtime·MHeap_Alloc(MHeap *h, uintptr npage, int32 sizeclass, int32 acct, int32 zeroed); void runtime·MHeap_Free(MHeap *h, MSpan *s, int32 acct); MSpan* runtime·MHeap_Lookup(MHeap *h, void *v); MSpan* runtime·MHeap_LookupMaybe(MHeap *h, void *v); @@ -394,12 +446,18 @@ void runtime·markallocated(void *v, uintptr n, bool noptr); void runtime·checkallocated(void *v, uintptr n); void runtime·markfreed(void *v, uintptr n); void runtime·checkfreed(void *v, uintptr n); -int32 runtime·checking; +extern int32 runtime·checking; void runtime·markspan(void *v, uintptr size, uintptr n, bool leftover); void runtime·unmarkspan(void *v, uintptr size); bool runtime·blockspecial(void*); void runtime·setblockspecial(void*, bool); -void runtime·purgecachedstats(M*); +void runtime·purgecachedstats(MCache*); +void* runtime·cnew(Type*); + +void runtime·settype(void*, uintptr); +void runtime·settype_flush(M*, bool); +void runtime·settype_sysfree(MSpan*); +uintptr runtime·gettype(void*); enum { @@ -412,8 +470,26 @@ enum void runtime·MProf_Malloc(void*, uintptr); void runtime·MProf_Free(void*, uintptr); void runtime·MProf_GC(void); -int32 runtime·helpgc(bool*); +int32 runtime·gcprocs(void); +void runtime·helpgc(int32 nproc); void runtime·gchelper(void); -bool runtime·getfinalizer(void *p, bool del, void (**fn)(void*), int32 *nret); +bool runtime·getfinalizer(void *p, bool del, FuncVal **fn, uintptr *nret); void runtime·walkfintab(void (*fn)(void*)); + +enum +{ + TypeInfo_SingleObject = 0, + TypeInfo_Array = 1, + TypeInfo_Map = 2, + TypeInfo_Chan = 3, + + // Enables type information at the end of blocks allocated from heap + DebugTypeAtBlockEnd = 0, +}; + +// defined in mgc0.go +void runtime·gc_m_ptr(Eface*); +void runtime·gc_itab_ptr(Eface*); + +void 
runtime·memorydump(void); diff --git a/src/pkg/runtime/mallocrep1.go b/src/pkg/runtime/mallocrep1.go index 41c104c0b..bc33e3a6b 100644 --- a/src/pkg/runtime/mallocrep1.go +++ b/src/pkg/runtime/mallocrep1.go @@ -39,6 +39,7 @@ func OkAmount(size, n uintptr) bool { } func AllocAndFree(size, count int) { + defer runtime.GOMAXPROCS(runtime.GOMAXPROCS(1)) if *chatty { fmt.Printf("size=%d count=%d ...\n", size, count) } diff --git a/src/pkg/runtime/mcache.c b/src/pkg/runtime/mcache.c index 518e00c12..64803e703 100644 --- a/src/pkg/runtime/mcache.c +++ b/src/pkg/runtime/mcache.c @@ -21,7 +21,7 @@ runtime·MCache_Alloc(MCache *c, int32 sizeclass, uintptr size, int32 zeroed) l = &c->list[sizeclass]; if(l->list == nil) { // Replenish using central lists. - n = runtime·MCentral_AllocList(&runtime·mheap.central[sizeclass], + n = runtime·MCentral_AllocList(&runtime·mheap->central[sizeclass], runtime·class_to_transfercount[sizeclass], &first); if(n == 0) runtime·throw("out of memory"); @@ -43,11 +43,6 @@ runtime·MCache_Alloc(MCache *c, int32 sizeclass, uintptr size, int32 zeroed) // block is zeroed iff second word is zero ... if(size > sizeof(uintptr) && ((uintptr*)v)[1] != 0) runtime·memclr((byte*)v, size); - else { - // ... except for the link pointer - // that we used above; zero that. - v->next = nil; - } } c->local_cachealloc += size; c->local_objects++; @@ -74,7 +69,7 @@ ReleaseN(MCache *c, MCacheList *l, int32 n, int32 sizeclass) c->size -= n*runtime·class_to_size[sizeclass]; // Return them to central free list. - runtime·MCentral_FreeList(&runtime·mheap.central[sizeclass], n, first); + runtime·MCentral_FreeList(&runtime·mheap->central[sizeclass], n, first); } void diff --git a/src/pkg/runtime/mcentral.c b/src/pkg/runtime/mcentral.c index ff0c2d11a..ac8b5aa0d 100644 --- a/src/pkg/runtime/mcentral.c +++ b/src/pkg/runtime/mcentral.c @@ -34,12 +34,13 @@ runtime·MCentral_Init(MCentral *c, int32 sizeclass) // Allocate up to n objects from the central free list. // Return the number of objects allocated. // The objects are linked together by their first words. -// On return, *pstart points at the first object and *pend at the last. +// On return, *pstart points at the first object. int32 runtime·MCentral_AllocList(MCentral *c, int32 n, MLink **pfirst) { - MLink *first, *last, *v; - int32 i; + MSpan *s; + MLink *first, *last; + int32 cap, avail, i; runtime·lock(c); // Replenish central list if empty. @@ -50,47 +51,37 @@ runtime·MCentral_AllocList(MCentral *c, int32 n, MLink **pfirst) return 0; } } + s = c->nonempty.next; + cap = (s->npages << PageShift) / s->elemsize; + avail = cap - s->ref; + if(avail < n) + n = avail; - // Copy from list, up to n. // First one is guaranteed to work, because we just grew the list. - first = MCentral_Alloc(c); + first = s->freelist; last = first; - for(i=1; i<n && (v = MCentral_Alloc(c)) != nil; i++) { - last->next = v; - last = v; + for(i=1; i<n; i++) { + last = last->next; } + s->freelist = last->next; last->next = nil; - c->nfree -= i; - - runtime·unlock(c); - *pfirst = first; - return i; -} + s->ref += n; + c->nfree -= n; -// Helper: allocate one object from the central free list. 
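The allocator paths above thread their free lists through the free objects themselves: each free block's first word holds the MLink next pointer, which is why MCache_Alloc can no longer assume that word is zero and why the patch plants "needs zeroing" sentinels. A minimal Go sketch of the same intrusive free-list technique (all names and sizes here are illustrative, not the runtime's):

package main

import (
	"fmt"
	"unsafe"
)

const objSize = 32 // hypothetical size class

type mlink struct{ next *mlink }

func main() {
	// Carve a slab into objSize-byte blocks and thread the free list
	// through the blocks' own first words, as MCentral/MCache do.
	slab := make([]byte, 8*objSize)
	var head *mlink
	for off := len(slab) - objSize; off >= 0; off -= objSize {
		l := (*mlink)(unsafe.Pointer(&slab[off]))
		l.next = head
		head = l
	}
	// Allocate two blocks by popping the list. The link word is dirty
	// after a pop, which is why the runtime tracks "needs zeroing".
	for i := 0; i < 2; i++ {
		obj := head
		head = head.next
		obj.next = nil // scrub the word that served as the link
		fmt.Printf("allocated %d-byte block at %p\n", objSize, unsafe.Pointer(obj))
	}
}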
-static void* -MCentral_Alloc(MCentral *c) -{ - MSpan *s; - MLink *v; - - if(runtime·MSpanList_IsEmpty(&c->nonempty)) - return nil; - s = c->nonempty.next; - s->ref++; - v = s->freelist; - s->freelist = v->next; - if(s->freelist == nil) { + if(n == avail) { + if(s->freelist != nil || s->ref != cap) { + runtime·throw("invalid freelist"); + } runtime·MSpanList_Remove(s); runtime·MSpanList_Insert(&c->empty, s); } - return v; + + runtime·unlock(c); + *pfirst = first; + return n; } // Free n objects back into the central free list. -// Return the number of objects allocated. -// The objects are linked together by their first words. -// On return, *pstart points at the first object and *pend at the last. void runtime·MCentral_FreeList(MCentral *c, int32 n, MLink *start) { @@ -118,7 +109,7 @@ MCentral_Free(MCentral *c, void *v) int32 size; // Find span for v. - s = runtime·MHeap_Lookup(&runtime·mheap, v); + s = runtime·MHeap_Lookup(runtime·mheap, v); if(s == nil || s->ref == 0) runtime·throw("invalid free"); @@ -143,11 +134,47 @@ MCentral_Free(MCentral *c, void *v) s->freelist = nil; c->nfree -= (s->npages << PageShift) / size; runtime·unlock(c); - runtime·MHeap_Free(&runtime·mheap, s, 0); + runtime·MHeap_Free(runtime·mheap, s, 0); runtime·lock(c); } } +// Free n objects from a span s back into the central free list c. +// Called from GC. +void +runtime·MCentral_FreeSpan(MCentral *c, MSpan *s, int32 n, MLink *start, MLink *end) +{ + int32 size; + + runtime·lock(c); + + // Move to nonempty if necessary. + if(s->freelist == nil) { + runtime·MSpanList_Remove(s); + runtime·MSpanList_Insert(&c->nonempty, s); + } + + // Add the objects back to s's free list. + end->next = s->freelist; + s->freelist = start; + s->ref -= n; + c->nfree += n; + + // If s is completely freed, return it to the heap. + if(s->ref == 0) { + size = runtime·class_to_size[c->sizeclass]; + runtime·MSpanList_Remove(s); + *(uintptr*)(s->start<<PageShift) = 1; // needs zeroing + s->freelist = nil; + c->nfree -= (s->npages << PageShift) / size; + runtime·unlock(c); + runtime·unmarkspan((byte*)(s->start<<PageShift), s->npages<<PageShift); + runtime·MHeap_Free(runtime·mheap, s, 0); + } else { + runtime·unlock(c); + } +} + void runtime·MGetSizeClassInfo(int32 sizeclass, uintptr *sizep, int32 *npagesp, int32 *nobj) { @@ -174,7 +201,7 @@ MCentral_Grow(MCentral *c) runtime·unlock(c); runtime·MGetSizeClassInfo(c->sizeclass, &size, &npages, &n); - s = runtime·MHeap_Alloc(&runtime·mheap, npages, c->sizeclass, 0); + s = runtime·MHeap_Alloc(runtime·mheap, npages, c->sizeclass, 0, 1); if(s == nil) { // TODO(rsc): Log out of memory runtime·lock(c); diff --git a/src/pkg/runtime/mem.go b/src/pkg/runtime/mem.go index 76680086c..79edc5a60 100644 --- a/src/pkg/runtime/mem.go +++ b/src/pkg/runtime/mem.go @@ -6,6 +6,9 @@ package runtime import "unsafe" +// Note: the MemStats struct should be kept in sync with +// struct MStats in malloc.h + // A MemStats records statistics about the memory allocator. type MemStats struct { // General statistics. 
@@ -39,7 +42,7 @@ type MemStats struct { NextGC uint64 // next run in HeapAlloc time (bytes) LastGC uint64 // last run in absolute time (ns) PauseTotalNs uint64 - PauseNs [256]uint64 // most recent GC pause times + PauseNs [256]uint64 // circular buffer of recent GC pause times, most recent at [(NumGC+255)%256] NumGC uint32 EnableGC bool DebugGC bool diff --git a/src/pkg/runtime/mem_darwin.c b/src/pkg/runtime/mem_darwin.c index cde5601cf..04e719394 100644 --- a/src/pkg/runtime/mem_darwin.c +++ b/src/pkg/runtime/mem_darwin.c @@ -14,7 +14,7 @@ runtime·SysAlloc(uintptr n) void *v; mstats.sys += n; - v = runtime·mmap(nil, n, PROT_READ|PROT_WRITE|PROT_EXEC, MAP_ANON|MAP_PRIVATE, -1, 0); + v = runtime·mmap(nil, n, PROT_READ|PROT_WRITE, MAP_ANON|MAP_PRIVATE, -1, 0); if(v < (void*)4096) return nil; return v; @@ -51,7 +51,7 @@ runtime·SysMap(void *v, uintptr n) void *p; mstats.sys += n; - p = runtime·mmap(v, n, PROT_READ|PROT_WRITE|PROT_EXEC, MAP_ANON|MAP_FIXED|MAP_PRIVATE, -1, 0); + p = runtime·mmap(v, n, PROT_READ|PROT_WRITE, MAP_ANON|MAP_FIXED|MAP_PRIVATE, -1, 0); if(p == (void*)-ENOMEM) runtime·throw("runtime: out of memory"); if(p != v) diff --git a/src/pkg/runtime/mem_freebsd.c b/src/pkg/runtime/mem_freebsd.c index d1c22583d..f217e9db1 100644 --- a/src/pkg/runtime/mem_freebsd.c +++ b/src/pkg/runtime/mem_freebsd.c @@ -14,7 +14,7 @@ runtime·SysAlloc(uintptr n) void *v; mstats.sys += n; - v = runtime·mmap(nil, n, PROT_READ|PROT_WRITE|PROT_EXEC, MAP_ANON|MAP_PRIVATE, -1, 0); + v = runtime·mmap(nil, n, PROT_READ|PROT_WRITE, MAP_ANON|MAP_PRIVATE, -1, 0); if(v < (void*)4096) return nil; return v; @@ -23,9 +23,7 @@ runtime·SysAlloc(uintptr n) void runtime·SysUnused(void *v, uintptr n) { - USED(v); - USED(n); - // TODO(rsc): call madvise MADV_DONTNEED + runtime·madvise(v, n, MADV_FREE); } void @@ -61,7 +59,7 @@ runtime·SysMap(void *v, uintptr n) // On 64-bit, we don't actually have v reserved, so tread carefully. if(sizeof(void*) == 8) { - p = runtime·mmap(v, n, PROT_READ|PROT_WRITE|PROT_EXEC, MAP_ANON|MAP_PRIVATE, -1, 0); + p = runtime·mmap(v, n, PROT_READ|PROT_WRITE, MAP_ANON|MAP_PRIVATE, -1, 0); if(p == (void*)-ENOMEM) runtime·throw("runtime: out of memory"); if(p != v) { @@ -71,7 +69,7 @@ runtime·SysMap(void *v, uintptr n) return; } - p = runtime·mmap(v, n, PROT_READ|PROT_WRITE|PROT_EXEC, MAP_ANON|MAP_FIXED|MAP_PRIVATE, -1, 0); + p = runtime·mmap(v, n, PROT_READ|PROT_WRITE, MAP_ANON|MAP_FIXED|MAP_PRIVATE, -1, 0); if(p == (void*)-ENOMEM) runtime·throw("runtime: out of memory"); if(p != v) diff --git a/src/pkg/runtime/mem_linux.c b/src/pkg/runtime/mem_linux.c index b3e79cc41..ebcec1e86 100644 --- a/src/pkg/runtime/mem_linux.c +++ b/src/pkg/runtime/mem_linux.c @@ -10,6 +10,7 @@ enum { + EAGAIN = 11, ENOMEM = 12, _PAGE_SIZE = 4096, }; @@ -56,13 +57,17 @@ runtime·SysAlloc(uintptr n) void *p; mstats.sys += n; - p = runtime·mmap(nil, n, PROT_READ|PROT_WRITE|PROT_EXEC, MAP_ANON|MAP_PRIVATE, -1, 0); + p = runtime·mmap(nil, n, PROT_READ|PROT_WRITE, MAP_ANON|MAP_PRIVATE, -1, 0); if(p < (void*)4096) { if(p == (void*)EACCES) { runtime·printf("runtime: mmap: access denied\n"); runtime·printf("if you're running SELinux, enable execmem for this process.\n"); runtime·exit(2); } + if(p == (void*)EAGAIN) { + runtime·printf("runtime: mmap: too much locked memory (check 'ulimit -l').\n"); + runtime·exit(2); + } return nil; } return p; @@ -113,7 +118,7 @@ runtime·SysMap(void *v, uintptr n) // On 64-bit, we don't actually have v reserved, so tread carefully. 
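The new PauseNs comment above pins down the ring-buffer layout that was previously implicit. A small runnable example of reading the most recent pause from a Go program, using only the documented index formula and the standard runtime API:

package main

import (
	"fmt"
	"runtime"
)

func main() {
	runtime.GC() // ensure at least one collection has happened
	var ms runtime.MemStats
	runtime.ReadMemStats(&ms)
	if ms.NumGC > 0 {
		// Per the comment: most recent pause at [(NumGC+255)%256],
		// with older pauses walking backwards through the ring.
		last := ms.PauseNs[(ms.NumGC+255)%256]
		fmt.Printf("collections: %d, last pause: %d ns\n", ms.NumGC, last)
	}
}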
if(sizeof(void*) == 8 && (uintptr)v >= 0xffffffffU) { - p = mmap_fixed(v, n, PROT_READ|PROT_WRITE|PROT_EXEC, MAP_ANON|MAP_PRIVATE, -1, 0); + p = mmap_fixed(v, n, PROT_READ|PROT_WRITE, MAP_ANON|MAP_PRIVATE, -1, 0); if(p == (void*)ENOMEM) runtime·throw("runtime: out of memory"); if(p != v) { @@ -123,7 +128,7 @@ runtime·SysMap(void *v, uintptr n) return; } - p = runtime·mmap(v, n, PROT_READ|PROT_WRITE|PROT_EXEC, MAP_ANON|MAP_FIXED|MAP_PRIVATE, -1, 0); + p = runtime·mmap(v, n, PROT_READ|PROT_WRITE, MAP_ANON|MAP_FIXED|MAP_PRIVATE, -1, 0); if(p == (void*)ENOMEM) runtime·throw("runtime: out of memory"); if(p != v) diff --git a/src/pkg/runtime/mem_netbsd.c b/src/pkg/runtime/mem_netbsd.c index 34ff31d90..77ce04c4e 100644 --- a/src/pkg/runtime/mem_netbsd.c +++ b/src/pkg/runtime/mem_netbsd.c @@ -19,7 +19,7 @@ runtime·SysAlloc(uintptr n) void *v; mstats.sys += n; - v = runtime·mmap(nil, n, PROT_READ|PROT_WRITE|PROT_EXEC, MAP_ANON|MAP_PRIVATE, -1, 0); + v = runtime·mmap(nil, n, PROT_READ|PROT_WRITE, MAP_ANON|MAP_PRIVATE, -1, 0); if(v < (void*)4096) return nil; return v; @@ -28,9 +28,7 @@ runtime·SysAlloc(uintptr n) void runtime·SysUnused(void *v, uintptr n) { - USED(v); - USED(n); - // TODO(rsc): call madvise MADV_DONTNEED + runtime·madvise(v, n, MADV_FREE); } void @@ -67,7 +65,7 @@ runtime·SysMap(void *v, uintptr n) // On 64-bit, we don't actually have v reserved, so tread carefully. if(sizeof(void*) == 8) { - p = runtime·mmap(v, n, PROT_READ|PROT_WRITE|PROT_EXEC, MAP_ANON|MAP_PRIVATE, -1, 0); + p = runtime·mmap(v, n, PROT_READ|PROT_WRITE, MAP_ANON|MAP_PRIVATE, -1, 0); if(p == (void*)-ENOMEM) runtime·throw("runtime: out of memory"); if(p != v) { @@ -77,7 +75,7 @@ runtime·SysMap(void *v, uintptr n) return; } - p = runtime·mmap(v, n, PROT_READ|PROT_WRITE|PROT_EXEC, MAP_ANON|MAP_FIXED|MAP_PRIVATE, -1, 0); + p = runtime·mmap(v, n, PROT_READ|PROT_WRITE, MAP_ANON|MAP_FIXED|MAP_PRIVATE, -1, 0); if(p == (void*)-ENOMEM) runtime·throw("runtime: out of memory"); if(p != v) diff --git a/src/pkg/runtime/mem_openbsd.c b/src/pkg/runtime/mem_openbsd.c index 34ff31d90..77ce04c4e 100644 --- a/src/pkg/runtime/mem_openbsd.c +++ b/src/pkg/runtime/mem_openbsd.c @@ -19,7 +19,7 @@ runtime·SysAlloc(uintptr n) void *v; mstats.sys += n; - v = runtime·mmap(nil, n, PROT_READ|PROT_WRITE|PROT_EXEC, MAP_ANON|MAP_PRIVATE, -1, 0); + v = runtime·mmap(nil, n, PROT_READ|PROT_WRITE, MAP_ANON|MAP_PRIVATE, -1, 0); if(v < (void*)4096) return nil; return v; @@ -28,9 +28,7 @@ runtime·SysAlloc(uintptr n) void runtime·SysUnused(void *v, uintptr n) { - USED(v); - USED(n); - // TODO(rsc): call madvise MADV_DONTNEED + runtime·madvise(v, n, MADV_FREE); } void @@ -67,7 +65,7 @@ runtime·SysMap(void *v, uintptr n) // On 64-bit, we don't actually have v reserved, so tread carefully. 
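The mmap hunks above drop PROT_EXEC from the heap mappings, and SysUnused now actually hints pages back to the OS with madvise(MADV_FREE) instead of carrying a TODO. A hedged sketch of the same reserve-then-hint pattern from user code; Linux-only, and using MADV_DONTNEED since MADV_FREE is not available on every platform's syscall package:

//go:build linux

package main

import (
	"fmt"
	"syscall"
)

func main() {
	const n = 1 << 20
	// Anonymous private mapping, readable and writable only — note the
	// absence of PROT_EXEC, matching the change above.
	mem, err := syscall.Mmap(-1, 0, n,
		syscall.PROT_READ|syscall.PROT_WRITE,
		syscall.MAP_ANON|syscall.MAP_PRIVATE)
	if err != nil {
		panic(err)
	}
	mem[0] = 1 // touch a page so there is something to give back
	// SysUnused-style hint: the kernel may reclaim these pages and will
	// hand back zeroed ones on the next touch.
	if err := syscall.Madvise(mem, syscall.MADV_DONTNEED); err != nil {
		panic(err)
	}
	fmt.Printf("mapped and released %d bytes\n", len(mem))
}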
if(sizeof(void*) == 8) { - p = runtime·mmap(v, n, PROT_READ|PROT_WRITE|PROT_EXEC, MAP_ANON|MAP_PRIVATE, -1, 0); + p = runtime·mmap(v, n, PROT_READ|PROT_WRITE, MAP_ANON|MAP_PRIVATE, -1, 0); if(p == (void*)-ENOMEM) runtime·throw("runtime: out of memory"); if(p != v) { @@ -77,7 +75,7 @@ runtime·SysMap(void *v, uintptr n) return; } - p = runtime·mmap(v, n, PROT_READ|PROT_WRITE|PROT_EXEC, MAP_ANON|MAP_FIXED|MAP_PRIVATE, -1, 0); + p = runtime·mmap(v, n, PROT_READ|PROT_WRITE, MAP_ANON|MAP_FIXED|MAP_PRIVATE, -1, 0); if(p == (void*)-ENOMEM) runtime·throw("runtime: out of memory"); if(p != v) diff --git a/src/pkg/runtime/mem_plan9.c b/src/pkg/runtime/mem_plan9.c index 15cbc176b..26ca367f1 100644 --- a/src/pkg/runtime/mem_plan9.c +++ b/src/pkg/runtime/mem_plan9.c @@ -3,6 +3,7 @@ // license that can be found in the LICENSE file. #include "runtime.h" +#include "defs_GOOS_GOARCH.h" #include "arch_GOARCH.h" #include "malloc.h" #include "os_GOOS.h" @@ -13,14 +14,14 @@ static Lock memlock; enum { - Round = 4095 + Round = PAGESIZE-1 }; void* runtime·SysAlloc(uintptr nbytes) { uintptr bl; - + runtime·lock(&memlock); mstats.sys += nbytes; // Plan 9 sbrk from /sys/src/libc/9sys/sbrk.c diff --git a/src/pkg/runtime/memset_arm.s b/src/pkg/runtime/memclr_arm.s index 974b8da7a..afc529d90 100644 --- a/src/pkg/runtime/memset_arm.s +++ b/src/pkg/runtime/memclr_arm.s @@ -23,35 +23,32 @@ // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN // THE SOFTWARE. -TO = 1 -TOE = 2 -N = 3 -TMP = 3 /* N and TMP don't overlap */ - -// TODO(kaib): memset clobbers R9 and R10 (m and g). This makes the -// registers unpredictable if (when) memset SIGSEGV's. Fix it by -// moving the R4-R11 register bank. -TEXT runtime·memset(SB), $0 - MOVW R0, R(TO) - MOVW data+4(FP), R(4) - MOVW n+8(FP), R(N) +TO = 8 +TOE = 11 +N = 12 +TMP = 12 /* N and TMP don't overlap */ + +TEXT runtime·memclr(SB),7,$0 + MOVW ptr+0(FP), R(TO) + MOVW n+4(FP), R(N) + MOVW $0, R(0) ADD R(N), R(TO), R(TOE) /* to end pointer */ CMP $4, R(N) /* need at least 4 bytes to copy */ BLT _1tail - AND $0xFF, R(4) /* it's a byte */ - SLL $8, R(4), R(TMP) /* replicate to a word */ - ORR R(TMP), R(4) - SLL $16, R(4), R(TMP) - ORR R(TMP), R(4) + AND $0xFF, R(0) /* it's a byte */ + SLL $8, R(0), R(TMP) /* replicate to a word */ + ORR R(TMP), R(0) + SLL $16, R(0), R(TMP) + ORR R(TMP), R(0) _4align: /* align on 4 */ AND.S $3, R(TO), R(TMP) BEQ _4aligned - MOVBU.P R(4), 1(R(TO)) /* implicit write back */ + MOVBU.P R(0), 1(R(TO)) /* implicit write back */ B _4align _4aligned: @@ -59,19 +56,19 @@ _4aligned: CMP R(TMP), R(TO) BHS _4tail - MOVW R4, R5 /* replicate */ - MOVW R4, R6 - MOVW R4, R7 - MOVW R4, R8 - MOVW R4, R9 - MOVW R4, R10 - MOVW R4, R11 + MOVW R0, R1 /* replicate */ + MOVW R0, R2 + MOVW R0, R3 + MOVW R0, R4 + MOVW R0, R5 + MOVW R0, R6 + MOVW R0, R7 _f32loop: CMP R(TMP), R(TO) BHS _4tail - MOVM.IA.W [R4-R11], (R(TO)) + MOVM.IA.W [R0-R7], (R(TO)) B _f32loop _4tail: @@ -80,14 +77,14 @@ _4loop: CMP R(TMP), R(TO) BHS _1tail - MOVW.P R(4), 4(R(TO)) /* implicit write back */ + MOVW.P R(0), 4(R(TO)) /* implicit write back */ B _4loop _1tail: CMP R(TO), R(TOE) BEQ _return - MOVBU.P R(4), 1(R(TO)) /* implicit write back */ + MOVBU.P R(0), 1(R(TO)) /* implicit write back */ B _1tail _return: diff --git a/src/pkg/runtime/memmove_arm.s b/src/pkg/runtime/memmove_arm.s index 5c0e57404..c5d7e9d70 100644 --- a/src/pkg/runtime/memmove_arm.s +++ b/src/pkg/runtime/memmove_arm.s @@ -23,19 +23,40 @@ // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS 
IN // THE SOFTWARE. +// TE or TS are spilled to the stack during bulk register moves. TS = 0 -TE = 1 -FROM = 2 -N = 3 -TMP = 3 /* N and TMP don't overlap */ -TMP1 = 4 - -// TODO(kaib): This can be done with the existing registers of LR is re-used. Same for memset. -TEXT runtime·memmove(SB), 7, $8 - // save g and m - MOVW R9, 4(R13) - MOVW R10, 8(R13) - +TE = 8 + +// Warning: the linker will use R11 to synthesize certain instructions. Please +// take care and double check with objdump. +FROM = 11 +N = 12 +TMP = 12 /* N and TMP don't overlap */ +TMP1 = 5 + +RSHIFT = 5 +LSHIFT = 6 +OFFSET = 7 + +BR0 = 0 /* shared with TS */ +BW0 = 1 +BR1 = 1 +BW1 = 2 +BR2 = 2 +BW2 = 3 +BR3 = 3 +BW3 = 4 + +FW0 = 1 +FR0 = 2 +FW1 = 2 +FR1 = 3 +FW2 = 3 +FR2 = 4 +FW3 = 4 +FR3 = 8 /* shared with TE */ + +TEXT runtime·memmove(SB), 7, $4 _memmove: MOVW to+0(FP), R(TS) MOVW from+4(FP), R(FROM) @@ -64,15 +85,17 @@ _b4aligned: /* is source now aligned? */ BNE _bunaligned ADD $31, R(TS), R(TMP) /* do 32-byte chunks if possible */ + MOVW R(TS), savedts+4(SP) _b32loop: CMP R(TMP), R(TE) BLS _b4tail - MOVM.DB.W (R(FROM)), [R4-R11] - MOVM.DB.W [R4-R11], (R(TE)) + MOVM.DB.W (R(FROM)), [R0-R7] + MOVM.DB.W [R0-R7], (R(TE)) B _b32loop _b4tail: /* do remaining words if possible */ + MOVW savedts+4(SP), R(TS) ADD $3, R(TS), R(TMP) _b4loop: CMP R(TMP), R(TE) @@ -107,22 +130,24 @@ _f4aligned: /* is source now aligned? */ BNE _funaligned SUB $31, R(TE), R(TMP) /* do 32-byte chunks if possible */ + MOVW R(TE), savedte+4(SP) _f32loop: CMP R(TMP), R(TS) BHS _f4tail - MOVM.IA.W (R(FROM)), [R4-R11] - MOVM.IA.W [R4-R11], (R(TS)) + MOVM.IA.W (R(FROM)), [R1-R8] + MOVM.IA.W [R1-R8], (R(TS)) B _f32loop _f4tail: + MOVW savedte+4(SP), R(TE) SUB $3, R(TE), R(TMP) /* do remaining words if possible */ _f4loop: CMP R(TMP), R(TS) BHS _f1tail MOVW.P 4(R(FROM)), R(TMP1) /* implicit write back */ - MOVW.P R4, 4(R(TS)) /* implicit write back */ + MOVW.P R(TMP1), 4(R(TS)) /* implicit write back */ B _f4loop _f1tail: @@ -134,25 +159,9 @@ _f1tail: B _f1tail _return: - // restore g and m - MOVW 4(R13), R9 - MOVW 8(R13), R10 MOVW to+0(FP), R0 RET -RSHIFT = 4 -LSHIFT = 5 -OFFSET = 6 - -BR0 = 7 -BW0 = 8 -BR1 = 8 -BW1 = 9 -BR2 = 9 -BW2 = 10 -BR3 = 10 -BW3 = 11 - _bunaligned: CMP $2, R(TMP) /* is R(TMP) < 2 ? 
*/ @@ -172,7 +181,8 @@ _bunaligned: CMP R(TMP), R(TE) BLS _b1tail - AND $~0x03, R(FROM) /* align source */ + BIC $3, R(FROM) /* align source */ + MOVW R(TS), savedts+4(SP) MOVW (R(FROM)), R(BR0) /* prime first block register */ _bu16loop: @@ -196,18 +206,10 @@ _bu16loop: B _bu16loop _bu1tail: + MOVW savedts+4(SP), R(TS) ADD R(OFFSET), R(FROM) B _b1tail -FW0 = 7 -FR0 = 8 -FW1 = 8 -FR1 = 9 -FW2 = 9 -FR2 = 10 -FW3 = 10 -FR3 = 11 - _funaligned: CMP $2, R(TMP) @@ -227,7 +229,8 @@ _funaligned: CMP R(TMP), R(TS) BHS _f1tail - AND $~0x03, R(FROM) /* align source */ + BIC $3, R(FROM) /* align source */ + MOVW R(TE), savedte+4(SP) MOVW.P 4(R(FROM)), R(FR3) /* prime last block register, implicit write back */ _fu16loop: @@ -235,7 +238,7 @@ _fu16loop: BHS _fu1tail MOVW R(FR3)>>R(RSHIFT), R(FW0) - MOVM.IA.W (R(FROM)), [R(FR0)-R(FR3)] + MOVM.IA.W (R(FROM)), [R(FR0),R(FR1),R(FR2),R(FR3)] ORR R(FR0)<<R(LSHIFT), R(FW0) MOVW R(FR0)>>R(RSHIFT), R(FW1) @@ -247,9 +250,10 @@ _fu16loop: MOVW R(FR2)>>R(RSHIFT), R(FW3) ORR R(FR3)<<R(LSHIFT), R(FW3) - MOVM.IA.W [R(FW0)-R(FW3)], (R(TS)) + MOVM.IA.W [R(FW0),R(FW1),R(FW2),R(FW3)], (R(TS)) B _fu16loop _fu1tail: + MOVW savedte+4(SP), R(TE) SUB R(OFFSET), R(FROM) B _f1tail diff --git a/src/pkg/runtime/mfinal.c b/src/pkg/runtime/mfinal.c index 1fa5ea401..2f5e4277d 100644 --- a/src/pkg/runtime/mfinal.c +++ b/src/pkg/runtime/mfinal.c @@ -11,8 +11,8 @@ enum { debug = 0 }; typedef struct Fin Fin; struct Fin { - void (*fn)(void*); - int32 nret; + FuncVal *fn; + uintptr nret; }; // Finalizer hash table. Direct hash, linear scan, at most 3/4 full. @@ -42,7 +42,7 @@ static struct { } fintab[TABSZ]; static void -addfintab(Fintab *t, void *k, void (*fn)(void*), int32 nret) +addfintab(Fintab *t, void *k, FuncVal *fn, uintptr nret) { int32 i, j; @@ -137,7 +137,7 @@ resizefintab(Fintab *tab) } bool -runtime·addfinalizer(void *p, void (*f)(void*), int32 nret) +runtime·addfinalizer(void *p, FuncVal *f, uintptr nret) { Fintab *tab; byte *base; @@ -175,7 +175,7 @@ runtime·addfinalizer(void *p, void (*f)(void*), int32 nret) // get finalizer; if del, delete finalizer. // caller is responsible for updating RefHasFinalizer (special) bit. bool -runtime·getfinalizer(void *p, bool del, void (**fn)(void*), int32 *nret) +runtime·getfinalizer(void *p, bool del, FuncVal **fn, uintptr *nret) { Fintab *tab; bool res; diff --git a/src/pkg/runtime/mgc0.c b/src/pkg/runtime/mgc0.c index e8fb266f4..010f9cd96 100644 --- a/src/pkg/runtime/mgc0.c +++ b/src/pkg/runtime/mgc0.c @@ -8,15 +8,28 @@ #include "arch_GOARCH.h" #include "malloc.h" #include "stack.h" +#include "mgc0.h" +#include "race.h" +#include "type.h" +#include "typekind.h" +#include "hashmap.h" enum { Debug = 0, - PtrSize = sizeof(void*), DebugMark = 0, // run second pass to check mark + CollectStats = 0, // Four bits per word (see #defines below). wordsPerBitmapWord = sizeof(void*)*8/4, bitShift = sizeof(void*)*8/4, + + handoffThreshold = 4, + IntermediateBufferCapacity = 64, + + // Bits in type information + PRECISE = 1, + LOOP = 2, + PC_BITS = PRECISE | LOOP, }; // Bits in per-word bitmap. @@ -67,25 +80,34 @@ enum { // uint32 runtime·worldsema = 1; -// TODO: Make these per-M. -static uint64 nhandoff; - static int32 gctrace; +typedef struct Obj Obj; +struct Obj +{ + byte *p; // data pointer + uintptr n; // size of data in bytes + uintptr ti; // type info +}; + +// The size of Workbuf is N*PageSize. 
typedef struct Workbuf Workbuf; struct Workbuf { - Workbuf *next; +#define SIZE (2*PageSize-sizeof(LFNode)-sizeof(uintptr)) + LFNode node; // must be first uintptr nobj; - byte *obj[512-2]; + Obj obj[SIZE/sizeof(Obj) - 1]; + uint8 _padding[SIZE%sizeof(Obj) + sizeof(Obj)]; +#undef SIZE }; typedef struct Finalizer Finalizer; struct Finalizer { - void (*fn)(void*); + FuncVal *fn; void *arg; - int32 nret; + uintptr nret; }; typedef struct FinBlock FinBlock; @@ -99,9 +121,13 @@ struct FinBlock }; extern byte data[]; -extern byte etext[]; +extern byte edata[]; +extern byte bss[]; extern byte ebss[]; +extern byte gcdata[]; +extern byte gcbss[]; + static G *fing; static FinBlock *finq; // list of finalizers that are to be executed static FinBlock *finc; // cache of free blocks @@ -116,89 +142,244 @@ static void putempty(Workbuf*); static Workbuf* handoff(Workbuf*); static struct { - Lock fmu; - Workbuf *full; - Lock emu; - Workbuf *empty; + uint64 full; // lock-free list of full blocks + uint64 empty; // lock-free list of empty blocks + byte pad0[CacheLineSize]; // prevents false-sharing between full/empty and nproc/nwait uint32 nproc; volatile uint32 nwait; volatile uint32 ndone; + volatile uint32 debugmarkdone; Note alldone; - Lock markgate; - Lock sweepgate; - MSpan *spans; + ParFor *markfor; + ParFor *sweepfor; Lock; byte *chunk; uintptr nchunk; + + Obj *roots; + uint32 nroot; + uint32 rootcap; } work; -// scanblock scans a block of n bytes starting at pointer b for references -// to other objects, scanning any it finds recursively until there are no -// unscanned objects left. Instead of using an explicit recursion, it keeps -// a work list in the Workbuf* structures and loops in the main function -// body. Keeping an explicit work list is easier on the stack allocator and -// more efficient. +enum { + GC_DEFAULT_PTR = GC_NUM_INSTR, + GC_MAP_NEXT, + GC_CHAN, + + GC_NUM_INSTR2 +}; + +static struct { + struct { + uint64 sum; + uint64 cnt; + } ptr; + uint64 nbytes; + struct { + uint64 sum; + uint64 cnt; + uint64 notype; + uint64 typelookup; + } obj; + uint64 rescan; + uint64 rescanbytes; + uint64 instr[GC_NUM_INSTR2]; + uint64 putempty; + uint64 getfull; +} gcstats; + +// markonly marks an object. It returns true if the object +// has been marked by this function, false otherwise. +// This function isn't thread-safe and doesn't append the object to any buffer. +static bool +markonly(void *obj) +{ + byte *p; + uintptr *bitp, bits, shift, x, xbits, off; + MSpan *s; + PageID k; + + // Words outside the arena cannot be pointers. + if(obj < runtime·mheap->arena_start || obj >= runtime·mheap->arena_used) + return false; + + // obj may be a pointer to a live object. + // Try to find the beginning of the object. + + // Round down to word boundary. + obj = (void*)((uintptr)obj & ~((uintptr)PtrSize-1)); + + // Find bits for this word. + off = (uintptr*)obj - (uintptr*)runtime·mheap->arena_start; + bitp = (uintptr*)runtime·mheap->arena_start - off/wordsPerBitmapWord - 1; + shift = off % wordsPerBitmapWord; + xbits = *bitp; + bits = xbits >> shift; + + // Pointing at the beginning of a block? + if((bits & (bitAllocated|bitBlockBoundary)) != 0) + goto found; + + // Otherwise consult span table to find beginning. + // (Manually inlined copy of MHeap_LookupMaybe.) 
+ k = (uintptr)obj>>PageShift; + x = k; + if(sizeof(void*) == 8) + x -= (uintptr)runtime·mheap->arena_start>>PageShift; + s = runtime·mheap->map[x]; + if(s == nil || k < s->start || k - s->start >= s->npages || s->state != MSpanInUse) + return false; + p = (byte*)((uintptr)s->start<<PageShift); + if(s->sizeclass == 0) { + obj = p; + } else { + if((byte*)obj >= (byte*)s->limit) + return false; + uintptr size = s->elemsize; + int32 i = ((byte*)obj - p)/size; + obj = p+i*size; + } + + // Now that we know the object header, reload bits. + off = (uintptr*)obj - (uintptr*)runtime·mheap->arena_start; + bitp = (uintptr*)runtime·mheap->arena_start - off/wordsPerBitmapWord - 1; + shift = off % wordsPerBitmapWord; + xbits = *bitp; + bits = xbits >> shift; + +found: + // Now we have bits, bitp, and shift correct for + // obj pointing at the base of the object. + // Only care about allocated and not marked. + if((bits & (bitAllocated|bitMarked)) != bitAllocated) + return false; + *bitp |= bitMarked<<shift; + + // The object is now marked + return true; +} + +// PtrTarget and BitTarget are structures used by intermediate buffers. +// The intermediate buffers hold GC data before it +// is moved/flushed to the work buffer (Workbuf). +// The size of an intermediate buffer is very small, +// such as 32 or 64 elements. +typedef struct PtrTarget PtrTarget; +struct PtrTarget +{ + void *p; + uintptr ti; +}; + +typedef struct BitTarget BitTarget; +struct BitTarget +{ + void *p; + uintptr ti; + uintptr *bitp, shift; +}; + +typedef struct BufferList BufferList; +struct BufferList +{ + PtrTarget ptrtarget[IntermediateBufferCapacity]; + BitTarget bittarget[IntermediateBufferCapacity]; + Obj obj[IntermediateBufferCapacity]; + BufferList *next; +}; +static BufferList *bufferList; + +static Lock lock; +static Type *itabtype; + +static void enqueue(Obj obj, Workbuf **_wbuf, Obj **_wp, uintptr *_nobj); + +// flushptrbuf moves data from the PtrTarget buffer to the work buffer. +// The PtrTarget buffer contains blocks irrespective of whether the blocks have been marked or scanned, +// while the work buffer contains blocks which have been marked +// and are prepared to be scanned by the garbage collector. +// +// _wp, _wbuf, _nobj are input/output parameters and are specifying the work buffer. +// bitbuf holds temporary data generated by this function. +// +// A simplified drawing explaining how the todo-list moves from a structure to another: +// +// scanblock +// (find pointers) +// Obj ------> PtrTarget (pointer targets) +// ↑ | +// | | flushptrbuf (1st part, +// | | find block start) +// | ↓ +// `--------- BitTarget (pointer targets and the corresponding locations in bitmap) +// flushptrbuf +// (2nd part, mark and enqueue) static void -scanblock(byte *b, int64 n) +flushptrbuf(PtrTarget *ptrbuf, PtrTarget **ptrbufpos, Obj **_wp, Workbuf **_wbuf, uintptr *_nobj, BitTarget *bitbuf) { - byte *obj, *arena_start, *arena_used, *p; - void **vp; - uintptr size, *bitp, bits, shift, i, j, x, xbits, off, nobj, nproc; + byte *p, *arena_start, *obj; + uintptr size, *bitp, bits, shift, j, x, xbits, off, nobj, ti, n; MSpan *s; PageID k; - void **wp; + Obj *wp; Workbuf *wbuf; - bool keepworking; + PtrTarget *ptrbuf_end; + BitTarget *bitbufpos, *bt; - if((int64)(uintptr)n != n || n < 0) { - runtime·printf("scanblock %p %D\n", b, n); - runtime·throw("scanblock"); - } + arena_start = runtime·mheap->arena_start; - // Memory arena parameters. 
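flushptrbuf's comment and diagram above describe a two-stage pipeline: candidate pointers accumulate in a small PtrTarget buffer, and marking plus work-queueing then happen once per batch under a single lock acquisition. A simplified, hypothetical Go model of that batching idea — a map stands in for the heap bitmap, and the real code computes bit positions rather than hashing:

package main

import (
	"fmt"
	"sync"
)

const bufCap = 64 // mirrors IntermediateBufferCapacity

type ptrTarget struct {
	p  uintptr // candidate pointer
	ti uintptr // its type info, if known
}

type marker struct {
	mu     sync.Mutex
	marked map[uintptr]bool // stand-in for the heap bitmap
	work   []ptrTarget      // stand-in for the Workbuf queue
	buf    []ptrTarget      // the intermediate PtrTarget buffer
}

func (m *marker) enqueue(p, ti uintptr) {
	m.buf = append(m.buf, ptrTarget{p, ti})
	if len(m.buf) == bufCap {
		m.flush()
	}
}

func (m *marker) flush() {
	m.mu.Lock() // one lock round-trip per batch, not per pointer
	for _, t := range m.buf {
		if m.marked[t.p] {
			continue // already marked: nothing more to do
		}
		m.marked[t.p] = true       // mark...
		m.work = append(m.work, t) // ...and queue for scanning
	}
	m.mu.Unlock()
	m.buf = m.buf[:0]
}

func main() {
	m := &marker{marked: map[uintptr]bool{}}
	for i := uintptr(0); i < 100; i += 8 {
		m.enqueue(0x1000+i, 0)
	}
	m.flush()
	fmt.Println("queued", len(m.work), "objects for scanning")
}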
- arena_start = runtime·mheap.arena_start; - arena_used = runtime·mheap.arena_used; - nproc = work.nproc; + wp = *_wp; + wbuf = *_wbuf; + nobj = *_nobj; - wbuf = nil; // current work buffer - wp = nil; // storage for next queued pointer (write pointer) - nobj = 0; // number of queued objects + ptrbuf_end = *ptrbufpos; + n = ptrbuf_end - ptrbuf; + *ptrbufpos = ptrbuf; - // Scanblock helpers pass b==nil. - // The main proc needs to return to make more - // calls to scanblock. But if work.nproc==1 then - // might as well process blocks as soon as we - // have them. - keepworking = b == nil || work.nproc == 1; + if(CollectStats) { + runtime·xadd64(&gcstats.ptr.sum, n); + runtime·xadd64(&gcstats.ptr.cnt, 1); + } - // Align b to a word boundary. - off = (uintptr)b & (PtrSize-1); - if(off != 0) { - b += PtrSize - off; - n -= PtrSize - off; + // If buffer is nearly full, get a new one. + if(wbuf == nil || nobj+n >= nelem(wbuf->obj)) { + if(wbuf != nil) + wbuf->nobj = nobj; + wbuf = getempty(wbuf); + wp = wbuf->obj; + nobj = 0; + + if(n >= nelem(wbuf->obj)) + runtime·throw("ptrbuf has to be smaller than WorkBuf"); } - for(;;) { - // Each iteration scans the block b of length n, queueing pointers in - // the work buffer. - if(Debug > 1) - runtime·printf("scanblock %p %D\n", b, n); + // TODO(atom): This block is a branch of an if-then-else statement. + // The single-threaded branch may be added in a next CL. + { + // Multi-threaded version. - vp = (void**)b; - n >>= (2+PtrSize/8); /* n /= PtrSize (4 or 8) */ - for(i=0; i<n; i++) { - obj = (byte*)vp[i]; + bitbufpos = bitbuf; - // Words outside the arena cannot be pointers. - if((byte*)obj < arena_start || (byte*)obj >= arena_used) - continue; + while(ptrbuf < ptrbuf_end) { + obj = ptrbuf->p; + ti = ptrbuf->ti; + ptrbuf++; + + // obj belongs to interval [mheap.arena_start, mheap.arena_used). + if(Debug > 1) { + if(obj < runtime·mheap->arena_start || obj >= runtime·mheap->arena_used) + runtime·throw("object is outside of mheap"); + } // obj may be a pointer to a live object. // Try to find the beginning of the object. // Round down to word boundary. - obj = (void*)((uintptr)obj & ~((uintptr)PtrSize-1)); + if(((uintptr)obj & ((uintptr)PtrSize-1)) != 0) { + obj = (void*)((uintptr)obj & ~((uintptr)PtrSize-1)); + ti = 0; + } // Find bits for this word. off = (uintptr*)obj - (uintptr*)arena_start; @@ -211,6 +392,8 @@ scanblock(byte *b, int64 n) if((bits & (bitAllocated|bitBlockBoundary)) != 0) goto found; + ti = 0; + // Pointing just past the beginning? // Scan backward a little to find a block boundary. for(j=shift; j-->0; ) { @@ -228,16 +411,16 @@ scanblock(byte *b, int64 n) x = k; if(sizeof(void*) == 8) x -= (uintptr)arena_start>>PageShift; - s = runtime·mheap.map[x]; + s = runtime·mheap->map[x]; if(s == nil || k < s->start || k - s->start >= s->npages || s->state != MSpanInUse) continue; - p = (byte*)((uintptr)s->start<<PageShift); + p = (byte*)((uintptr)s->start<<PageShift); if(s->sizeclass == 0) { obj = p; } else { if((byte*)obj >= (byte*)s->limit) continue; - size = runtime·class_to_size[s->sizeclass]; + size = s->elemsize; int32 i = ((byte*)obj - p)/size; obj = p+i*size; } @@ -255,80 +438,606 @@ scanblock(byte *b, int64 n) // Only care about allocated and not marked. 
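Both markonly and flushptrbuf resolve interior pointers the same way: look up the owning span, then round the pointer down to a multiple of s->elemsize from the span base. A minimal sketch of that arithmetic with made-up numbers (the sizeclass==0 large-object case, where the whole span is one object, is elided):

package main

import "fmt"

const pageShift = 12

type span struct {
	start    uintptr // first page number of the span
	elemsize uintptr // object size for this span's size class
}

// objectBase mirrors: i = (p - spanbase)/size; obj = spanbase + i*size.
func objectBase(s *span, p uintptr) uintptr {
	base := s.start << pageShift
	i := (p - base) / s.elemsize
	return base + i*s.elemsize
}

func main() {
	s := &span{start: 0x400, elemsize: 48}
	p := (s.start << pageShift) + 130 // interior pointer into object #2
	fmt.Printf("%#x rounds down to object base %#x\n", p, objectBase(s, p))
}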
if((bits & (bitAllocated|bitMarked)) != bitAllocated) continue; - if(nproc == 1) - *bitp |= bitMarked<<shift; - else { - for(;;) { - x = *bitp; - if(x & (bitMarked<<shift)) - goto continue_obj; - if(runtime·casp((void**)bitp, (void*)x, (void*)(x|(bitMarked<<shift)))) - break; - } - } + + *bitbufpos++ = (BitTarget){obj, ti, bitp, shift}; + } + + runtime·lock(&lock); + for(bt=bitbuf; bt<bitbufpos; bt++){ + xbits = *bt->bitp; + bits = xbits >> bt->shift; + if((bits & bitMarked) != 0) + continue; + + // Mark the block + *bt->bitp = xbits | (bitMarked << bt->shift); // If object has no pointers, don't need to scan further. if((bits & bitNoPointers) != 0) continue; - // If another proc wants a pointer, give it some. - if(nobj > 4 && work.nwait > 0 && work.full == nil) { + obj = bt->p; + + // Ask span about size class. + // (Manually inlined copy of MHeap_Lookup.) + x = (uintptr)obj >> PageShift; + if(sizeof(void*) == 8) + x -= (uintptr)arena_start>>PageShift; + s = runtime·mheap->map[x]; + + PREFETCH(obj); + + *wp = (Obj){obj, s->elemsize, bt->ti}; + wp++; + nobj++; + } + runtime·unlock(&lock); + + // If another proc wants a pointer, give it some. + if(work.nwait > 0 && nobj > handoffThreshold && work.full == 0) { + wbuf->nobj = nobj; + wbuf = handoff(wbuf); + nobj = wbuf->nobj; + wp = wbuf->obj + nobj; + } + } + + *_wp = wp; + *_wbuf = wbuf; + *_nobj = nobj; +} + +static void +flushobjbuf(Obj *objbuf, Obj **objbufpos, Obj **_wp, Workbuf **_wbuf, uintptr *_nobj) +{ + uintptr nobj, off; + Obj *wp, obj; + Workbuf *wbuf; + Obj *objbuf_end; + + wp = *_wp; + wbuf = *_wbuf; + nobj = *_nobj; + + objbuf_end = *objbufpos; + *objbufpos = objbuf; + + while(objbuf < objbuf_end) { + obj = *objbuf++; + + // Align obj.b to a word boundary. + off = (uintptr)obj.p & (PtrSize-1); + if(off != 0) { + obj.p += PtrSize - off; + obj.n -= PtrSize - off; + obj.ti = 0; + } + + if(obj.p == nil || obj.n == 0) + continue; + + // If buffer is full, get a new one. + if(wbuf == nil || nobj >= nelem(wbuf->obj)) { + if(wbuf != nil) wbuf->nobj = nobj; - wbuf = handoff(wbuf); - nobj = wbuf->nobj; - wp = wbuf->obj + nobj; + wbuf = getempty(wbuf); + wp = wbuf->obj; + nobj = 0; + } + + *wp = obj; + wp++; + nobj++; + } + + // If another proc wants a pointer, give it some. + if(work.nwait > 0 && nobj > handoffThreshold && work.full == 0) { + wbuf->nobj = nobj; + wbuf = handoff(wbuf); + nobj = wbuf->nobj; + wp = wbuf->obj + nobj; + } + + *_wp = wp; + *_wbuf = wbuf; + *_nobj = nobj; +} + +// Program that scans the whole block and treats every block element as a potential pointer +static uintptr defaultProg[2] = {PtrSize, GC_DEFAULT_PTR}; + +// Hashmap iterator program +static uintptr mapProg[2] = {0, GC_MAP_NEXT}; + +// Hchan program +static uintptr chanProg[2] = {0, GC_CHAN}; + +// Local variables of a program fragment or loop +typedef struct Frame Frame; +struct Frame { + uintptr count, elemsize, b; + uintptr *loop_or_ret; +}; + +// scanblock scans a block of n bytes starting at pointer b for references +// to other objects, scanning any it finds recursively until there are no +// unscanned objects left. Instead of using an explicit recursion, it keeps +// a work list in the Workbuf* structures and loops in the main function +// body. Keeping an explicit work list is easier on the stack allocator and +// more efficient. 
+// +// wbuf: current work buffer +// wp: storage for next queued pointer (write pointer) +// nobj: number of queued objects +static void +scanblock(Workbuf *wbuf, Obj *wp, uintptr nobj, bool keepworking) +{ + byte *b, *arena_start, *arena_used; + uintptr n, i, end_b, elemsize, size, ti, objti, count, type; + uintptr *pc, precise_type, nominal_size; + uintptr *map_ret, mapkey_size, mapval_size, mapkey_ti, mapval_ti; + void *obj; + Type *t; + Slice *sliceptr; + Frame *stack_ptr, stack_top, stack[GC_STACK_CAPACITY+4]; + BufferList *scanbuffers; + PtrTarget *ptrbuf, *ptrbuf_end, *ptrbufpos; + BitTarget *bitbuf; + Obj *objbuf, *objbuf_end, *objbufpos; + Eface *eface; + Iface *iface; + Hmap *hmap; + MapType *maptype; + bool didmark, mapkey_kind, mapval_kind; + struct hash_gciter map_iter; + struct hash_gciter_data d; + Hchan *chan; + ChanType *chantype; + + if(sizeof(Workbuf) % PageSize != 0) + runtime·throw("scanblock: size of Workbuf is suboptimal"); + + // Memory arena parameters. + arena_start = runtime·mheap->arena_start; + arena_used = runtime·mheap->arena_used; + + stack_ptr = stack+nelem(stack)-1; + + precise_type = false; + nominal_size = 0; + + // Allocate ptrbuf, bitbuf + { + runtime·lock(&lock); + + if(bufferList == nil) { + bufferList = runtime·SysAlloc(sizeof(*bufferList)); + if(bufferList == nil) + runtime·throw("runtime: cannot allocate memory"); + bufferList->next = nil; + } + scanbuffers = bufferList; + bufferList = bufferList->next; + + ptrbuf = &scanbuffers->ptrtarget[0]; + ptrbuf_end = &scanbuffers->ptrtarget[0] + nelem(scanbuffers->ptrtarget); + bitbuf = &scanbuffers->bittarget[0]; + objbuf = &scanbuffers->obj[0]; + objbuf_end = &scanbuffers->obj[0] + nelem(scanbuffers->obj); + + runtime·unlock(&lock); + } + + ptrbufpos = ptrbuf; + objbufpos = objbuf; + + // (Silence the compiler) + map_ret = nil; + mapkey_size = mapval_size = 0; + mapkey_kind = mapval_kind = false; + mapkey_ti = mapval_ti = 0; + chan = nil; + chantype = nil; + + goto next_block; + + for(;;) { + // Each iteration scans the block b of length n, queueing pointers in + // the work buffer. 
+ if(Debug > 1) { + runtime·printf("scanblock %p %D\n", b, (int64)n); + } + + if(CollectStats) { + runtime·xadd64(&gcstats.nbytes, n); + runtime·xadd64(&gcstats.obj.sum, nobj); + runtime·xadd64(&gcstats.obj.cnt, 1); + } + + if(ti != 0) { + pc = (uintptr*)(ti & ~(uintptr)PC_BITS); + precise_type = (ti & PRECISE); + stack_top.elemsize = pc[0]; + if(!precise_type) + nominal_size = pc[0]; + if(ti & LOOP) { + stack_top.count = 0; // 0 means an infinite number of iterations + stack_top.loop_or_ret = pc+1; + } else { + stack_top.count = 1; } + } else if(UseSpanType) { + if(CollectStats) + runtime·xadd64(&gcstats.obj.notype, 1); + + type = runtime·gettype(b); + if(type != 0) { + if(CollectStats) + runtime·xadd64(&gcstats.obj.typelookup, 1); + + t = (Type*)(type & ~(uintptr)(PtrSize-1)); + switch(type & (PtrSize-1)) { + case TypeInfo_SingleObject: + pc = (uintptr*)t->gc; + precise_type = true; // type information about 'b' is precise + stack_top.count = 1; + stack_top.elemsize = pc[0]; + break; + case TypeInfo_Array: + pc = (uintptr*)t->gc; + if(pc[0] == 0) + goto next_block; + precise_type = true; // type information about 'b' is precise + stack_top.count = 0; // 0 means an infinite number of iterations + stack_top.elemsize = pc[0]; + stack_top.loop_or_ret = pc+1; + break; + case TypeInfo_Map: + hmap = (Hmap*)b; + maptype = (MapType*)t; + if(hash_gciter_init(hmap, &map_iter)) { + mapkey_size = maptype->key->size; + mapkey_kind = maptype->key->kind; + mapkey_ti = (uintptr)maptype->key->gc | PRECISE; + mapval_size = maptype->elem->size; + mapval_kind = maptype->elem->kind; + mapval_ti = (uintptr)maptype->elem->gc | PRECISE; + + map_ret = 0; + pc = mapProg; + } else { + goto next_block; + } + break; + case TypeInfo_Chan: + chan = (Hchan*)b; + chantype = (ChanType*)t; + pc = chanProg; + break; + default: + runtime·throw("scanblock: invalid type"); + return; + } + } else { + pc = defaultProg; + } + } else { + pc = defaultProg; + } + + pc++; + stack_top.b = (uintptr)b; - // If buffer is full, get a new one. 
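The dispatch above relies on runtime·gettype returning a word whose low bits carry one of the TypeInfo_* kinds while the remaining bits form a word-aligned Type pointer. A hypothetical illustration of that tagged-pointer encoding (converting a uintptr back to a pointer is not GC-safe in general; this is illustration only):

package main

import (
	"fmt"
	"unsafe"
)

const (
	typeInfoSingleObject = 0
	typeInfoArray        = 1
	typeInfoMap          = 2
	typeInfoChan         = 3
)

type typ struct{ name string }

// settype packs a 2-bit kind into the free low bits of a word-aligned
// descriptor pointer, in the spirit of runtime·settype/runtime·gettype.
func settype(t *typ, kind uintptr) uintptr {
	p := uintptr(unsafe.Pointer(t))
	if p&(unsafe.Sizeof(p)-1) != 0 {
		panic("descriptor not word-aligned")
	}
	return p | kind
}

func gettype(w uintptr) (*typ, uintptr) {
	mask := unsafe.Sizeof(w) - 1
	return (*typ)(unsafe.Pointer(w &^ mask)), w & mask
}

func main() {
	w := settype(&typ{name: "[]int"}, typeInfoArray)
	t, kind := gettype(w)
	fmt.Println(t.name, kind) // []int 1
}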
- if(wbuf == nil || nobj >= nelem(wbuf->obj)) { - if(wbuf != nil) - wbuf->nobj = nobj; - wbuf = getempty(wbuf); - wp = wbuf->obj; - nobj = 0; + end_b = (uintptr)b + n - PtrSize; + + for(;;) { + if(CollectStats) + runtime·xadd64(&gcstats.instr[pc[0]], 1); + + obj = nil; + objti = 0; + switch(pc[0]) { + case GC_PTR: + obj = *(void**)(stack_top.b + pc[1]); + objti = pc[2]; + pc += 3; + break; + + case GC_SLICE: + sliceptr = (Slice*)(stack_top.b + pc[1]); + if(sliceptr->cap != 0) { + obj = sliceptr->array; + objti = pc[2] | PRECISE | LOOP; } - *wp++ = obj; - nobj++; - continue_obj:; + pc += 3; + break; + + case GC_APTR: + obj = *(void**)(stack_top.b + pc[1]); + pc += 2; + break; + + case GC_STRING: + obj = *(void**)(stack_top.b + pc[1]); + pc += 2; + break; + + case GC_EFACE: + eface = (Eface*)(stack_top.b + pc[1]); + pc += 2; + if(eface->type != nil && (eface->data >= arena_start && eface->data < arena_used)) { + t = eface->type; + if(t->size <= sizeof(void*)) { + if((t->kind & KindNoPointers)) + break; + + obj = eface->data; + if((t->kind & ~KindNoPointers) == KindPtr) + objti = (uintptr)((PtrType*)t)->elem->gc; + } else { + obj = eface->data; + objti = (uintptr)t->gc; + } + } + break; + + case GC_IFACE: + iface = (Iface*)(stack_top.b + pc[1]); + pc += 2; + if(iface->tab == nil) + break; + + // iface->tab + if((void*)iface->tab >= arena_start && (void*)iface->tab < arena_used) { + *ptrbufpos++ = (PtrTarget){iface->tab, (uintptr)itabtype->gc}; + if(ptrbufpos == ptrbuf_end) + flushptrbuf(ptrbuf, &ptrbufpos, &wp, &wbuf, &nobj, bitbuf); + } + + // iface->data + if(iface->data >= arena_start && iface->data < arena_used) { + t = iface->tab->type; + if(t->size <= sizeof(void*)) { + if((t->kind & KindNoPointers)) + break; + + obj = iface->data; + if((t->kind & ~KindNoPointers) == KindPtr) + objti = (uintptr)((PtrType*)t)->elem->gc; + } else { + obj = iface->data; + objti = (uintptr)t->gc; + } + } + break; + + case GC_DEFAULT_PTR: + while((i = stack_top.b) <= end_b) { + stack_top.b += PtrSize; + obj = *(byte**)i; + if(obj >= arena_start && obj < arena_used) { + *ptrbufpos++ = (PtrTarget){obj, 0}; + if(ptrbufpos == ptrbuf_end) + flushptrbuf(ptrbuf, &ptrbufpos, &wp, &wbuf, &nobj, bitbuf); + } + } + goto next_block; + + case GC_END: + if(--stack_top.count != 0) { + // Next iteration of a loop if possible. + elemsize = stack_top.elemsize; + stack_top.b += elemsize; + if(stack_top.b + elemsize <= end_b+PtrSize) { + pc = stack_top.loop_or_ret; + continue; + } + i = stack_top.b; + } else { + // Stack pop if possible. + if(stack_ptr+1 < stack+nelem(stack)) { + pc = stack_top.loop_or_ret; + stack_top = *(++stack_ptr); + continue; + } + i = (uintptr)b + nominal_size; + } + if(!precise_type) { + // Quickly scan [b+i,b+n) for possible pointers. + for(; i<=end_b; i+=PtrSize) { + if(*(byte**)i != nil) { + // Found a value that may be a pointer. + // Do a rescan of the entire block. + enqueue((Obj){b, n, 0}, &wbuf, &wp, &nobj); + if(CollectStats) { + runtime·xadd64(&gcstats.rescan, 1); + runtime·xadd64(&gcstats.rescanbytes, n); + } + break; + } + } + } + goto next_block; + + case GC_ARRAY_START: + i = stack_top.b + pc[1]; + count = pc[2]; + elemsize = pc[3]; + pc += 4; + + // Stack push. + *stack_ptr-- = stack_top; + stack_top = (Frame){count, elemsize, i, pc}; + continue; + + case GC_ARRAY_NEXT: + if(--stack_top.count != 0) { + stack_top.b += stack_top.elemsize; + pc = stack_top.loop_or_ret; + } else { + // Stack pop. + stack_top = *(++stack_ptr); + pc += 1; + } + continue; + + case GC_CALL: + // Stack push. 
+ *stack_ptr-- = stack_top; + stack_top = (Frame){1, 0, stack_top.b + pc[1], pc+3 /*return address*/}; + pc = (uintptr*)((byte*)pc + *(int32*)(pc+2)); // target of the CALL instruction + continue; + + case GC_MAP_PTR: + hmap = *(Hmap**)(stack_top.b + pc[1]); + if(hmap == nil) { + pc += 3; + continue; + } + runtime·lock(&lock); + didmark = markonly(hmap); + runtime·unlock(&lock); + if(didmark) { + maptype = (MapType*)pc[2]; + if(hash_gciter_init(hmap, &map_iter)) { + mapkey_size = maptype->key->size; + mapkey_kind = maptype->key->kind; + mapkey_ti = (uintptr)maptype->key->gc | PRECISE; + mapval_size = maptype->elem->size; + mapval_kind = maptype->elem->kind; + mapval_ti = (uintptr)maptype->elem->gc | PRECISE; + + // Start mapProg. + map_ret = pc+3; + pc = mapProg+1; + } else { + pc += 3; + } + } else { + pc += 3; + } + continue; + + case GC_MAP_NEXT: + // Add all keys and values to buffers, mark all subtables. + while(hash_gciter_next(&map_iter, &d)) { + // buffers: reserve space for 2 objects. + if(ptrbufpos+2 >= ptrbuf_end) + flushptrbuf(ptrbuf, &ptrbufpos, &wp, &wbuf, &nobj, bitbuf); + if(objbufpos+2 >= objbuf_end) + flushobjbuf(objbuf, &objbufpos, &wp, &wbuf, &nobj); + + if(d.st != nil) { + runtime·lock(&lock); + markonly(d.st); + runtime·unlock(&lock); + } + if(d.key_data != nil) { + if(!(mapkey_kind & KindNoPointers) || d.indirectkey) { + if(!d.indirectkey) + *objbufpos++ = (Obj){d.key_data, mapkey_size, mapkey_ti}; + else + *ptrbufpos++ = (PtrTarget){*(void**)d.key_data, mapkey_ti}; + } + if(!(mapval_kind & KindNoPointers) || d.indirectval) { + if(!d.indirectval) + *objbufpos++ = (Obj){d.val_data, mapval_size, mapval_ti}; + else + *ptrbufpos++ = (PtrTarget){*(void**)d.val_data, mapval_ti}; + } + } + } + if(map_ret == 0) + goto next_block; + pc = map_ret; + continue; + + case GC_REGION: + obj = (void*)(stack_top.b + pc[1]); + size = pc[2]; + objti = pc[3]; + pc += 4; + + *objbufpos++ = (Obj){obj, size, objti}; + if(objbufpos == objbuf_end) + flushobjbuf(objbuf, &objbufpos, &wp, &wbuf, &nobj); + break; + + case GC_CHAN: + // There are no heap pointers in struct Hchan, + // so we can ignore the leading sizeof(Hchan) bytes. + if(!(chantype->elem->kind & KindNoPointers)) { + // Channel's buffer follows Hchan immediately in memory. + // Size of buffer (cap(c)) is second int in the chan struct. + n = ((uintgo*)chan)[1]; + if(n > 0) { + // TODO(atom): split into two chunks so that only the + // in-use part of the circular buffer is scanned. + // (Channel routines zero the unused part, so the current + // code does not lead to leaks, it's just a little inefficient.) + *objbufpos++ = (Obj){(byte*)chan+runtime·Hchansize, n*chantype->elem->size, + (uintptr)chantype->elem->gc | PRECISE | LOOP}; + if(objbufpos == objbuf_end) + flushobjbuf(objbuf, &objbufpos, &wp, &wbuf, &nobj); + } + } + goto next_block; + + default: + runtime·throw("scanblock: invalid GC instruction"); + return; } + if(obj >= arena_start && obj < arena_used) { + *ptrbufpos++ = (PtrTarget){obj, objti}; + if(ptrbufpos == ptrbuf_end) + flushptrbuf(ptrbuf, &ptrbufpos, &wp, &wbuf, &nobj, bitbuf); + } + } + + next_block: // Done scanning [b, b+n). Prepare for the next iteration of - // the loop by setting b and n to the parameters for the next block. + // the loop by setting b, n, ti to the parameters for the next block. - // Fetch b from the work buffer. 
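Taken together, the GC_* cases turn scanblock into a small interpreter over per-type "programs" of machine words, with an explicit frame stack for nested arrays instead of recursion. A toy interpreter in the same spirit, with invented opcodes and layouts:

package main

import "fmt"

const (
	opPtr   = iota // opPtr, off: the word at off is a pointer
	opArray        // opArray, off, count, elemsize: walk a fixed array
	opNext         // next array element, or pop back to the caller
	opEnd          // end of program
)

type frame struct {
	count, elemsize, base int
	retPC                 int
}

// run returns the offsets of every pointer slot the program describes.
func run(prog []int) (ptrs []int) {
	var stack []frame
	top := frame{count: 1}
	pc := 0
	for {
		switch prog[pc] {
		case opPtr:
			ptrs = append(ptrs, top.base+prog[pc+1])
			pc += 2
		case opArray:
			stack = append(stack, top) // push enclosing frame
			top = frame{count: prog[pc+2], elemsize: prog[pc+3],
				base: top.base + prog[pc+1], retPC: pc + 4}
			pc += 4
		case opNext:
			top.count--
			if top.count > 0 {
				top.base += top.elemsize
				pc = top.retPC // loop over the element body again
			} else {
				top = stack[len(stack)-1] // pop
				stack = stack[:len(stack)-1]
				pc++
			}
		case opEnd:
			return ptrs
		}
	}
}

func main() {
	// A pointer at offset 0, then an array of 3 elements of size 16,
	// each with a pointer at element offset 0.
	prog := []int{opPtr, 0, opArray, 8, 3, 16, opPtr, 0, opNext, opEnd}
	fmt.Println(run(prog)) // [0 8 24 40]
}

In the patch itself the programs live in each Type's gc field (pc = (uintptr*)t->gc), and defaultProg — scan every word as a potential pointer — is the fallback when no type information is available.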
if(nobj == 0) { - if(!keepworking) { - putempty(wbuf); - return; + flushptrbuf(ptrbuf, &ptrbufpos, &wp, &wbuf, &nobj, bitbuf); + flushobjbuf(objbuf, &objbufpos, &wp, &wbuf, &nobj); + + if(nobj == 0) { + if(!keepworking) { + if(wbuf) + putempty(wbuf); + goto endscan; + } + // Emptied our buffer: refill. + wbuf = getfull(wbuf); + if(wbuf == nil) + goto endscan; + nobj = wbuf->nobj; + wp = wbuf->obj + wbuf->nobj; } - // Emptied our buffer: refill. - wbuf = getfull(wbuf); - if(wbuf == nil) - return; - nobj = wbuf->nobj; - wp = wbuf->obj + wbuf->nobj; } - b = *--wp; - nobj--; - // Ask span about size class. - // (Manually inlined copy of MHeap_Lookup.) - x = (uintptr)b>>PageShift; - if(sizeof(void*) == 8) - x -= (uintptr)arena_start>>PageShift; - s = runtime·mheap.map[x]; - if(s->sizeclass == 0) - n = s->npages<<PageShift; - else - n = runtime·class_to_size[s->sizeclass]; + // Fetch b from the work buffer. + --wp; + b = wp->p; + n = wp->n; + ti = wp->ti; + nobj--; } + +endscan: + runtime·lock(&lock); + scanbuffers->next = bufferList; + bufferList = scanbuffers; + runtime·unlock(&lock); } // debug_scanblock is the debug copy of scanblock. // it is simpler, slower, single-threaded, recursive, // and uses bitSpecial as the mark bit. static void -debug_scanblock(byte *b, int64 n) +debug_scanblock(byte *b, uintptr n) { byte *obj, *p; void **vp; @@ -338,8 +1047,8 @@ debug_scanblock(byte *b, int64 n) if(!DebugMark) runtime·throw("debug_scanblock without DebugMark"); - if((int64)(uintptr)n != n || n < 0) { - runtime·printf("debug_scanblock %p %D\n", b, n); + if((intptr)n < 0) { + runtime·printf("debug_scanblock %p %D\n", b, (int64)n); runtime·throw("debug_scanblock"); } @@ -356,33 +1065,31 @@ debug_scanblock(byte *b, int64 n) obj = (byte*)vp[i]; // Words outside the arena cannot be pointers. - if((byte*)obj < runtime·mheap.arena_start || (byte*)obj >= runtime·mheap.arena_used) + if((byte*)obj < runtime·mheap->arena_start || (byte*)obj >= runtime·mheap->arena_used) continue; // Round down to word boundary. obj = (void*)((uintptr)obj & ~((uintptr)PtrSize-1)); // Consult span table to find beginning. - s = runtime·MHeap_LookupMaybe(&runtime·mheap, obj); + s = runtime·MHeap_LookupMaybe(runtime·mheap, obj); if(s == nil) continue; - p = (byte*)((uintptr)s->start<<PageShift); + size = s->elemsize; if(s->sizeclass == 0) { obj = p; - size = (uintptr)s->npages<<PageShift; } else { if((byte*)obj >= (byte*)s->limit) continue; - size = runtime·class_to_size[s->sizeclass]; int32 i = ((byte*)obj - p)/size; obj = p+i*size; } // Now that we know the object header, reload bits. - off = (uintptr*)obj - (uintptr*)runtime·mheap.arena_start; - bitp = (uintptr*)runtime·mheap.arena_start - off/wordsPerBitmapWord - 1; + off = (uintptr*)obj - (uintptr*)runtime·mheap->arena_start; + bitp = (uintptr*)runtime·mheap->arena_start - off/wordsPerBitmapWord - 1; shift = off % wordsPerBitmapWord; xbits = *bitp; bits = xbits >> shift; @@ -404,53 +1111,98 @@ debug_scanblock(byte *b, int64 n) } } +// Append obj to the work buffer. +// _wbuf, _wp, _nobj are input/output parameters and are specifying the work buffer. +static void +enqueue(Obj obj, Workbuf **_wbuf, Obj **_wp, uintptr *_nobj) +{ + uintptr nobj, off; + Obj *wp; + Workbuf *wbuf; + + if(Debug > 1) + runtime·printf("append obj(%p %D %p)\n", obj.p, (int64)obj.n, obj.ti); + + // Align obj.b to a word boundary. 
+ off = (uintptr)obj.p & (PtrSize-1); + if(off != 0) { + obj.p += PtrSize - off; + obj.n -= PtrSize - off; + obj.ti = 0; + } + + if(obj.p == nil || obj.n == 0) + return; + + // Load work buffer state + wp = *_wp; + wbuf = *_wbuf; + nobj = *_nobj; + + // If another proc wants a pointer, give it some. + if(work.nwait > 0 && nobj > handoffThreshold && work.full == 0) { + wbuf->nobj = nobj; + wbuf = handoff(wbuf); + nobj = wbuf->nobj; + wp = wbuf->obj + nobj; + } + + // If buffer is full, get a new one. + if(wbuf == nil || nobj >= nelem(wbuf->obj)) { + if(wbuf != nil) + wbuf->nobj = nobj; + wbuf = getempty(wbuf); + wp = wbuf->obj; + nobj = 0; + } + + *wp = obj; + wp++; + nobj++; + + // Save work buffer state + *_wp = wp; + *_wbuf = wbuf; + *_nobj = nobj; +} + +static void +markroot(ParFor *desc, uint32 i) +{ + Obj *wp; + Workbuf *wbuf; + uintptr nobj; + + USED(&desc); + wp = nil; + wbuf = nil; + nobj = 0; + enqueue(work.roots[i], &wbuf, &wp, &nobj); + scanblock(wbuf, wp, nobj, false); +} + // Get an empty work buffer off the work.empty list, // allocating new buffers as needed. static Workbuf* getempty(Workbuf *b) { - if(work.nproc == 1) { - // Put b on full list. - if(b != nil) { - b->next = work.full; - work.full = b; + if(b != nil) + runtime·lfstackpush(&work.full, &b->node); + b = (Workbuf*)runtime·lfstackpop(&work.empty); + if(b == nil) { + // Need to allocate. + runtime·lock(&work); + if(work.nchunk < sizeof *b) { + work.nchunk = 1<<20; + work.chunk = runtime·SysAlloc(work.nchunk); + if(work.chunk == nil) + runtime·throw("runtime: cannot allocate memory"); } - // Grab from empty list if possible. - b = work.empty; - if(b != nil) { - work.empty = b->next; - goto haveb; - } - } else { - // Put b on full list. - if(b != nil) { - runtime·lock(&work.fmu); - b->next = work.full; - work.full = b; - runtime·unlock(&work.fmu); - } - // Grab from empty list if possible. - runtime·lock(&work.emu); - b = work.empty; - if(b != nil) - work.empty = b->next; - runtime·unlock(&work.emu); - if(b != nil) - goto haveb; + b = (Workbuf*)work.chunk; + work.chunk += sizeof *b; + work.nchunk -= sizeof *b; + runtime·unlock(&work); } - - // Need to allocate. - runtime·lock(&work); - if(work.nchunk < sizeof *b) { - work.nchunk = 1<<20; - work.chunk = runtime·SysAlloc(work.nchunk); - } - b = (Workbuf*)work.chunk; - work.chunk += sizeof *b; - work.nchunk -= sizeof *b; - runtime·unlock(&work); - -haveb: b->nobj = 0; return b; } @@ -458,19 +1210,10 @@ haveb: static void putempty(Workbuf *b) { - if(b == nil) - return; + if(CollectStats) + runtime·xadd64(&gcstats.putempty, 1); - if(work.nproc == 1) { - b->next = work.empty; - work.empty = b; - return; - } - - runtime·lock(&work.emu); - b->next = work.empty; - work.empty = b; - runtime·unlock(&work.emu); + runtime·lfstackpush(&work.empty, &b->node); } // Get a full work buffer off the work.full list, or return nil. @@ -478,63 +1221,37 @@ static Workbuf* getfull(Workbuf *b) { int32 i; - Workbuf *b1; - - if(work.nproc == 1) { - // Put b on empty list. - if(b != nil) { - b->next = work.empty; - work.empty = b; - } - // Grab from full list if possible. - // Since work.nproc==1, no one else is - // going to give us work. - b = work.full; - if(b != nil) - work.full = b->next; - return b; - } - putempty(b); + if(CollectStats) + runtime·xadd64(&gcstats.getfull, 1); - // Grab buffer from full list if possible. 
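work.full and work.empty are now lock-free lists manipulated with runtime·lfstackpush/runtime·lfstackpop. The classic shape of such a list is a Treiber stack: push and pop each retry a single compare-and-swap on the head. A sketch follows; note the runtime's lfstack additionally packs a counter into the head word to defeat the ABA problem, which this simplified version omits:

package main

import (
	"fmt"
	"sync/atomic"
	"unsafe"
)

type node struct {
	next *node
	val  int
}

type lfstack struct {
	head unsafe.Pointer // *node
}

func (s *lfstack) push(n *node) {
	for {
		old := atomic.LoadPointer(&s.head)
		n.next = (*node)(old)
		if atomic.CompareAndSwapPointer(&s.head, old, unsafe.Pointer(n)) {
			return
		}
	}
}

func (s *lfstack) pop() *node {
	for {
		old := atomic.LoadPointer(&s.head)
		if old == nil {
			return nil
		}
		n := (*node)(old)
		if atomic.CompareAndSwapPointer(&s.head, old, unsafe.Pointer(n.next)) {
			return n
		}
	}
}

func main() {
	var s lfstack
	for i := 0; i < 3; i++ {
		s.push(&node{val: i})
	}
	for n := s.pop(); n != nil; n = s.pop() {
		fmt.Println(n.val) // 2, 1, 0: LIFO
	}
}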
- for(;;) { - b1 = work.full; - if(b1 == nil) - break; - runtime·lock(&work.fmu); - if(work.full != nil) { - b1 = work.full; - work.full = b1->next; - runtime·unlock(&work.fmu); - return b1; - } - runtime·unlock(&work.fmu); - } + if(b != nil) + runtime·lfstackpush(&work.empty, &b->node); + b = (Workbuf*)runtime·lfstackpop(&work.full); + if(b != nil || work.nproc == 1) + return b; runtime·xadd(&work.nwait, +1); for(i=0;; i++) { - b1 = work.full; - if(b1 != nil) { - runtime·lock(&work.fmu); - if(work.full != nil) { - runtime·xadd(&work.nwait, -1); - b1 = work.full; - work.full = b1->next; - runtime·unlock(&work.fmu); - return b1; - } - runtime·unlock(&work.fmu); - continue; + if(work.full != 0) { + runtime·xadd(&work.nwait, -1); + b = (Workbuf*)runtime·lfstackpop(&work.full); + if(b != nil) + return b; + runtime·xadd(&work.nwait, +1); } if(work.nwait == work.nproc) return nil; - if(i < 10) + if(i < 10) { + m->gcstats.nprocyield++; runtime·procyield(20); - else if(i < 20) + } else if(i < 20) { + m->gcstats.nosyield++; runtime·osyield(); - else + } else { + m->gcstats.nsleep++; runtime·usleep(100); + } } } @@ -550,20 +1267,40 @@ handoff(Workbuf *b) b->nobj -= n; b1->nobj = n; runtime·memmove(b1->obj, b->obj+b->nobj, n*sizeof b1->obj[0]); - nhandoff += n; + m->gcstats.nhandoff++; + m->gcstats.nhandoffcnt += n; // Put b on full list - let first half of b get stolen. - runtime·lock(&work.fmu); - b->next = work.full; - work.full = b; - runtime·unlock(&work.fmu); - + runtime·lfstackpush(&work.full, &b->node); return b1; } -// Scanstack calls scanblock on each of gp's stack segments. static void -scanstack(void (*scanblock)(byte*, int64), G *gp) +addroot(Obj obj) +{ + uint32 cap; + Obj *new; + + if(work.nroot >= work.rootcap) { + cap = PageSize/sizeof(Obj); + if(cap < 2*work.rootcap) + cap = 2*work.rootcap; + new = (Obj*)runtime·SysAlloc(cap*sizeof(Obj)); + if(new == nil) + runtime·throw("runtime: cannot allocate memory"); + if(work.roots != nil) { + runtime·memmove(new, work.roots, work.rootcap*sizeof(Obj)); + runtime·SysFree(work.roots, work.rootcap*sizeof(Obj)); + } + work.roots = new; + work.rootcap = cap; + } + work.roots[work.nroot] = obj; + work.nroot++; +} + +static void +addstackroots(G *gp) { M *mp; int32 n; @@ -571,7 +1308,7 @@ scanstack(void (*scanblock)(byte*, int64), G *gp) byte *sp, *guard; stk = (Stktop*)gp->stackbase; - guard = gp->stackguard; + guard = (byte*)gp->stackguard; if(gp == g) { // Scanning our own stack: start at &gp. @@ -582,72 +1319,82 @@ scanstack(void (*scanblock)(byte*, int64), G *gp) } else { // Scanning another goroutine's stack. // The goroutine is usually asleep (the world is stopped). - sp = gp->sched.sp; + sp = (byte*)gp->sched.sp; // The exception is that if the goroutine is about to enter or might // have just exited a system call, it may be executing code such // as schedlock and may have needed to start a new stack segment. // Use the stack segment and stack pointer at the time of // the system call instead, since that won't change underfoot. 
- if(gp->gcstack != nil) { + if(gp->gcstack != (uintptr)nil) { stk = (Stktop*)gp->gcstack; - sp = gp->gcsp; - guard = gp->gcguard; + sp = (byte*)gp->gcsp; + guard = (byte*)gp->gcguard; } } - if(Debug > 1) - runtime·printf("scanstack %d %p\n", gp->goid, sp); n = 0; while(stk) { if(sp < guard-StackGuard || (byte*)stk < sp) { - runtime·printf("scanstack inconsistent: g%d#%d sp=%p not in [%p,%p]\n", gp->goid, n, sp, guard-StackGuard, stk); + runtime·printf("scanstack inconsistent: g%D#%d sp=%p not in [%p,%p]\n", gp->goid, n, sp, guard-StackGuard, stk); runtime·throw("scanstack"); } - scanblock(sp, (byte*)stk - sp); - sp = stk->gobuf.sp; + addroot((Obj){sp, (byte*)stk - sp, 0}); + sp = (byte*)stk->gobuf.sp; guard = stk->stackguard; stk = (Stktop*)stk->stackbase; n++; } } -// Markfin calls scanblock on the blocks that have finalizers: -// the things pointed at cannot be freed until the finalizers have run. static void -markfin(void *v) +addfinroots(void *v) { uintptr size; + void *base; size = 0; - if(!runtime·mlookup(v, &v, &size, nil) || !runtime·blockspecial(v)) + if(!runtime·mlookup(v, &base, &size, nil) || !runtime·blockspecial(base)) runtime·throw("mark - finalizer inconsistency"); // do not mark the finalizer block itself. just mark the things it points at. - scanblock(v, size); + addroot((Obj){base, size, 0}); } static void -debug_markfin(void *v) -{ - uintptr size; - - if(!runtime·mlookup(v, &v, &size, nil)) - runtime·throw("debug_mark - finalizer inconsistency"); - debug_scanblock(v, size); -} - -// Mark -static void -mark(void (*scan)(byte*, int64)) +addroots(void) { G *gp; FinBlock *fb; + MSpan *s, **allspans; + uint32 spanidx; + + work.nroot = 0; + + // data & bss + // TODO(atom): load balancing + addroot((Obj){data, edata - data, (uintptr)gcdata}); + addroot((Obj){bss, ebss - bss, (uintptr)gcbss}); + + // MSpan.types + allspans = runtime·mheap->allspans; + for(spanidx=0; spanidx<runtime·mheap->nspan; spanidx++) { + s = allspans[spanidx]; + if(s->state == MSpanInUse) { + switch(s->types.compression) { + case MTypes_Empty: + case MTypes_Single: + break; + case MTypes_Words: + case MTypes_Bytes: + // TODO(atom): consider using defaultProg instead of 0 + addroot((Obj){(byte*)&s->types.data, sizeof(void*), 0}); + break; + } + } + } - // mark data+bss. - scan(data, ebss - data); - - // mark stacks + // stacks for(gp=runtime·allg; gp!=nil; gp=gp->alllink) { switch(gp->status){ default: @@ -658,37 +1405,30 @@ mark(void (*scan)(byte*, int64)) case Grunning: if(gp != g) runtime·throw("mark - world not stopped"); - scanstack(scan, gp); + addstackroots(gp); break; case Grunnable: case Gsyscall: case Gwaiting: - scanstack(scan, gp); + addstackroots(gp); break; } } - // mark things pointed at by objects with finalizers - if(scan == debug_scanblock) - runtime·walkfintab(debug_markfin); - else - runtime·walkfintab(markfin); + runtime·walkfintab(addfinroots); for(fb=allfin; fb; fb=fb->alllink) - scanblock((byte*)fb->fin, fb->cnt*sizeof(fb->fin[0])); - - // in multiproc mode, join in the queued work. 
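addroot/addroots above split marking into two phases: first gather every root (data, bss, stacks, finalizers, span type words) into a flat array that grows by doubling, then let work.markfor hand out indices to the worker threads. A rough Go approximation of that structure, using a plain atomic counter where the runtime uses its work-stealing ParFor:

package main

import (
	"fmt"
	"sync"
	"sync/atomic"
)

type obj struct {
	name string
	size int
}

func main() {
	var roots []obj // addroot appends here; append grows by doubling
	for i := 0; i < 10; i++ {
		roots = append(roots, obj{fmt.Sprintf("root%d", i), 64})
	}

	var next int32 = -1
	var wg sync.WaitGroup
	nproc := 4
	for p := 0; p < nproc; p++ {
		wg.Add(1)
		go func() {
			defer wg.Done()
			for {
				i := atomic.AddInt32(&next, 1)
				if int(i) >= len(roots) {
					return
				}
				// markroot(desc, i): scan one root here.
				_ = roots[i]
			}
		}()
	}
	wg.Wait()
	fmt.Println("scanned", len(roots), "roots")
}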
- scan(nil, 0); + addroot((Obj){(byte*)fb->fin, fb->cnt*sizeof(fb->fin[0]), 0}); } static bool handlespecial(byte *p, uintptr size) { - void (*fn)(void*); - int32 nret; + FuncVal *fn; + uintptr nret; FinBlock *block; Finalizer *f; - + if(!runtime·getfinalizer(p, true, &fn, &nret)) { runtime·setblockspecial(p, false); runtime·MProf_Free(p, size); @@ -699,6 +1439,8 @@ handlespecial(byte *p, uintptr size) if(finq == nil || finq->cnt == finq->cap) { if(finc == nil) { finc = runtime·SysAlloc(PageSize); + if(finc == nil) + runtime·throw("runtime: cannot allocate memory"); finc->cap = (PageSize - sizeof(FinBlock)) / sizeof(Finalizer) + 1; finc->alllink = allfin; allfin = finc; @@ -713,124 +1455,225 @@ handlespecial(byte *p, uintptr size) f->fn = fn; f->nret = nret; f->arg = p; - runtime·unlock(&finlock); + runtime·unlock(&finlock); return true; } // Sweep frees or collects finalizers for blocks not marked in the mark phase. // It clears the mark bits in preparation for the next GC round. static void -sweep(void) +sweepspan(ParFor *desc, uint32 idx) { - MSpan *s; int32 cl, n, npages; uintptr size; byte *p; MCache *c; byte *arena_start; - int64 now; + MLink head, *end; + int32 nfree; + byte *type_data; + byte compression; + uintptr type_data_inc; + MSpan *s; - arena_start = runtime·mheap.arena_start; - now = runtime·nanotime(); + USED(&desc); + s = runtime·mheap->allspans[idx]; + if(s->state != MSpanInUse) + return; + arena_start = runtime·mheap->arena_start; + p = (byte*)(s->start << PageShift); + cl = s->sizeclass; + size = s->elemsize; + if(cl == 0) { + n = 1; + } else { + // Chunk full of small blocks. + npages = runtime·class_to_allocnpages[cl]; + n = (npages << PageShift) / size; + } + nfree = 0; + end = &head; + c = m->mcache; + + type_data = (byte*)s->types.data; + type_data_inc = sizeof(uintptr); + compression = s->types.compression; + switch(compression) { + case MTypes_Bytes: + type_data += 8*sizeof(uintptr); + type_data_inc = 1; + break; + } - for(;;) { - s = work.spans; - if(s == nil) - break; - if(!runtime·casp(&work.spans, s, s->allnext)) - continue; + // Sweep through n objects of given size starting at p. + // This thread owns the span now, so it can manipulate + // the block bitmap without atomic operations. + for(; n > 0; n--, p += size, type_data+=type_data_inc) { + uintptr off, *bitp, shift, bits; - // Stamp newly unused spans. The scavenger will use that - // info to potentially give back some pages to the OS. - if(s->state == MSpanFree && s->unusedsince == 0) - s->unusedsince = now; + off = (uintptr*)p - (uintptr*)arena_start; + bitp = (uintptr*)arena_start - off/wordsPerBitmapWord - 1; + shift = off % wordsPerBitmapWord; + bits = *bitp>>shift; - if(s->state != MSpanInUse) + if((bits & bitAllocated) == 0) continue; - p = (byte*)(s->start << PageShift); - cl = s->sizeclass; + if((bits & bitMarked) != 0) { + if(DebugMark) { + if(!(bits & bitSpecial)) + runtime·printf("found spurious mark on %p\n", p); + *bitp &= ~(bitSpecial<<shift); + } + *bitp &= ~(bitMarked<<shift); + continue; + } + + // Special means it has a finalizer or is being profiled. + // In DebugMark mode, the bit has been coopted so + // we have to assume all blocks are special. + if(DebugMark || (bits & bitSpecial) != 0) { + if(handlespecial(p, size)) + continue; + } + + // Mark freed; restore block boundary bit. + *bitp = (*bitp & ~(bitMask<<shift)) | (bitBlockBoundary<<shift); + if(cl == 0) { - size = s->npages<<PageShift; - n = 1; + // Free large span. 
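[Both the sweep loop here and the mark code rely on the same bitmap addressing: the heap bitmap sits immediately below arena_start and grows downward, with a few flag bits per heap word (the diff uses bitAllocated, bitMarked, bitSpecial, bitBlockBoundary). A sketch of the recurring off/bitp/shift computation; the 4-bits-per-word figure is an assumption read off the arithmetic, not stated in the hunk:

    #include <stdint.h>

    /* One bitmap word covers sizeof(uintptr)*8/4 heap words
     * (16 on 64-bit), assuming 4 bitmap bits per heap word. */
    enum { wordsPerBitmapWord = sizeof(uintptr_t) * 8 / 4 };

    static uintptr_t *
    bitmap_word(void *p, uintptr_t *arena_start, uintptr_t *shift)
    {
        /* word index of p within the arena */
        uintptr_t off = (uintptr_t *)p - arena_start;
        /* bit position of p's flags inside its bitmap word */
        *shift = off % wordsPerBitmapWord;
        /* the bitmap lives just below arena_start and grows down,
         * hence the subtraction */
        return arena_start - off / wordsPerBitmapWord - 1;
    }

A caller then tests, for example, ((*bitp >> shift) & bitAllocated), exactly as the sweep loop above does.]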
+ runtime·unmarkspan(p, 1<<PageShift); + *(uintptr*)p = 1; // needs zeroing + runtime·MHeap_Free(runtime·mheap, s, 1); + c->local_alloc -= size; + c->local_nfree++; } else { - // Chunk full of small blocks. - size = runtime·class_to_size[cl]; - npages = runtime·class_to_allocnpages[cl]; - n = (npages << PageShift) / size; + // Free small object. + switch(compression) { + case MTypes_Words: + *(uintptr*)type_data = 0; + break; + case MTypes_Bytes: + *(byte*)type_data = 0; + break; + } + if(size > sizeof(uintptr)) + ((uintptr*)p)[1] = 1; // mark as "needs to be zeroed" + + end->next = (MLink*)p; + end = (MLink*)p; + nfree++; } + } - // Sweep through n objects of given size starting at p. - // This thread owns the span now, so it can manipulate - // the block bitmap without atomic operations. - for(; n > 0; n--, p += size) { - uintptr off, *bitp, shift, bits; + if(nfree) { + c->local_by_size[cl].nfree += nfree; + c->local_alloc -= size * nfree; + c->local_nfree += nfree; + c->local_cachealloc -= nfree * size; + c->local_objects -= nfree; + runtime·MCentral_FreeSpan(&runtime·mheap->central[cl], s, nfree, head.next, end); + } +} - off = (uintptr*)p - (uintptr*)arena_start; - bitp = (uintptr*)arena_start - off/wordsPerBitmapWord - 1; - shift = off % wordsPerBitmapWord; - bits = *bitp>>shift; +static void +dumpspan(uint32 idx) +{ + int32 sizeclass, n, npages, i, column; + uintptr size; + byte *p; + byte *arena_start; + MSpan *s; + bool allocated, special; - if((bits & bitAllocated) == 0) - continue; + s = runtime·mheap->allspans[idx]; + if(s->state != MSpanInUse) + return; + arena_start = runtime·mheap->arena_start; + p = (byte*)(s->start << PageShift); + sizeclass = s->sizeclass; + size = s->elemsize; + if(sizeclass == 0) { + n = 1; + } else { + npages = runtime·class_to_allocnpages[sizeclass]; + n = (npages << PageShift) / size; + } + + runtime·printf("%p .. %p:\n", p, p+n*size); + column = 0; + for(; n>0; n--, p+=size) { + uintptr off, *bitp, shift, bits; - if((bits & bitMarked) != 0) { - if(DebugMark) { - if(!(bits & bitSpecial)) - runtime·printf("found spurious mark on %p\n", p); - *bitp &= ~(bitSpecial<<shift); - } - *bitp &= ~(bitMarked<<shift); - continue; - } + off = (uintptr*)p - (uintptr*)arena_start; + bitp = (uintptr*)arena_start - off/wordsPerBitmapWord - 1; + shift = off % wordsPerBitmapWord; + bits = *bitp>>shift; - // Special means it has a finalizer or is being profiled. - // In DebugMark mode, the bit has been coopted so - // we have to assume all blocks are special. - if(DebugMark || (bits & bitSpecial) != 0) { - if(handlespecial(p, size)) - continue; + allocated = ((bits & bitAllocated) != 0); + special = ((bits & bitSpecial) != 0); + + for(i=0; i<size; i+=sizeof(void*)) { + if(column == 0) { + runtime·printf("\t"); + } + if(i == 0) { + runtime·printf(allocated ? "(" : "["); + runtime·printf(special ? "@" : ""); + runtime·printf("%p: ", p+i); + } else { + runtime·printf(" "); } - // Mark freed; restore block boundary bit. - *bitp = (*bitp & ~(bitMask<<shift)) | (bitBlockBoundary<<shift); + runtime·printf("%p", *(void**)(p+i)); - c = m->mcache; - if(s->sizeclass == 0) { - // Free large span. - runtime·unmarkspan(p, 1<<PageShift); - *(uintptr*)p = 1; // needs zeroing - runtime·MHeap_Free(&runtime·mheap, s, 1); - } else { - // Free small object. - if(size > sizeof(uintptr)) - ((uintptr*)p)[1] = 1; // mark as "needs to be zeroed" - c->local_by_size[s->sizeclass].nfree++; - runtime·MCache_Free(c, p, s->sizeclass, size); + if(i+sizeof(void*) >= size) { + runtime·printf(allocated ? 
") " : "] "); + } + + column++; + if(column == 8) { + runtime·printf("\n"); + column = 0; } - c->local_alloc -= size; - c->local_nfree++; } } + runtime·printf("\n"); +} + +// A debugging function to dump the contents of memory +void +runtime·memorydump(void) +{ + uint32 spanidx; + + for(spanidx=0; spanidx<runtime·mheap->nspan; spanidx++) { + dumpspan(spanidx); + } } void runtime·gchelper(void) { - // Wait until main proc is ready for mark help. - runtime·lock(&work.markgate); - runtime·unlock(&work.markgate); - scanblock(nil, 0); + // parallel mark for over gc roots + runtime·parfordo(work.markfor); + + // help other threads scan secondary blocks + scanblock(nil, nil, 0, true); - // Wait until main proc is ready for sweep help. - runtime·lock(&work.sweepgate); - runtime·unlock(&work.sweepgate); - sweep(); + if(DebugMark) { + // wait while the main thread executes mark(debug_scanblock) + while(runtime·atomicload(&work.debugmarkdone) == 0) + runtime·usleep(10); + } + runtime·parfordo(work.sweepfor); if(runtime·xadd(&work.ndone, +1) == work.nproc-1) runtime·notewakeup(&work.alldone); } +#define GcpercentUnknown (-2) + // Initialized from $GOGC. GOGC=off means no gc. // // Next gc is after we've allocated an extra amount of @@ -840,33 +1683,36 @@ runtime·gchelper(void) // proportion to the allocation cost. Adjusting gcpercent // just changes the linear constant (and also the amount of // extra memory used). -static int32 gcpercent = -2; +static int32 gcpercent = GcpercentUnknown; static void -stealcache(void) +cachestats(GCStats *stats) { - M *m; - - for(m=runtime·allm; m; m=m->alllink) - runtime·MCache_ReleaseAll(m->mcache); -} - -static void -cachestats(void) -{ - M *m; + M *mp; MCache *c; + P *p, **pp; int32 i; uint64 stacks_inuse; - uint64 stacks_sys; + uint64 *src, *dst; + if(stats) + runtime·memclr((byte*)stats, sizeof(*stats)); stacks_inuse = 0; - stacks_sys = 0; - for(m=runtime·allm; m; m=m->alllink) { - runtime·purgecachedstats(m); - stacks_inuse += m->stackalloc->inuse; - stacks_sys += m->stackalloc->sys; - c = m->mcache; + for(mp=runtime·allm; mp; mp=mp->alllink) { + stacks_inuse += mp->stackinuse*FixedStack; + if(stats) { + src = (uint64*)&mp->gcstats; + dst = (uint64*)stats; + for(i=0; i<sizeof(*stats)/sizeof(uint64); i++) + dst[i] += src[i]; + runtime·memclr((byte*)&mp->gcstats, sizeof(mp->gcstats)); + } + } + for(pp=runtime·allp; p=*pp; pp++) { + c = p->mcache; + if(c==nil) + continue; + runtime·purgecachedstats(c); for(i=0; i<nelem(c->local_by_size); i++) { mstats.by_size[i].nmalloc += c->local_by_size[i].nmalloc; c->local_by_size[i].nmalloc = 0; @@ -875,16 +1721,42 @@ cachestats(void) } } mstats.stacks_inuse = stacks_inuse; - mstats.stacks_sys = stacks_sys; +} + +// Structure of arguments passed to function gc(). +// This allows the arguments to be passed via reflect·call. +struct gc_args +{ + int32 force; +}; + +static void gc(struct gc_args *args); + +static int32 +readgogc(void) +{ + byte *p; + + p = runtime·getenv("GOGC"); + if(p == nil || p[0] == '\0') + return 100; + if(runtime·strcmp(p, (byte*)"off") == 0) + return -1; + return runtime·atoi(p); } void runtime·gc(int32 force) { - int64 t0, t1, t2, t3; - uint64 heap0, heap1, obj0, obj1; byte *p; - bool extra; + struct gc_args a, *ap; + FuncVal gcv; + + // The atomic operations are not atomic if the uint64s + // are not aligned on uint64 boundaries. This has been + // a problem in the past. 
+ if((((uintptr)&work.empty) & 7) != 0) + runtime·throw("runtime: gc work buffer is misaligned"); // The gc is turned off (via enablegc) until // the bootstrap has completed. @@ -897,14 +1769,8 @@ runtime·gc(int32 force) if(!mstats.enablegc || m->locks > 0 || runtime·panicking) return; - if(gcpercent == -2) { // first time through - p = runtime·getenv("GOGC"); - if(p == nil || p[0] == '\0') - gcpercent = 100; - else if(runtime·strcmp(p, (byte*)"off") == 0) - gcpercent = -1; - else - gcpercent = runtime·atoi(p); + if(gcpercent == GcpercentUnknown) { // first time through + gcpercent = readgogc(); p = runtime·getenv("GOGCTRACE"); if(p != nil) @@ -913,103 +1779,171 @@ runtime·gc(int32 force) if(gcpercent < 0) return; + // Run gc on a bigger stack to eliminate + // a potentially large number of calls to runtime·morestack. + a.force = force; + ap = &a; + m->moreframesize_minalloc = StackBig; + gcv.fn = (void*)gc; + reflect·call(&gcv, (byte*)&ap, sizeof(ap)); + + if(gctrace > 1 && !force) { + a.force = 1; + gc(&a); + } +} + +static FuncVal runfinqv = {runfinq}; + +static void +gc(struct gc_args *args) +{ + int64 t0, t1, t2, t3, t4; + uint64 heap0, heap1, obj0, obj1, ninstr; + GCStats stats; + M *mp; + uint32 i; + Eface eface; + runtime·semacquire(&runtime·worldsema); - if(!force && mstats.heap_alloc < mstats.next_gc) { + if(!args->force && mstats.heap_alloc < mstats.next_gc) { runtime·semrelease(&runtime·worldsema); return; } t0 = runtime·nanotime(); - nhandoff = 0; m->gcing = 1; runtime·stoptheworld(); - cachestats(); - heap0 = mstats.heap_alloc; - obj0 = mstats.nmalloc - mstats.nfree; + if(CollectStats) + runtime·memclr((byte*)&gcstats, sizeof(gcstats)); - runtime·lock(&work.markgate); - runtime·lock(&work.sweepgate); + for(mp=runtime·allm; mp; mp=mp->alllink) + runtime·settype_flush(mp, false); - extra = false; - work.nproc = 1; - if(runtime·gomaxprocs > 1 && runtime·ncpu > 1) { - runtime·noteclear(&work.alldone); - work.nproc += runtime·helpgc(&extra); + heap0 = 0; + obj0 = 0; + if(gctrace) { + cachestats(nil); + heap0 = mstats.heap_alloc; + obj0 = mstats.nmalloc - mstats.nfree; } + + m->locks++; // disable gc during mallocs in parforalloc + if(work.markfor == nil) + work.markfor = runtime·parforalloc(MaxGcproc); + if(work.sweepfor == nil) + work.sweepfor = runtime·parforalloc(MaxGcproc); + m->locks--; + + if(itabtype == nil) { + // get C pointer to the Go type "itab" + runtime·gc_itab_ptr(&eface); + itabtype = ((PtrType*)eface.type)->elem; + } + work.nwait = 0; work.ndone = 0; + work.debugmarkdone = 0; + work.nproc = runtime·gcprocs(); + addroots(); + runtime·parforsetup(work.markfor, work.nproc, work.nroot, nil, false, markroot); + runtime·parforsetup(work.sweepfor, work.nproc, runtime·mheap->nspan, nil, true, sweepspan); + if(work.nproc > 1) { + runtime·noteclear(&work.alldone); + runtime·helpgc(work.nproc); + } - runtime·unlock(&work.markgate); // let the helpers in - mark(scanblock); - if(DebugMark) - mark(debug_scanblock); t1 = runtime·nanotime(); - work.spans = runtime·mheap.allspans; - runtime·unlock(&work.sweepgate); // let the helpers in - sweep(); + runtime·parfordo(work.markfor); + scanblock(nil, nil, 0, true); + + if(DebugMark) { + for(i=0; i<work.nroot; i++) + debug_scanblock(work.roots[i].p, work.roots[i].n); + runtime·atomicstore(&work.debugmarkdone, 1); + } + t2 = runtime·nanotime(); + + runtime·parfordo(work.sweepfor); + t3 = runtime·nanotime(); + if(work.nproc > 1) runtime·notesleep(&work.alldone); - t2 = runtime·nanotime(); - stealcache(); - cachestats(); + 
cachestats(&stats); + + stats.nprocyield += work.sweepfor->nprocyield; + stats.nosyield += work.sweepfor->nosyield; + stats.nsleep += work.sweepfor->nsleep; mstats.next_gc = mstats.heap_alloc+mstats.heap_alloc*gcpercent/100; m->gcing = 0; - m->locks++; // disable gc during the mallocs in newproc if(finq != nil) { + m->locks++; // disable gc during the mallocs in newproc // kick off or wake up goroutine to run queued finalizers if(fing == nil) - fing = runtime·newproc1((byte*)runfinq, nil, 0, 0, runtime·gc); + fing = runtime·newproc1(&runfinqv, nil, 0, 0, runtime·gc); else if(fingwait) { fingwait = 0; runtime·ready(fing); } + m->locks--; } - m->locks--; - cachestats(); heap1 = mstats.heap_alloc; obj1 = mstats.nmalloc - mstats.nfree; - t3 = runtime·nanotime(); - mstats.last_gc = t3; - mstats.pause_ns[mstats.numgc%nelem(mstats.pause_ns)] = t3 - t0; - mstats.pause_total_ns += t3 - t0; + t4 = runtime·nanotime(); + mstats.last_gc = t4; + mstats.pause_ns[mstats.numgc%nelem(mstats.pause_ns)] = t4 - t0; + mstats.pause_total_ns += t4 - t0; mstats.numgc++; if(mstats.debuggc) - runtime·printf("pause %D\n", t3-t0); + runtime·printf("pause %D\n", t4-t0); if(gctrace) { - runtime·printf("gc%d(%d): %D+%D+%D ms %D -> %D MB %D -> %D (%D-%D) objects %D handoff\n", - mstats.numgc, work.nproc, (t1-t0)/1000000, (t2-t1)/1000000, (t3-t2)/1000000, + runtime·printf("gc%d(%d): %D+%D+%D ms, %D -> %D MB %D -> %D (%D-%D) objects," + " %D(%D) handoff, %D(%D) steal, %D/%D/%D yields\n", + mstats.numgc, work.nproc, (t2-t1)/1000000, (t3-t2)/1000000, (t1-t0+t4-t3)/1000000, heap0>>20, heap1>>20, obj0, obj1, mstats.nmalloc, mstats.nfree, - nhandoff); + stats.nhandoff, stats.nhandoffcnt, + work.sweepfor->nsteal, work.sweepfor->nstealcnt, + stats.nprocyield, stats.nosyield, stats.nsleep); + if(CollectStats) { + runtime·printf("scan: %D bytes, %D objects, %D untyped, %D types from MSpan\n", + gcstats.nbytes, gcstats.obj.cnt, gcstats.obj.notype, gcstats.obj.typelookup); + if(gcstats.ptr.cnt != 0) + runtime·printf("avg ptrbufsize: %D (%D/%D)\n", + gcstats.ptr.sum/gcstats.ptr.cnt, gcstats.ptr.sum, gcstats.ptr.cnt); + if(gcstats.obj.cnt != 0) + runtime·printf("avg nobj: %D (%D/%D)\n", + gcstats.obj.sum/gcstats.obj.cnt, gcstats.obj.sum, gcstats.obj.cnt); + runtime·printf("rescans: %D, %D bytes\n", gcstats.rescan, gcstats.rescanbytes); + + runtime·printf("instruction counts:\n"); + ninstr = 0; + for(i=0; i<nelem(gcstats.instr); i++) { + runtime·printf("\t%d:\t%D\n", i, gcstats.instr[i]); + ninstr += gcstats.instr[i]; + } + runtime·printf("\ttotal:\t%D\n", ninstr); + + runtime·printf("putempty: %D, getfull: %D\n", gcstats.putempty, gcstats.getfull); + } } - + runtime·MProf_GC(); runtime·semrelease(&runtime·worldsema); + runtime·starttheworld(); - // If we could have used another helper proc, start one now, - // in the hope that it will be available next time. - // It would have been even better to start it before the collection, - // but doing so requires allocating memory, so it's tricky to - // coordinate. This lazy approach works out in practice: - // we don't mind if the first couple gc rounds don't have quite - // the maximum number of procs. 
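[The pacing rule set just after sweep above is the whole GOGC contract: the next collection triggers once the heap has grown gcpercent percent beyond what survived this one. As a worked example:

    #include <stdint.h>

    /* next_gc as assigned above: collect again once the heap has grown
     * by gcpercent percent over what is live now. Callers never pass a
     * negative gcpercent; runtime·gc returns early for GOGC=off. */
    static uint64_t
    next_gc(uint64_t heap_alloc, int32_t gcpercent)
    {
        return heap_alloc + heap_alloc * (uint64_t)gcpercent / 100;
    }
    /* next_gc(4 << 20, 100) == 8 << 20: with the default GOGC=100 a
     * 4 MB live heap schedules the next collection near 8 MB. */
]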
- runtime·starttheworld(extra); - - // give the queued finalizers, if any, a chance to run - if(finq != nil) + // give the queued finalizers, if any, a chance to run + if(finq != nil) runtime·gosched(); - - if(gctrace > 1 && !force) - runtime·gc(1); } void @@ -1022,11 +1956,56 @@ runtime·ReadMemStats(MStats *stats) runtime·semacquire(&runtime·worldsema); m->gcing = 1; runtime·stoptheworld(); - cachestats(); + cachestats(nil); *stats = mstats; m->gcing = 0; runtime·semrelease(&runtime·worldsema); - runtime·starttheworld(false); + runtime·starttheworld(); +} + +void +runtime∕debug·readGCStats(Slice *pauses) +{ + uint64 *p; + uint32 i, n; + + // Calling code in runtime/debug should make the slice large enough. + if(pauses->cap < nelem(mstats.pause_ns)+3) + runtime·throw("runtime: short slice passed to readGCStats"); + + // Pass back: pauses, last gc (absolute time), number of gc, total pause ns. + p = (uint64*)pauses->array; + runtime·lock(runtime·mheap); + n = mstats.numgc; + if(n > nelem(mstats.pause_ns)) + n = nelem(mstats.pause_ns); + + // The pause buffer is circular. The most recent pause is at + // pause_ns[(numgc-1)%nelem(pause_ns)], and then backward + // from there to go back farther in time. We deliver the times + // most recent first (in p[0]). + for(i=0; i<n; i++) + p[i] = mstats.pause_ns[(mstats.numgc-1-i)%nelem(mstats.pause_ns)]; + + p[n] = mstats.last_gc; + p[n+1] = mstats.numgc; + p[n+2] = mstats.pause_total_ns; + runtime·unlock(runtime·mheap); + pauses->len = n+3; +} + +void +runtime∕debug·setGCPercent(intgo in, intgo out) +{ + runtime·lock(runtime·mheap); + if(gcpercent == GcpercentUnknown) + gcpercent = readgogc(); + out = gcpercent; + if(in < 0) + in = -1; + gcpercent = in; + runtime·unlock(runtime·mheap); + FLUSH(&out); } static void @@ -1050,11 +2029,11 @@ runfinq(void) finq = nil; if(fb == nil) { fingwait = 1; - g->status = Gwaiting; - g->waitreason = "finalizer wait"; - runtime·gosched(); + runtime·park(nil, nil, "finalizer wait"); continue; } + if(raceenabled) + runtime·racefingo(); for(; fb; fb=next) { next = fb->next; for(i=0; i<fb->cnt; i++) { @@ -1066,7 +2045,7 @@ runfinq(void) framecap = framesz; } *(void**)frame = f->arg; - reflect·call((byte*)f->fn, frame, sizeof(uintptr) + f->nret); + reflect·call(f->fn, frame, sizeof(uintptr) + f->nret); f->fn = nil; f->arg = nil; } @@ -1088,11 +2067,11 @@ runtime·markallocated(void *v, uintptr n, bool noptr) if(0) runtime·printf("markallocated %p+%p\n", v, n); - if((byte*)v+n > (byte*)runtime·mheap.arena_used || (byte*)v < runtime·mheap.arena_start) + if((byte*)v+n > (byte*)runtime·mheap->arena_used || (byte*)v < runtime·mheap->arena_start) runtime·throw("markallocated: bad pointer"); - off = (uintptr*)v - (uintptr*)runtime·mheap.arena_start; // word offset - b = (uintptr*)runtime·mheap.arena_start - off/wordsPerBitmapWord - 1; + off = (uintptr*)v - (uintptr*)runtime·mheap->arena_start; // word offset + b = (uintptr*)runtime·mheap->arena_start - off/wordsPerBitmapWord - 1; shift = off % wordsPerBitmapWord; for(;;) { @@ -1120,11 +2099,11 @@ runtime·markfreed(void *v, uintptr n) if(0) runtime·printf("markallocated %p+%p\n", v, n); - if((byte*)v+n > (byte*)runtime·mheap.arena_used || (byte*)v < runtime·mheap.arena_start) + if((byte*)v+n > (byte*)runtime·mheap->arena_used || (byte*)v < runtime·mheap->arena_start) runtime·throw("markallocated: bad pointer"); - off = (uintptr*)v - (uintptr*)runtime·mheap.arena_start; // word offset - b = (uintptr*)runtime·mheap.arena_start - off/wordsPerBitmapWord - 1; + off = (uintptr*)v - 
(uintptr*)runtime·mheap->arena_start; // word offset + b = (uintptr*)runtime·mheap->arena_start - off/wordsPerBitmapWord - 1; shift = off % wordsPerBitmapWord; for(;;) { @@ -1150,11 +2129,11 @@ runtime·checkfreed(void *v, uintptr n) if(!runtime·checking) return; - if((byte*)v+n > (byte*)runtime·mheap.arena_used || (byte*)v < runtime·mheap.arena_start) + if((byte*)v+n > (byte*)runtime·mheap->arena_used || (byte*)v < runtime·mheap->arena_start) return; // not allocated, so okay - off = (uintptr*)v - (uintptr*)runtime·mheap.arena_start; // word offset - b = (uintptr*)runtime·mheap.arena_start - off/wordsPerBitmapWord - 1; + off = (uintptr*)v - (uintptr*)runtime·mheap->arena_start; // word offset + b = (uintptr*)runtime·mheap->arena_start - off/wordsPerBitmapWord - 1; shift = off % wordsPerBitmapWord; bits = *b>>shift; @@ -1173,7 +2152,7 @@ runtime·markspan(void *v, uintptr size, uintptr n, bool leftover) uintptr *b, off, shift; byte *p; - if((byte*)v+size*n > (byte*)runtime·mheap.arena_used || (byte*)v < runtime·mheap.arena_start) + if((byte*)v+size*n > (byte*)runtime·mheap->arena_used || (byte*)v < runtime·mheap->arena_start) runtime·throw("markspan: bad pointer"); p = v; @@ -1184,8 +2163,8 @@ runtime·markspan(void *v, uintptr size, uintptr n, bool leftover) // the entire span, and each bitmap word has bits for only // one span, so no other goroutines are changing these // bitmap words. - off = (uintptr*)p - (uintptr*)runtime·mheap.arena_start; // word offset - b = (uintptr*)runtime·mheap.arena_start - off/wordsPerBitmapWord - 1; + off = (uintptr*)p - (uintptr*)runtime·mheap->arena_start; // word offset + b = (uintptr*)runtime·mheap->arena_start - off/wordsPerBitmapWord - 1; shift = off % wordsPerBitmapWord; *b = (*b & ~(bitMask<<shift)) | (bitBlockBoundary<<shift); } @@ -1197,14 +2176,14 @@ runtime·unmarkspan(void *v, uintptr n) { uintptr *p, *b, off; - if((byte*)v+n > (byte*)runtime·mheap.arena_used || (byte*)v < runtime·mheap.arena_start) + if((byte*)v+n > (byte*)runtime·mheap->arena_used || (byte*)v < runtime·mheap->arena_start) runtime·throw("markspan: bad pointer"); p = v; - off = p - (uintptr*)runtime·mheap.arena_start; // word offset + off = p - (uintptr*)runtime·mheap->arena_start; // word offset if(off % wordsPerBitmapWord != 0) runtime·throw("markspan: unaligned pointer"); - b = (uintptr*)runtime·mheap.arena_start - off/wordsPerBitmapWord - 1; + b = (uintptr*)runtime·mheap->arena_start - off/wordsPerBitmapWord - 1; n /= PtrSize; if(n%wordsPerBitmapWord != 0) runtime·throw("unmarkspan: unaligned length"); @@ -1225,8 +2204,8 @@ runtime·blockspecial(void *v) if(DebugMark) return true; - off = (uintptr*)v - (uintptr*)runtime·mheap.arena_start; - b = (uintptr*)runtime·mheap.arena_start - off/wordsPerBitmapWord - 1; + off = (uintptr*)v - (uintptr*)runtime·mheap->arena_start; + b = (uintptr*)runtime·mheap->arena_start - off/wordsPerBitmapWord - 1; shift = off % wordsPerBitmapWord; return (*b & (bitSpecial<<shift)) != 0; @@ -1240,8 +2219,8 @@ runtime·setblockspecial(void *v, bool s) if(DebugMark) return; - off = (uintptr*)v - (uintptr*)runtime·mheap.arena_start; - b = (uintptr*)runtime·mheap.arena_start - off/wordsPerBitmapWord - 1; + off = (uintptr*)v - (uintptr*)runtime·mheap->arena_start; + b = (uintptr*)runtime·mheap->arena_start - off/wordsPerBitmapWord - 1; shift = off % wordsPerBitmapWord; for(;;) { diff --git a/src/pkg/runtime/mgc0.go b/src/pkg/runtime/mgc0.go new file mode 100644 index 000000000..b15054662 --- /dev/null +++ b/src/pkg/runtime/mgc0.go @@ -0,0 +1,15 @@ +// Copyright 
2012 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package runtime + +// Called from C. Returns the Go type *m. +func gc_m_ptr(ret *interface{}) { + *ret = (*m)(nil) +} + +// Called from C. Returns the Go type *itab. +func gc_itab_ptr(ret *interface{}) { + *ret = (*itab)(nil) +} diff --git a/src/pkg/runtime/mgc0.h b/src/pkg/runtime/mgc0.h new file mode 100644 index 000000000..87b604a36 --- /dev/null +++ b/src/pkg/runtime/mgc0.h @@ -0,0 +1,43 @@ +// Copyright 2012 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// Garbage collector (GC) + +// GC instruction opcodes. +// +// The opcode of an instruction is followed by zero or more +// arguments to the instruction. +// +// Meaning of arguments: +// off Offset (in bytes) from the start of the current object +// objgc Pointer to GC info of an object +// objgcrel Offset to GC info of an object +// len Length of an array +// elemsize Size (in bytes) of an element +// size Size (in bytes) +enum { + GC_END, // End of object, loop or subroutine. Args: none + GC_PTR, // A typed pointer. Args: (off, objgc) + GC_APTR, // Pointer to an arbitrary object. Args: (off) + GC_ARRAY_START, // Start an array with a fixed length. Args: (off, len, elemsize) + GC_ARRAY_NEXT, // The next element of an array. Args: none + GC_CALL, // Call a subroutine. Args: (off, objgcrel) + GC_MAP_PTR, // Go map. Args: (off, MapType*) + GC_STRING, // Go string. Args: (off) + GC_EFACE, // interface{}. Args: (off) + GC_IFACE, // interface{...}. Args: (off) + GC_SLICE, // Go slice. Args: (off, objgc) + GC_REGION, // A region/part of the current object. Args: (off, size, objgc) + + GC_NUM_INSTR, // Number of instruction opcodes +}; + +enum { + // Size of GC's fixed stack. + // + // The current GC implementation permits: + // - at most 1 stack allocation because of GC_CALL + // - at most GC_STACK_CAPACITY allocations because of GC_ARRAY_START + GC_STACK_CAPACITY = 8, +}; diff --git a/src/pkg/runtime/mheap.c b/src/pkg/runtime/mheap.c index c877bfca9..f45149d63 100644 --- a/src/pkg/runtime/mheap.c +++ b/src/pkg/runtime/mheap.c @@ -27,11 +27,26 @@ RecordSpan(void *vh, byte *p) { MHeap *h; MSpan *s; + MSpan **all; + uint32 cap; h = vh; s = (MSpan*)p; - s->allnext = h->allspans; - h->allspans = s; + if(h->nspan >= h->nspancap) { + cap = 64*1024/sizeof(all[0]); + if(cap < h->nspancap*3/2) + cap = h->nspancap*3/2; + all = (MSpan**)runtime·SysAlloc(cap*sizeof(all[0])); + if(all == nil) + runtime·throw("runtime: cannot allocate memory"); + if(h->allspans) { + runtime·memmove(all, h->allspans, h->nspancap*sizeof(all[0])); + runtime·SysFree(h->allspans, h->nspancap*sizeof(all[0])); + } + h->allspans = all; + h->nspancap = cap; + } + h->allspans[h->nspan++] = s; } // Initialize the heap; fetch memory using alloc. @@ -53,12 +68,12 @@ runtime·MHeap_Init(MHeap *h, void *(*alloc)(uintptr)) // Allocate a new span of npage pages from the heap // and record its size class in the HeapMap and HeapMapCache. 
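[RecordSpan above replaces the old allnext linked list with a flat allspans array precisely so that sweepspan and dumpspan can be driven by a parfor over span indices. Its growth policy, modeled standalone (malloc/free stand in for SysAlloc/SysFree):

    #include <stdint.h>
    #include <stdlib.h>
    #include <string.h>

    typedef struct MSpan MSpan;

    static MSpan  **allspans;
    static uint32_t nspan, nspancap;

    static void
    record_span(MSpan *s)
    {
        if (nspan >= nspancap) {
            /* 64 KB floor, then grow by 3/2 */
            uint32_t cap = 64 * 1024 / sizeof(allspans[0]);
            if (cap < nspancap * 3 / 2)
                cap = nspancap * 3 / 2;
            MSpan **all = malloc(cap * sizeof(all[0]));
            if (all == NULL)
                abort();                     /* the runtime throws */
            if (allspans != NULL) {
                memcpy(all, allspans, nspancap * sizeof(all[0]));
                free(allspans);              /* SysFree in the runtime */
            }
            allspans = all;
            nspancap = cap;
        }
        allspans[nspan++] = s;
    }
]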
MSpan* -runtime·MHeap_Alloc(MHeap *h, uintptr npage, int32 sizeclass, int32 acct) +runtime·MHeap_Alloc(MHeap *h, uintptr npage, int32 sizeclass, int32 acct, int32 zeroed) { MSpan *s; runtime·lock(h); - runtime·purgecachedstats(m); + runtime·purgecachedstats(m->mcache); s = MHeap_AllocLocked(h, npage, sizeclass); if(s != nil) { mstats.heap_inuse += npage<<PageShift; @@ -68,6 +83,8 @@ runtime·MHeap_Alloc(MHeap *h, uintptr npage, int32 sizeclass, int32 acct) } } runtime·unlock(h); + if(s != nil && *(uintptr*)(s->start<<PageShift) != 0 && zeroed) + runtime·memclr((byte*)(s->start<<PageShift), s->npages<<PageShift); return s; } @@ -123,14 +140,15 @@ HaveSpan: *(uintptr*)(t->start<<PageShift) = *(uintptr*)(s->start<<PageShift); // copy "needs zeroing" mark t->state = MSpanInUse; MHeap_FreeLocked(h, t); + t->unusedsince = s->unusedsince; // preserve age } - - if(*(uintptr*)(s->start<<PageShift) != 0) - runtime·memclr((byte*)(s->start<<PageShift), s->npages<<PageShift); + s->unusedsince = 0; // Record span info, because gc needs to be // able to map interior pointer to containing span. s->sizeclass = sizeclass; + s->elemsize = (sizeclass==0 ? s->npages<<PageShift : runtime·class_to_size[sizeclass]); + s->types.compression = MTypes_Empty; p = s->start; if(sizeof(void*) == 8) p -= ((uintptr)h->arena_start>>PageShift); @@ -259,7 +277,7 @@ void runtime·MHeap_Free(MHeap *h, MSpan *s, int32 acct) { runtime·lock(h); - runtime·purgecachedstats(m); + runtime·purgecachedstats(m->mcache); mstats.heap_inuse -= s->npages<<PageShift; if(acct) { mstats.heap_alloc -= s->npages<<PageShift; @@ -276,16 +294,22 @@ MHeap_FreeLocked(MHeap *h, MSpan *s) MSpan *t; PageID p; + if(s->types.sysalloc) + runtime·settype_sysfree(s); + s->types.compression = MTypes_Empty; + if(s->state != MSpanInUse || s->ref != 0) { runtime·printf("MHeap_FreeLocked - span %p ptr %p state %d ref %d\n", s, s->start<<PageShift, s->state, s->ref); runtime·throw("MHeap_FreeLocked - invalid free"); } mstats.heap_idle += s->npages<<PageShift; s->state = MSpanFree; - s->unusedsince = 0; - s->npreleased = 0; runtime·MSpanList_Remove(s); sp = (uintptr*)(s->start<<PageShift); + // Stamp newly unused spans. The scavenger will use that + // info to potentially give back some pages to the OS. + s->unusedsince = runtime·nanotime(); + s->npreleased = 0; // Coalesce with earlier, later spans. p = s->start; @@ -325,6 +349,52 @@ MHeap_FreeLocked(MHeap *h, MSpan *s) runtime·MSpanList_Insert(&h->large, s); } +static void +forcegchelper(Note *note) +{ + runtime·gc(1); + runtime·notewakeup(note); +} + +static uintptr +scavengelist(MSpan *list, uint64 now, uint64 limit) +{ + uintptr released, sumreleased; + MSpan *s; + + if(runtime·MSpanList_IsEmpty(list)) + return 0; + + sumreleased = 0; + for(s=list->next; s != list; s=s->next) { + if((now - s->unusedsince) > limit) { + released = (s->npages - s->npreleased) << PageShift; + mstats.heap_released += released; + sumreleased += released; + s->npreleased = s->npages; + runtime·SysUnused((void*)(s->start << PageShift), s->npages << PageShift); + } + } + return sumreleased; +} + +static uintptr +scavenge(uint64 now, uint64 limit) +{ + uint32 i; + uintptr sumreleased; + MHeap *h; + + h = runtime·mheap; + sumreleased = 0; + for(i=0; i < nelem(h->free); i++) + sumreleased += scavengelist(&h->free[i], now, limit); + sumreleased += scavengelist(&h->large, now, limit); + return sumreleased; +} + +static FuncVal forcegchelperv = {(void(*)(void))forcegchelper}; + // Release (part of) unused memory to OS. 
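[scavengelist above is the heart of the scavenger: MHeap_FreeLocked now stamps every span with nanotime() the moment it becomes idle, and spans idle longer than the limit hand their not-yet-released pages back via SysUnused, with npreleased preventing double counting. A model using a plain NULL-terminated list in place of the runtime's circular MSpan lists:

    #include <stddef.h>
    #include <stdint.h>

    enum { PageShift = 12 };             /* 4 KB pages, as in the runtime */

    typedef struct Span Span;
    struct Span {
        Span     *next;
        uint64_t  unusedsince;           /* nanotime() stamp from free    */
        uintptr_t npages, npreleased;    /* total vs. already released    */
    };

    extern void sys_unused(void *base, size_t len);  /* models runtime·SysUnused */

    static uintptr_t
    scavenge_list(Span *list, uint64_t now, uint64_t limit)
    {
        uintptr_t sum = 0;

        for (Span *s = list; s != NULL; s = s->next) {
            if (now - s->unusedsince > limit) {
                /* only pages not handed back on an earlier pass count */
                sum += (s->npages - s->npreleased) << PageShift;
                s->npreleased = s->npages;
                /* sys_unused(span_base(s), s->npages << PageShift);
                   span_base is hypothetical; the runtime computes it
                   from s->start << PageShift */
            }
        }
        return sum;
    }

Note that the old loop's s->unusedsince != 0 guard disappears in the hunk above: since free now always stamps the span, a zero timestamp can no longer occur.]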
// Goroutine created at startup. // Loop forever.
@@ -332,13 +402,12 @@ void runtime·MHeap_Scavenger(void) { MHeap *h; - MSpan *s, *list; uint64 tick, now, forcegc, limit; - uint32 k, i; - uintptr released, sumreleased; + uint32 k; + uintptr sumreleased; byte *env; bool trace; - Note note; + Note note, *notep; // If we go two minutes without a garbage collection, force one to run. forcegc = 2*60*1e9;
@@ -356,10 +425,10 @@ runtime·MHeap_Scavenger(void) if(env != nil) trace = runtime·atoi(env) > 0; - h = &runtime·mheap; + h = runtime·mheap; for(k=0;; k++) { runtime·noteclear(&note); - runtime·entersyscall(); + runtime·entersyscallblock(); runtime·notetsleep(&note, tick); runtime·exitsyscall();
@@ -367,30 +436,21 @@ now = runtime·nanotime(); if(now - mstats.last_gc > forcegc) { runtime·unlock(h); - runtime·gc(1); + // The scavenger can not block other goroutines, + // otherwise deadlock detector can fire spuriously. + // GC blocks other goroutines via the runtime·worldsema. + runtime·noteclear(&note); + notep = &note; + runtime·newproc1(&forcegchelperv, (byte*)&notep, sizeof(notep), 0, runtime·MHeap_Scavenger); + runtime·entersyscallblock(); + runtime·notesleep(&note); + runtime·exitsyscall(); + if(trace) + runtime·printf("scvg%d: GC forced\n", k); runtime·lock(h); now = runtime·nanotime(); - if (trace) - runtime·printf("scvg%d: GC forced\n", k); - } - sumreleased = 0; - for(i=0; i < nelem(h->free)+1; i++) { - if(i < nelem(h->free)) - list = &h->free[i]; - else - list = &h->large; - if(runtime·MSpanList_IsEmpty(list)) - continue; - for(s=list->next; s != list; s=s->next) { - if(s->unusedsince != 0 && (now - s->unusedsince) > limit) { - released = (s->npages - s->npreleased) << PageShift; - mstats.heap_released += released; - sumreleased += released; - s->npreleased = s->npages; - runtime·SysUnused((void*)(s->start << PageShift), s->npages << PageShift); - } - } } + sumreleased = scavenge(now, limit); runtime·unlock(h); if(trace) {
@@ -403,6 +463,15 @@ } } +void +runtime∕debug·freeOSMemory(void) +{ + runtime·gc(1); + runtime·lock(runtime·mheap); + scavenge(~(uintptr)0, 0); + runtime·unlock(runtime·mheap); +} + // Initialize a new span with the given start and npages. void runtime·MSpan_Init(MSpan *span, PageID start, uintptr npages)
@@ -414,9 +483,11 @@ runtime·MSpan_Init(MSpan *span, PageID start, uintptr npages) span->freelist = nil; span->ref = 0; span->sizeclass = 0; + span->elemsize = 0; span->state = 0; span->unusedsince = 0; span->npreleased = 0; + span->types.compression = MTypes_Empty; } // Initialize an empty doubly-linked list.
diff --git a/src/pkg/runtime/mprof.goc b/src/pkg/runtime/mprof.goc index 0bbce8583..ebc1e3e66 100644 --- a/src/pkg/runtime/mprof.goc +++ b/src/pkg/runtime/mprof.goc
@@ -13,23 +13,73 @@ package runtime #include "type.h" // NOTE(rsc): Everything here could use cas if contention became an issue. -static Lock proflock; +static Lock proflock, alloclock; -// Per-call-stack allocation information. +// All memory allocations are local and do not escape outside of the profiler. +// The profiler is forbidden from referring to garbage-collected memory. + +static byte *pool; // memory allocation pool +static uintptr poolfree; // number of bytes left in the pool +enum { + Chunk = 32*PageSize, // initial size of the pool +}; + +// Memory allocation local to this file. +// There is no way to return the allocated memory back to the OS.
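[The forced-GC dance in the scavenger hunk above exists because the scavenger parks in entersyscallblock while idle and must not run the collection inline, or the deadlock detector can fire spuriously; instead it spawns forcegchelper with newproc1 and sleeps on a Note until the helper signals completion. A rough pthread model of that handoff, with gc_collect as a placeholder for runtime·gc(1):

    #include <pthread.h>

    typedef struct {
        pthread_mutex_t mu;
        pthread_cond_t  cv;
        int             done;
    } Note;                              /* rough model of a runtime Note */

    extern void gc_collect(void);        /* placeholder for runtime·gc(1) */

    static void *
    forcegc_helper(void *arg)            /* plays the role of forcegchelper */
    {
        Note *n = arg;

        gc_collect();
        pthread_mutex_lock(&n->mu);      /* runtime·notewakeup */
        n->done = 1;
        pthread_cond_signal(&n->cv);
        pthread_mutex_unlock(&n->mu);
        return NULL;
    }

    static void
    force_gc(void)                       /* what the scavenger loop does */
    {
        Note n = { PTHREAD_MUTEX_INITIALIZER, PTHREAD_COND_INITIALIZER, 0 };
        pthread_t t;

        pthread_create(&t, NULL, forcegc_helper, &n);  /* runtime·newproc1 */
        pthread_mutex_lock(&n.mu);                     /* runtime·notesleep */
        while (!n.done)
            pthread_cond_wait(&n.cv, &n.mu);
        pthread_mutex_unlock(&n.mu);
        pthread_join(t, NULL);
    }
]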
+static void* +allocate(uintptr size) +{ + void *v; + + if(size == 0) + return nil; + + if(size >= Chunk/2) + return runtime·SysAlloc(size); + + runtime·lock(&alloclock); + if(size > poolfree) { + pool = runtime·SysAlloc(Chunk); + if(pool == nil) + runtime·throw("runtime: cannot allocate memory"); + poolfree = Chunk; + } + v = pool; + pool += size; + poolfree -= size; + runtime·unlock(&alloclock); + return v; +} + +enum { MProf, BProf }; // profile types + +// Per-call-stack profiling information. // Lookup by hashing call stack into a linked-list hash table. typedef struct Bucket Bucket; struct Bucket { Bucket *next; // next in hash list - Bucket *allnext; // next in list of all buckets - uintptr allocs; - uintptr frees; - uintptr alloc_bytes; - uintptr free_bytes; - uintptr recent_allocs; // since last gc - uintptr recent_frees; - uintptr recent_alloc_bytes; - uintptr recent_free_bytes; + Bucket *allnext; // next in list of all mbuckets/bbuckets + int32 typ; + union + { + struct // typ == MProf + { + uintptr allocs; + uintptr frees; + uintptr alloc_bytes; + uintptr free_bytes; + uintptr recent_allocs; // since last gc + uintptr recent_frees; + uintptr recent_alloc_bytes; + uintptr recent_free_bytes; + }; + struct // typ == BProf + { + int64 count; + int64 cycles; + }; + }; uintptr hash; uintptr nstk; uintptr stk[1]; @@ -38,12 +88,13 @@ enum { BuckHashSize = 179999, }; static Bucket **buckhash; -static Bucket *buckets; +static Bucket *mbuckets; // memory profile buckets +static Bucket *bbuckets; // blocking profile buckets static uintptr bucketmem; // Return the bucket for stk[0:nstk], allocating new bucket if needed. static Bucket* -stkbucket(uintptr *stk, int32 nstk, bool alloc) +stkbucket(int32 typ, uintptr *stk, int32 nstk, bool alloc) { int32 i; uintptr h; @@ -51,6 +102,8 @@ stkbucket(uintptr *stk, int32 nstk, bool alloc) if(buckhash == nil) { buckhash = runtime·SysAlloc(BuckHashSize*sizeof buckhash[0]); + if(buckhash == nil) + runtime·throw("runtime: cannot allocate memory"); mstats.buckhash_sys += BuckHashSize*sizeof buckhash[0]; } @@ -66,33 +119,39 @@ stkbucket(uintptr *stk, int32 nstk, bool alloc) i = h%BuckHashSize; for(b = buckhash[i]; b; b=b->next) - if(b->hash == h && b->nstk == nstk && + if(b->typ == typ && b->hash == h && b->nstk == nstk && runtime·mcmp((byte*)b->stk, (byte*)stk, nstk*sizeof stk[0]) == 0) return b; if(!alloc) return nil; - b = runtime·mallocgc(sizeof *b + nstk*sizeof stk[0], FlagNoProfiling, 0, 1); + b = allocate(sizeof *b + nstk*sizeof stk[0]); + if(b == nil) + runtime·throw("runtime: cannot allocate memory"); bucketmem += sizeof *b + nstk*sizeof stk[0]; runtime·memmove(b->stk, stk, nstk*sizeof stk[0]); + b->typ = typ; b->hash = h; b->nstk = nstk; b->next = buckhash[i]; buckhash[i] = b; - b->allnext = buckets; - buckets = b; + if(typ == MProf) { + b->allnext = mbuckets; + mbuckets = b; + } else { + b->allnext = bbuckets; + bbuckets = b; + } return b; } -// Record that a gc just happened: all the 'recent' statistics are now real. -void -runtime·MProf_GC(void) +static void +MProf_GC(void) { Bucket *b; - - runtime·lock(&proflock); - for(b=buckets; b; b=b->allnext) { + + for(b=mbuckets; b; b=b->allnext) { b->allocs += b->recent_allocs; b->frees += b->recent_frees; b->alloc_bytes += b->recent_alloc_bytes; @@ -102,25 +161,39 @@ runtime·MProf_GC(void) b->recent_alloc_bytes = 0; b->recent_free_bytes = 0; } +} + +// Record that a gc just happened: all the 'recent' statistics are now real. 
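[allocate() above is the profiler's private bump allocator: buckets and address-map nodes must never live in garbage-collected memory, so small requests are carved out of 32-page chunks and nothing is ever freed. The same logic standalone, with the alloclock omitted and malloc standing in for SysAlloc:

    #include <stdint.h>
    #include <stdlib.h>

    enum { CHUNK = 32 * 4096 };          /* Chunk = 32*PageSize above */

    static unsigned char *pool;
    static size_t         poolfree;

    static void *
    prof_alloc(size_t size)
    {
        void *v;

        if (size == 0)
            return NULL;
        if (size >= CHUNK / 2)           /* large: bypass the pool */
            return malloc(size);
        if (size > poolfree) {           /* abandon the old tail, grab a chunk */
            pool = malloc(CHUNK);
            if (pool == NULL)
                abort();
            poolfree = CHUNK;
        }
        v = pool;                        /* bump */
        pool += size;
        poolfree -= size;
        return v;
    }
]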
+void +runtime·MProf_GC(void) +{ + runtime·lock(&proflock); + MProf_GC(); runtime·unlock(&proflock); } // Map from pointer to Bucket* that allocated it. // Three levels: -// Linked-list hash table for top N-20 bits. -// Array index for next 13 bits. -// Linked list for next 7 bits. +// Linked-list hash table for top N-AddrHashShift bits. +// Array index for next AddrDenseBits bits. +// Linked list for next AddrHashShift-AddrDenseBits bits. // This is more efficient than using a general map, // because of the typical clustering of the pointer keys. typedef struct AddrHash AddrHash; typedef struct AddrEntry AddrEntry; +enum { + AddrHashBits = 12, // good for 4GB of used address space + AddrHashShift = 20, // each AddrHash knows about 1MB of address space + AddrDenseBits = 8, // good for a profiling rate of 4096 bytes +}; + struct AddrHash { AddrHash *next; // next in top-level hash table linked list uintptr addr; // addr>>20 - AddrEntry *dense[1<<13]; + AddrEntry *dense[1<<AddrDenseBits]; }; struct AddrEntry @@ -130,10 +203,7 @@ struct AddrEntry Bucket *b; }; -enum { - AddrHashBits = 12 // 1MB per entry, so good for 4GB of used address space -}; -static AddrHash *addrhash[1<<AddrHashBits]; +static AddrHash **addrhash; // points to (AddrHash*)[1<<AddrHashBits] static AddrEntry *addrfree; static uintptr addrmem; @@ -155,29 +225,29 @@ setaddrbucket(uintptr addr, Bucket *b) AddrHash *ah; AddrEntry *e; - h = (uint32)((addr>>20)*HashMultiplier) >> (32-AddrHashBits); + h = (uint32)((addr>>AddrHashShift)*HashMultiplier) >> (32-AddrHashBits); for(ah=addrhash[h]; ah; ah=ah->next) - if(ah->addr == (addr>>20)) + if(ah->addr == (addr>>AddrHashShift)) goto found; - ah = runtime·mallocgc(sizeof *ah, FlagNoProfiling, 0, 1); + ah = allocate(sizeof *ah); addrmem += sizeof *ah; ah->next = addrhash[h]; - ah->addr = addr>>20; + ah->addr = addr>>AddrHashShift; addrhash[h] = ah; found: if((e = addrfree) == nil) { - e = runtime·mallocgc(64*sizeof *e, FlagNoProfiling, 0, 0); + e = allocate(64*sizeof *e); addrmem += 64*sizeof *e; for(i=0; i+1<64; i++) e[i].next = &e[i+1]; e[63].next = nil; } addrfree = e->next; - e->addr = (uint32)~(addr & ((1<<20)-1)); + e->addr = (uint32)~(addr & ((1<<AddrHashShift)-1)); e->b = b; - h = (addr>>7)&(nelem(ah->dense)-1); // entry in dense is top 13 bits of low 20. + h = (addr>>(AddrHashShift-AddrDenseBits))&(nelem(ah->dense)-1); // entry in dense is top 8 bits of low 20. e->next = ah->dense[h]; ah->dense[h] = e; } @@ -191,16 +261,16 @@ getaddrbucket(uintptr addr) AddrEntry *e, **l; Bucket *b; - h = (uint32)((addr>>20)*HashMultiplier) >> (32-AddrHashBits); + h = (uint32)((addr>>AddrHashShift)*HashMultiplier) >> (32-AddrHashBits); for(ah=addrhash[h]; ah; ah=ah->next) - if(ah->addr == (addr>>20)) + if(ah->addr == (addr>>AddrHashShift)) goto found; return nil; found: - h = (addr>>7)&(nelem(ah->dense)-1); // entry in dense is top 13 bits of low 20. + h = (addr>>(AddrHashShift-AddrDenseBits))&(nelem(ah->dense)-1); // entry in dense is top 8 bits of low 20. 
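[setaddrbucket/getaddrbucket above implement the pointer-to-bucket map as three levels keyed by slices of the address; the named constants (AddrHashShift=20, AddrDenseBits=8) generalize the old hard-coded 20/13/7 split. How one address decomposes, as a sketch; HashMult is an illustrative multiplier, the runtime uses its own HashMultiplier constant, which this hunk does not show:

    #include <stdint.h>

    enum {
        AddrHashBits  = 12,              /* 4096 top-level hash chains   */
        AddrHashShift = 20,              /* one AddrHash per 1 MB        */
        AddrDenseBits = 8,               /* 256 dense slots per AddrHash */
    };

    static const uint32_t HashMult = 2654435769u;  /* illustrative only */

    static void
    addr_split(uintptr_t addr, uint32_t *top_hash, uint32_t *dense, uint32_t *tag)
    {
        /* which top-level chain: hash of the 1 MB-granular prefix */
        *top_hash = (uint32_t)((addr >> AddrHashShift) * HashMult)
                    >> (32 - AddrHashBits);
        /* which dense slot: top AddrDenseBits bits of the low 20 */
        *dense = (uint32_t)(addr >> (AddrHashShift - AddrDenseBits))
                 & ((1u << AddrDenseBits) - 1);
        /* entry tag: complemented low 20 bits, as in setaddrbucket */
        *tag = (uint32_t)~(addr & ((1u << AddrHashShift) - 1));
    }
]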
for(l=&ah->dense[h]; (e=*l) != nil; l=&e->next) { - if(e->addr == (uint32)~(addr & ((1<<20)-1))) { + if(e->addr == (uint32)~(addr & ((1<<AddrHashShift)-1))) { *l = e->next; b = e->b; e->next = addrfree; @@ -225,7 +295,7 @@ runtime·MProf_Malloc(void *p, uintptr size) m->nomemprof++; nstk = runtime·callers(1, stk, 32); runtime·lock(&proflock); - b = stkbucket(stk, nstk, true); + b = stkbucket(MProf, stk, nstk, true); b->recent_allocs++; b->recent_alloc_bytes += size; setaddrbucket((uintptr)p, b); @@ -253,9 +323,37 @@ runtime·MProf_Free(void *p, uintptr size) m->nomemprof--; } +int64 runtime·blockprofilerate; // in CPU ticks + +void +runtime·SetBlockProfileRate(intgo rate) +{ + runtime·atomicstore64((uint64*)&runtime·blockprofilerate, rate * runtime·tickspersecond() / (1000*1000*1000)); +} + +void +runtime·blockevent(int64 cycles, int32 skip) +{ + int32 nstk; + int64 rate; + uintptr stk[32]; + Bucket *b; -// Go interface to profile data. (Declared in extern.go) -// Assumes Go sizeof(int) == sizeof(int32) + if(cycles <= 0) + return; + rate = runtime·atomicload64((uint64*)&runtime·blockprofilerate); + if(rate <= 0 || (rate > cycles && runtime·fastrand1()%rate > cycles)) + return; + + nstk = runtime·callers(skip, stk, 32); + runtime·lock(&proflock); + b = stkbucket(BProf, stk, nstk, true); + b->count++; + b->cycles += cycles; + runtime·unlock(&proflock); +} + +// Go interface to profile data. (Declared in debug.go) // Must match MemProfileRecord in debug.go. typedef struct Record Record; @@ -281,52 +379,101 @@ record(Record *r, Bucket *b) r->stk[i] = 0; } -func MemProfile(p Slice, include_inuse_zero bool) (n int32, ok bool) { +func MemProfile(p Slice, include_inuse_zero bool) (n int, ok bool) { Bucket *b; Record *r; + bool clear; runtime·lock(&proflock); n = 0; - for(b=buckets; b; b=b->allnext) + clear = true; + for(b=mbuckets; b; b=b->allnext) { if(include_inuse_zero || b->alloc_bytes != b->free_bytes) n++; + if(b->allocs != 0 || b->frees != 0) + clear = false; + } + if(clear) { + // Absolutely no data, suggesting that a garbage collection + // has not yet happened. In order to allow profiling when + // garbage collection is disabled from the beginning of execution, + // accumulate stats as if a GC just happened, and recount buckets. + MProf_GC(); + n = 0; + for(b=mbuckets; b; b=b->allnext) + if(include_inuse_zero || b->alloc_bytes != b->free_bytes) + n++; + } ok = false; if(n <= p.len) { ok = true; r = (Record*)p.array; - for(b=buckets; b; b=b->allnext) + for(b=mbuckets; b; b=b->allnext) if(include_inuse_zero || b->alloc_bytes != b->free_bytes) record(r++, b); } runtime·unlock(&proflock); } +// Must match BlockProfileRecord in debug.go. +typedef struct BRecord BRecord; +struct BRecord { + int64 count; + int64 cycles; + uintptr stk[32]; +}; + +func BlockProfile(p Slice) (n int, ok bool) { + Bucket *b; + BRecord *r; + int32 i; + + runtime·lock(&proflock); + n = 0; + for(b=bbuckets; b; b=b->allnext) + n++; + ok = false; + if(n <= p.len) { + ok = true; + r = (BRecord*)p.array; + for(b=bbuckets; b; b=b->allnext, r++) { + r->count = b->count; + r->cycles = b->cycles; + for(i=0; i<b->nstk && i<nelem(r->stk); i++) + r->stk[i] = b->stk[i]; + for(; i<nelem(r->stk); i++) + r->stk[i] = 0; + } + } + runtime·unlock(&proflock); +} + // Must match StackRecord in debug.go. 
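[runtime·blockevent above samples rather than records every blocking event: the rate is stored in CPU ticks, long blocks always hit, and a block of `cycles` ticks is kept with probability cycles/rate, which keeps the recorded totals unbiased in expectation. The decision logic standalone; rand() stands in for runtime·fastrand1():

    #include <stdint.h>
    #include <stdlib.h>

    static int64_t blockprofilerate;     /* in CPU ticks */

    /* SetBlockProfileRate: the user-visible rate is in nanoseconds and
     * is converted to ticks once, so blockevent compares raw cycles. */
    static void
    set_block_profile_rate(int64_t rate_ns, int64_t ticks_per_second)
    {
        blockprofilerate = rate_ns * ticks_per_second / (1000 * 1000 * 1000);
    }

    static int
    should_sample(int64_t cycles)
    {
        int64_t rate = blockprofilerate;

        if (cycles <= 0 || rate <= 0)
            return 0;
        if (rate > cycles && rand() % rate > cycles)
            return 0;                    /* short block, lost the coin toss */
        return 1;                        /* record count and cycles */
    }
]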
typedef struct TRecord TRecord; struct TRecord { uintptr stk[32]; }; -func ThreadCreateProfile(p Slice) (n int32, ok bool) { +func ThreadCreateProfile(p Slice) (n int, ok bool) { TRecord *r; - M *first, *m; + M *first, *mp; first = runtime·atomicloadp(&runtime·allm); n = 0; - for(m=first; m; m=m->alllink) + for(mp=first; mp; mp=mp->alllink) n++; ok = false; if(n <= p.len) { ok = true; r = (TRecord*)p.array; - for(m=first; m; m=m->alllink) { - runtime·memmove(r->stk, m->createstack, sizeof r->stk); + for(mp=first; mp; mp=mp->alllink) { + runtime·memmove(r->stk, mp->createstack, sizeof r->stk); r++; } } } -func Stack(b Slice, all bool) (n int32) { +func Stack(b Slice, all bool) (n int) { byte *pc, *sp; sp = runtime·getcallersp(&b); @@ -355,21 +502,21 @@ func Stack(b Slice, all bool) (n int32) { if(all) { m->gcing = 0; runtime·semrelease(&runtime·worldsema); - runtime·starttheworld(false); + runtime·starttheworld(); } } static void -saveg(byte *pc, byte *sp, G *g, TRecord *r) +saveg(byte *pc, byte *sp, G *gp, TRecord *r) { int32 n; - n = runtime·gentraceback(pc, sp, 0, g, 0, r->stk, nelem(r->stk)); + n = runtime·gentraceback(pc, sp, 0, gp, 0, r->stk, nelem(r->stk)); if(n < nelem(r->stk)) r->stk[n] = 0; } -func GoroutineProfile(b Slice) (n int32, ok bool) { +func GoroutineProfile(b Slice) (n int, ok bool) { byte *pc, *sp; TRecord *r; G *gp; @@ -392,13 +539,18 @@ func GoroutineProfile(b Slice) (n int32, ok bool) { for(gp = runtime·allg; gp != nil; gp = gp->alllink) { if(gp == g || gp->status == Gdead) continue; - saveg(gp->sched.pc, gp->sched.sp, gp, r++); + saveg(gp->sched.pc, (byte*)gp->sched.sp, gp, r++); } } m->gcing = 0; runtime·semrelease(&runtime·worldsema); - runtime·starttheworld(false); + runtime·starttheworld(); } } +void +runtime·mprofinit(void) +{ + addrhash = allocate((1<<AddrHashBits)*sizeof *addrhash); +} diff --git a/src/pkg/runtime/os_darwin.h b/src/pkg/runtime/os_darwin.h index eb5d2daa3..5fcb717cb 100644 --- a/src/pkg/runtime/os_darwin.h +++ b/src/pkg/runtime/os_darwin.h @@ -4,9 +4,11 @@ #define SIG_DFL ((void*)0) #define SIG_IGN ((void*)1) +#define SIGHUP 1 +#define SS_DISABLE 4 int32 runtime·bsdthread_create(void*, M*, G*, void(*)(void)); -void runtime·bsdthread_register(void); +int32 runtime·bsdthread_register(void); int32 runtime·mach_msg_trap(MachHeader*, int32, uint32, uint32, uint32, uint32, uint32); uint32 runtime·mach_reply_port(void); int32 runtime·mach_semacquire(uint32, int64); diff --git a/src/pkg/runtime/os_freebsd.h b/src/pkg/runtime/os_freebsd.h index 5e8de5434..a37ad7cd8 100644 --- a/src/pkg/runtime/os_freebsd.h +++ b/src/pkg/runtime/os_freebsd.h @@ -1,5 +1,7 @@ #define SIG_DFL ((void*)0) #define SIG_IGN ((void*)1) +#define SIGHUP 1 +#define SS_DISABLE 4 int32 runtime·thr_new(ThrParam*, int32); void runtime·sighandler(int32 sig, Siginfo *info, void *context, G *gp); @@ -15,7 +17,7 @@ int32 runtime·sysctl(uint32*, uint32, byte*, uintptr*, byte*, uintptr); void runtime·raisesigpipe(void); #define NSIG 33 -#define SI_USER 0 +#define SI_USER 0x10001 #define RLIMIT_AS 10 typedef struct Rlimit Rlimit; diff --git a/src/pkg/runtime/os_linux.h b/src/pkg/runtime/os_linux.h index 87daa3bb1..a23fe0f73 100644 --- a/src/pkg/runtime/os_linux.h +++ b/src/pkg/runtime/os_linux.h @@ -4,13 +4,15 @@ #define SIG_DFL ((void*)0) #define SIG_IGN ((void*)1) +#define SIGHUP 1 +#define SS_DISABLE 2 // Linux-specific system calls int32 runtime·futex(uint32*, int32, uint32, Timespec*, uint32*, uint32); int32 runtime·clone(int32, void*, M*, G*, void(*)(void)); struct Sigaction; -void 
runtime·rt_sigaction(uintptr, struct Sigaction*, void*, uintptr); +int32 runtime·rt_sigaction(uintptr, struct Sigaction*, void*, uintptr); void runtime·setsig(int32, void(*)(int32, Siginfo*, void*, G*), bool); void runtime·sighandler(int32 sig, Siginfo *info, void *context, G *gp); diff --git a/src/pkg/runtime/os_netbsd.h b/src/pkg/runtime/os_netbsd.h index 4ecf78d88..19d72fd25 100644 --- a/src/pkg/runtime/os_netbsd.h +++ b/src/pkg/runtime/os_netbsd.h @@ -4,18 +4,29 @@ #define SIG_DFL ((void*)0) #define SIG_IGN ((void*)1) +#define SIGHUP 1 +#define SS_DISABLE 4 + +#define SIG_BLOCK 1 +#define SIG_UNBLOCK 2 +#define SIG_SETMASK 3 struct sigaction; -void runtime·sigpanic(void); -void runtime·sigaltstack(Sigaltstack*, Sigaltstack*); -void runtime·sigaction(int32, struct sigaction*, struct sigaction*); +void runtime·raisesigpipe(void); void runtime·setsig(int32, void(*)(int32, Siginfo*, void*, G*), bool); void runtime·sighandler(int32 sig, Siginfo *info, void *context, G *gp); +void runtime·sigpanic(void); + void runtime·setitimer(int32, Itimerval*, Itimerval*); +void runtime·sigaction(int32, struct sigaction*, struct sigaction*); +void runtime·sigaltstack(Sigaltstack*, Sigaltstack*); +void runtime·sigprocmask(int32, Sigset*, Sigset*); int32 runtime·sysctl(uint32*, uint32, byte*, uintptr*, byte*, uintptr); -void runtime·raisesigpipe(void); - #define NSIG 33 #define SI_USER 0 + +// From NetBSD's <sys/ucontext.h> +#define _UC_SIGMASK 0x01 +#define _UC_CPU 0x04 diff --git a/src/pkg/runtime/os_openbsd.h b/src/pkg/runtime/os_openbsd.h index 4ecf78d88..a599aad05 100644 --- a/src/pkg/runtime/os_openbsd.h +++ b/src/pkg/runtime/os_openbsd.h @@ -4,18 +4,25 @@ #define SIG_DFL ((void*)0) #define SIG_IGN ((void*)1) +#define SIGHUP 1 +#define SS_DISABLE 4 + +#define SIG_BLOCK 1 +#define SIG_UNBLOCK 2 +#define SIG_SETMASK 3 struct sigaction; +void runtime·raisesigpipe(void); +void runtime·setsig(int32, void(*)(int32, Siginfo*, void*, G*), bool); void runtime·sigpanic(void); -void runtime·sigaltstack(Sigaltstack*, Sigaltstack*); + +void runtime·setitimer(int32, Itimerval*, Itimerval*); void runtime·sigaction(int32, struct sigaction*, struct sigaction*); -void runtime·setsig(int32, void(*)(int32, Siginfo*, void*, G*), bool); +void runtime·sigaltstack(Sigaltstack*, Sigaltstack*); void runtime·sighandler(int32 sig, Siginfo *info, void *context, G *gp); -void runtime·setitimer(int32, Itimerval*, Itimerval*); +Sigset runtime·sigprocmask(int32, Sigset); int32 runtime·sysctl(uint32*, uint32, byte*, uintptr*, byte*, uintptr); -void runtime·raisesigpipe(void); - #define NSIG 33 #define SI_USER 0 diff --git a/src/pkg/runtime/os_plan9.h b/src/pkg/runtime/os_plan9.h index cc6343c8e..c2cdf5b44 100644 --- a/src/pkg/runtime/os_plan9.h +++ b/src/pkg/runtime/os_plan9.h @@ -7,13 +7,22 @@ int32 runtime·open(uint8 *file, int32 mode); int32 runtime·pread(int32 fd, void *buf, int32 nbytes, int64 offset); int32 runtime·pwrite(int32 fd, void *buf, int32 nbytes, int64 offset); int32 runtime·read(int32 fd, void *buf, int32 nbytes); +int64 runtime·seek(int32 fd, int64 offset, int32 whence); int32 runtime·close(int32 fd); void runtime·exits(int8* msg); -int32 runtime·brk_(void*); +intptr runtime·brk_(void*); int32 runtime·sleep(int32 ms); -int32 runtime·rfork(int32 flags, void *stk, M *m, G *g, void (*fn)(void)); +int32 runtime·rfork(int32 flags, void *stk, M *mp, G *gp, void (*fn)(void)); int32 runtime·plan9_semacquire(uint32 *addr, int32 block); +int32 runtime·plan9_tsemacquire(uint32 *addr, int32 ms); int32 
runtime·plan9_semrelease(uint32 *addr, int32 count); +int32 runtime·notify(void (*fn)(void*, int8*)); +int32 runtime·noted(int32); +void runtime·sigtramp(void*, int8*); +int32 runtime·sighandler(void*, int8*, G*); +void runtime·sigpanic(void); +void runtime·goexitsall(int8*); +void runtime·setfpmasks(void); /* open */ enum @@ -45,6 +54,13 @@ enum RFNOMNT = (1<<14) }; +/* notify */ +enum +{ + NCONT = 0, + NDFLT = 1 +}; + typedef struct Tos Tos; typedef intptr Plink; @@ -66,4 +82,5 @@ struct Tos { /* top of stack is here */ }; -#define NSIG 1 +#define NSIG 5 /* number of signals in runtime·SigTab array */ +#define ERRMAX 128 /* max length of note string */ diff --git a/src/pkg/runtime/os_windows.h b/src/pkg/runtime/os_windows.h index 9d387b7ad..cf0ecb68e 100644 --- a/src/pkg/runtime/os_windows.h +++ b/src/pkg/runtime/os_windows.h @@ -28,5 +28,8 @@ uint32 runtime·ctrlhandler(uint32 type); byte *runtime·compilecallback(Eface fn, bool cleanstack); void *runtime·callbackasm(void); +void runtime·install_exception_handler(void); +void runtime·remove_exception_handler(void); + // TODO(brainman): should not need those #define NSIG 65 diff --git a/src/pkg/runtime/panic.c b/src/pkg/runtime/panic.c new file mode 100644 index 000000000..2f553f417 --- /dev/null +++ b/src/pkg/runtime/panic.c @@ -0,0 +1,487 @@ +// Copyright 2012 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#include "runtime.h" +#include "arch_GOARCH.h" +#include "stack.h" + +// Code related to defer, panic and recover. + +uint32 runtime·panicking; +static Lock paniclk; + +enum +{ + DeferChunkSize = 2048 +}; + +// Allocate a Defer, usually as part of the larger frame of deferred functions. +// Each defer must be released with both popdefer and freedefer. +static Defer* +newdefer(int32 siz) +{ + int32 total; + DeferChunk *c; + Defer *d; + + c = g->dchunk; + total = sizeof(*d) + ROUND(siz, sizeof(uintptr)) - sizeof(d->args); + if(c == nil || total > DeferChunkSize - c->off) { + if(total > DeferChunkSize / 2) { + // Not worth putting in any chunk. + // Allocate a separate block. + d = runtime·malloc(total); + d->siz = siz; + d->special = 1; + d->free = 1; + d->link = g->defer; + g->defer = d; + return d; + } + + // Cannot fit in current chunk. + // Switch to next chunk, allocating if necessary. + c = g->dchunknext; + if(c == nil) + c = runtime·malloc(DeferChunkSize); + c->prev = g->dchunk; + c->off = sizeof(*c); + g->dchunk = c; + g->dchunknext = nil; + } + + d = (Defer*)((byte*)c + c->off); + c->off += total; + d->siz = siz; + d->special = 0; + d->free = 0; + d->link = g->defer; + g->defer = d; + return d; +} + +// Pop the current defer from the defer stack. +// Its contents are still valid until the goroutine begins executing again. +// In particular it is safe to call reflect.call(d->fn, d->argp, d->siz) after +// popdefer returns. +static void +popdefer(void) +{ + Defer *d; + DeferChunk *c; + int32 total; + + d = g->defer; + if(d == nil) + runtime·throw("runtime: popdefer nil"); + g->defer = d->link; + if(d->special) { + // Nothing else to do. + return; + } + total = sizeof(*d) + ROUND(d->siz, sizeof(uintptr)) - sizeof(d->args); + c = g->dchunk; + if(c == nil || (byte*)d+total != (byte*)c+c->off) + runtime·throw("runtime: popdefer phase error"); + c->off -= total; + if(c->off == sizeof(*c)) { + // Chunk now empty, so pop from stack. 
+ // Save in dchunknext both to help with pingponging between frames + // and to make sure d is still valid on return. + if(g->dchunknext != nil) + runtime·free(g->dchunknext); + g->dchunknext = c; + g->dchunk = c->prev; + } +} + +// Free the given defer. +// For defers in the per-goroutine chunk this just clears the saved arguments. +// For large defers allocated on the heap, this frees them. +// The defer cannot be used after this call. +static void +freedefer(Defer *d) +{ + if(d->special) { + if(d->free) + runtime·free(d); + } else { + runtime·memclr((byte*)d->args, d->siz); + } +} + +// Create a new deferred function fn with siz bytes of arguments. +// The compiler turns a defer statement into a call to this. +// Cannot split the stack because it assumes that the arguments +// are available sequentially after &fn; they would not be +// copied if a stack split occurred. It's OK for this to call +// functions that split the stack. +#pragma textflag 7 +uintptr +runtime·deferproc(int32 siz, FuncVal *fn, ...) +{ + Defer *d; + + d = newdefer(siz); + d->fn = fn; + d->pc = runtime·getcallerpc(&siz); + if(thechar == '5') + d->argp = (byte*)(&fn+2); // skip caller's saved link register + else + d->argp = (byte*)(&fn+1); + runtime·memmove(d->args, d->argp, d->siz); + + // deferproc returns 0 normally. + // a deferred func that stops a panic + // makes the deferproc return 1. + // the code the compiler generates always + // checks the return value and jumps to the + // end of the function if deferproc returns != 0. + return 0; +} + +// Run a deferred function if there is one. +// The compiler inserts a call to this at the end of any +// function which calls defer. +// If there is a deferred function, this will call runtime·jmpdefer, +// which will jump to the deferred function such that it appears +// to have been called by the caller of deferreturn at the point +// just before deferreturn was called. The effect is that deferreturn +// is called again and again until there are no more deferred functions. +// Cannot split the stack because we reuse the caller's frame to +// call the deferred function. +#pragma textflag 7 +void +runtime·deferreturn(uintptr arg0) +{ + Defer *d; + byte *argp; + FuncVal *fn; + + d = g->defer; + if(d == nil) + return; + argp = (byte*)&arg0; + if(d->argp != argp) + return; + runtime·memmove(argp, d->args, d->siz); + fn = d->fn; + popdefer(); + freedefer(d); + runtime·jmpdefer(fn, argp); +} + +// Run all deferred functions for the current goroutine. +static void +rundefer(void) +{ + Defer *d; + + while((d = g->defer) != nil) { + popdefer(); + reflect·call(d->fn, (byte*)d->args, d->siz); + freedefer(d); + } +} + +// Print all currently active panics. Used when crashing. +static void +printpanics(Panic *p) +{ + if(p->link) { + printpanics(p->link); + runtime·printf("\t"); + } + runtime·printf("panic: "); + runtime·printany(p->arg); + if(p->recovered) + runtime·printf(" [recovered]"); + runtime·printf("\n"); +} + +static void recovery(G*); + +// The implementation of the predeclared function panic. 
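[newdefer/popdefer above turn defer into a bump allocation: records bigger than half a chunk get their own allocation (special=1, released by freedefer), while everything else is carved from a per-goroutine 2 KB DeferChunk whose offset simply rewinds on popdefer. A standalone model of the allocation side, error handling elided and malloc standing in for runtime·malloc; the panic machinery that consumes these records follows next.

    #include <stdint.h>
    #include <stdlib.h>

    enum { DeferChunkSize = 2048 };
    #define ROUND(x, n) (((x) + (n) - 1) & ~((uintptr_t)(n) - 1))

    typedef struct Defer Defer;
    struct Defer {
        int32_t  siz;
        int8_t   special;      /* 1: separately allocated, freed by freedefer */
        Defer   *link;         /* next-older defer: LIFO */
        unsigned char args[8];
    };

    typedef struct DeferChunk DeferChunk;
    struct DeferChunk {
        DeferChunk *prev;      /* chunks form a stack too */
        uint32_t    off;       /* bump pointer within this chunk */
    };

    static DeferChunk *dchunk; /* models g->dchunk */
    static Defer      *defers; /* models g->defer  */

    static Defer *
    new_defer(int32_t siz)
    {
        uint32_t total = sizeof(Defer) - sizeof(((Defer *)0)->args)
                       + ROUND((uint32_t)siz, sizeof(uintptr_t));
        Defer *d;

        if (total > DeferChunkSize / 2) {
            d = malloc(total);                 /* too big for any chunk */
            d->special = 1;
        } else {
            if (dchunk == NULL || total > DeferChunkSize - dchunk->off) {
                DeferChunk *c = malloc(DeferChunkSize);
                c->prev = dchunk;
                c->off = sizeof(*c);
                dchunk = c;
            }
            d = (Defer *)((unsigned char *)dchunk + dchunk->off);
            dchunk->off += total;
            d->special = 0;
        }
        d->siz = siz;
        d->link = defers;                      /* newest defer runs first */
        defers = d;
        return d;
    }
]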
+void +runtime·panic(Eface e) +{ + Defer *d; + Panic *p; + void *pc, *argp; + + p = runtime·mal(sizeof *p); + p->arg = e; + p->link = g->panic; + p->stackbase = (byte*)g->stackbase; + g->panic = p; + + for(;;) { + d = g->defer; + if(d == nil) + break; + // take defer off list in case of recursive panic + popdefer(); + g->ispanic = true; // rock for newstack, where reflect.call ends up + argp = d->argp; + pc = d->pc; + reflect·call(d->fn, (byte*)d->args, d->siz); + freedefer(d); + if(p->recovered) { + g->panic = p->link; + if(g->panic == nil) // must be done with signal + g->sig = 0; + runtime·free(p); + // Pass information about recovering frame to recovery. + g->sigcode0 = (uintptr)argp; + g->sigcode1 = (uintptr)pc; + runtime·mcall(recovery); + runtime·throw("recovery failed"); // mcall should not return + } + } + + // ran out of deferred calls - old-school panic now + runtime·startpanic(); + printpanics(g->panic); + runtime·dopanic(0); +} + +// Unwind the stack after a deferred function calls recover +// after a panic. Then arrange to continue running as though +// the caller of the deferred function returned normally. +static void +recovery(G *gp) +{ + void *argp; + void *pc; + + // Info about defer passed in G struct. + argp = (void*)gp->sigcode0; + pc = (void*)gp->sigcode1; + + // Unwind to the stack frame with d's arguments in it. + runtime·unwindstack(gp, argp); + + // Make the deferproc for this d return again, + // this time returning 1. The calling function will + // jump to the standard return epilogue. + // The -2*sizeof(uintptr) makes up for the + // two extra words that are on the stack at + // each call to deferproc. + // (The pc we're returning to does pop pop + // before it tests the return value.) + // On the arm there are 2 saved LRs mixed in too. + if(thechar == '5') + gp->sched.sp = (uintptr)argp - 4*sizeof(uintptr); + else + gp->sched.sp = (uintptr)argp - 2*sizeof(uintptr); + gp->sched.pc = pc; + runtime·gogo(&gp->sched, 1); +} + +// Free stack frames until we hit the last one +// or until we find the one that contains the sp. +void +runtime·unwindstack(G *gp, byte *sp) +{ + Stktop *top; + byte *stk; + + // Must be called from a different goroutine, usually m->g0. + if(g == gp) + runtime·throw("unwindstack on self"); + + while((top = (Stktop*)gp->stackbase) != nil && top->stackbase != nil) { + stk = (byte*)gp->stackguard - StackGuard; + if(stk <= sp && sp < (byte*)gp->stackbase) + break; + gp->stackbase = (uintptr)top->stackbase; + gp->stackguard = (uintptr)top->stackguard; + if(top->free != 0) + runtime·stackfree(stk, top->free); + } + + if(sp != nil && (sp < (byte*)gp->stackguard - StackGuard || (byte*)gp->stackbase < sp)) { + runtime·printf("recover: %p not in [%p, %p]\n", sp, gp->stackguard - StackGuard, gp->stackbase); + runtime·throw("bad unwindstack"); + } +} + +// The implementation of the predeclared function recover. +// Cannot split the stack because it needs to reliably +// find the stack segment of its caller. +#pragma textflag 7 +void +runtime·recover(byte *argp, Eface ret) +{ + Stktop *top, *oldtop; + Panic *p; + + // Must be a panic going on. + if((p = g->panic) == nil || p->recovered) + goto nomatch; + + // Frame must be at the top of the stack segment, + // because each deferred call starts a new stack + // segment as a side effect of using reflect.call. + // (There has to be some way to remember the + // variable argument frame size, and the segment + // code already takes care of that for us, so we + // reuse it.) 
+ // + // As usual closures complicate things: the fp that + // the closure implementation function claims to have + // is where the explicit arguments start, after the + // implicit pointer arguments and PC slot. + // If we're on the first new segment for a closure, + // then fp == top - top->args is correct, but if + // the closure has its own big argument frame and + // allocated a second segment (see below), + // the fp is slightly above top - top->args. + // That condition can't happen normally though + // (stack pointers go down, not up), so we can accept + // any fp between top and top - top->args as + // indicating the top of the segment. + top = (Stktop*)g->stackbase; + if(argp < (byte*)top - top->argsize || (byte*)top < argp) + goto nomatch; + + // The deferred call makes a new segment big enough + // for the argument frame but not necessarily big + // enough for the function's local frame (size unknown + // at the time of the call), so the function might have + // made its own segment immediately. If that's the + // case, back top up to the older one, the one that + // reflect.call would have made for the panic. + // + // The fp comparison here checks that the argument + // frame that was copied during the split (the top->args + // bytes above top->fp) abuts the old top of stack. + // This is a correct test for both closure and non-closure code. + oldtop = (Stktop*)top->stackbase; + if(oldtop != nil && top->argp == (byte*)oldtop - top->argsize) + top = oldtop; + + // Now we have the segment that was created to + // run this call. It must have been marked as a panic segment. + if(!top->panic) + goto nomatch; + + // Okay, this is the top frame of a deferred call + // in response to a panic. It can see the panic argument. + p->recovered = 1; + ret = p->arg; + FLUSH(&ret); + return; + +nomatch: + ret.type = nil; + ret.data = nil; + FLUSH(&ret); +} + +void +runtime·startpanic(void) +{ + if(m->mcache == nil) // can happen if called from signal handler or throw + m->mcache = runtime·allocmcache(); + if(m->dying) { + runtime·printf("panic during panic\n"); + runtime·exit(3); + } + m->dying = 1; + runtime·xadd(&runtime·panicking, 1); + runtime·lock(&paniclk); +} + +void +runtime·dopanic(int32 unused) +{ + static bool didothers; + + if(g->sig != 0) + runtime·printf("[signal %x code=%p addr=%p pc=%p]\n", + g->sig, g->sigcode0, g->sigcode1, g->sigpc); + + if(runtime·gotraceback()){ + if(g != m->g0) { + runtime·printf("\n"); + runtime·goroutineheader(g); + runtime·traceback(runtime·getcallerpc(&unused), runtime·getcallersp(&unused), 0, g); + } + if(!didothers) { + didothers = true; + runtime·tracebackothers(g); + } + } + runtime·unlock(&paniclk); + if(runtime·xadd(&runtime·panicking, -1) != 0) { + // Some other m is panicking too. + // Let it print what it needs to print. + // Wait forever without chewing up cpu. + // It will exit when it's done. 
+ static Lock deadlock; + runtime·lock(&deadlock); + runtime·lock(&deadlock); + } + + runtime·exit(2); +} + +void +runtime·panicindex(void) +{ + runtime·panicstring("index out of range"); +} + +void +runtime·panicslice(void) +{ + runtime·panicstring("slice bounds out of range"); +} + +void +runtime·throwreturn(void) +{ + // can only happen if compiler is broken + runtime·throw("no return at end of a typed function - compiler is broken"); +} + +void +runtime·throwinit(void) +{ + // can only happen with linker skew + runtime·throw("recursive call during initialization - linker skew"); +} + +void +runtime·throw(int8 *s) +{ + if(m->throwing == 0) + m->throwing = 1; + runtime·startpanic(); + runtime·printf("fatal error: %s\n", s); + runtime·dopanic(0); + *(int32*)0 = 0; // not reached + runtime·exit(1); // even more not reached +} + +void +runtime·panicstring(int8 *s) +{ + Eface err; + + if(m->gcing) { + runtime·printf("panic: %s\n", s); + runtime·throw("panic during gc"); + } + runtime·newErrorString(runtime·gostringnocopy((byte*)s), &err); + runtime·panic(err); +} + +void +runtime·Goexit(void) +{ + rundefer(); + runtime·goexit(); +} diff --git a/src/pkg/runtime/parfor.c b/src/pkg/runtime/parfor.c new file mode 100644 index 000000000..aa5537d02 --- /dev/null +++ b/src/pkg/runtime/parfor.c @@ -0,0 +1,215 @@ +// Copyright 2012 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// Parallel for algorithm. + +#include "runtime.h" +#include "arch_GOARCH.h" + +struct ParForThread +{ + // the thread's iteration space [32lsb, 32msb) + uint64 pos; + // stats + uint64 nsteal; + uint64 nstealcnt; + uint64 nprocyield; + uint64 nosyield; + uint64 nsleep; + byte pad[CacheLineSize]; +}; + +ParFor* +runtime·parforalloc(uint32 nthrmax) +{ + ParFor *desc; + + // The ParFor object is followed by CacheLineSize padding + // and then nthrmax ParForThread. 
+ desc = (ParFor*)runtime·malloc(sizeof(ParFor) + CacheLineSize + nthrmax * sizeof(ParForThread)); + desc->thr = (ParForThread*)((byte*)(desc+1) + CacheLineSize); + desc->nthrmax = nthrmax; + return desc; +} + +// For testing from Go +// func parforalloc2(nthrmax uint32) *ParFor +void +runtime·parforalloc2(uint32 nthrmax, ParFor *desc) +{ + desc = runtime·parforalloc(nthrmax); + FLUSH(&desc); +} + +void +runtime·parforsetup(ParFor *desc, uint32 nthr, uint32 n, void *ctx, bool wait, void (*body)(ParFor*, uint32)) +{ + uint32 i, begin, end; + + if(desc == nil || nthr == 0 || nthr > desc->nthrmax || body == nil) { + runtime·printf("desc=%p nthr=%d count=%d body=%p\n", desc, nthr, n, body); + runtime·throw("parfor: invalid args"); + } + + desc->body = body; + desc->done = 0; + desc->nthr = nthr; + desc->thrseq = 0; + desc->cnt = n; + desc->ctx = ctx; + desc->wait = wait; + desc->nsteal = 0; + desc->nstealcnt = 0; + desc->nprocyield = 0; + desc->nosyield = 0; + desc->nsleep = 0; + for(i=0; i<nthr; i++) { + begin = (uint64)n*i / nthr; + end = (uint64)n*(i+1) / nthr; + desc->thr[i].pos = (uint64)begin | (((uint64)end)<<32); + } +} + +// For testing from Go +// func parforsetup2(desc *ParFor, nthr, n uint32, ctx *byte, wait bool, body func(*ParFor, uint32)) +void +runtime·parforsetup2(ParFor *desc, uint32 nthr, uint32 n, void *ctx, bool wait, void *body) +{ + runtime·parforsetup(desc, nthr, n, ctx, wait, *(void(**)(ParFor*, uint32))body); +} + +void +runtime·parfordo(ParFor *desc) +{ + ParForThread *me; + uint32 tid, begin, end, begin2, try, victim, i; + uint64 *mypos, *victimpos, pos, newpos; + void (*body)(ParFor*, uint32); + bool idle; + + // Obtain 0-based thread index. + tid = runtime·xadd(&desc->thrseq, 1) - 1; + if(tid >= desc->nthr) { + runtime·printf("tid=%d nthr=%d\n", tid, desc->nthr); + runtime·throw("parfor: invalid tid"); + } + + // If single-threaded, just execute the for serially. + if(desc->nthr==1) { + for(i=0; i<desc->cnt; i++) + desc->body(desc, i); + return; + } + + body = desc->body; + me = &desc->thr[tid]; + mypos = &me->pos; + for(;;) { + for(;;) { + // While there is local work, + // bump low index and execute the iteration. + pos = runtime·xadd64(mypos, 1); + begin = (uint32)pos-1; + end = (uint32)(pos>>32); + if(begin < end) { + body(desc, begin); + continue; + } + break; + } + + // Out of work, need to steal something. + idle = false; + for(try=0;; try++) { + // If we don't see any work for long enough, + // increment the done counter... + if(try > desc->nthr*4 && !idle) { + idle = true; + runtime·xadd(&desc->done, 1); + } + // ...if all threads have incremented the counter, + // we are done. + if(desc->done + !idle == desc->nthr) { + if(!idle) + runtime·xadd(&desc->done, 1); + goto exit; + } + // Choose a random victim for stealing. + victim = runtime·fastrand1() % (desc->nthr-1); + if(victim >= tid) + victim++; + victimpos = &desc->thr[victim].pos; + pos = runtime·atomicload64(victimpos); + for(;;) { + // See if it has any work. + begin = (uint32)pos; + end = (uint32)(pos>>32); + if(begin+1 >= end) { + begin = end = 0; + break; + } + if(idle) { + runtime·xadd(&desc->done, -1); + idle = false; + } + begin2 = begin + (end-begin)/2; + newpos = (uint64)begin | (uint64)begin2<<32; + if(runtime·cas64(victimpos, &pos, newpos)) { + begin = begin2; + break; + } + } + if(begin < end) { + // Has successfully stolen some work. 
+ if(idle) + runtime·throw("parfor: should not be idle"); + runtime·atomicstore64(mypos, (uint64)begin | (uint64)end<<32); + me->nsteal++; + me->nstealcnt += end-begin; + break; + } + // Backoff. + if(try < desc->nthr) { + // nothing + } else if (try < 4*desc->nthr) { + me->nprocyield++; + runtime·procyield(20); + // If a caller asked not to wait for the others, exit now + // (assume that most work is already done at this point). + } else if (!desc->wait) { + if(!idle) + runtime·xadd(&desc->done, 1); + goto exit; + } else if (try < 6*desc->nthr) { + me->nosyield++; + runtime·osyield(); + } else { + me->nsleep++; + runtime·usleep(1); + } + } + } +exit: + runtime·xadd64(&desc->nsteal, me->nsteal); + runtime·xadd64(&desc->nstealcnt, me->nstealcnt); + runtime·xadd64(&desc->nprocyield, me->nprocyield); + runtime·xadd64(&desc->nosyield, me->nosyield); + runtime·xadd64(&desc->nsleep, me->nsleep); + me->nsteal = 0; + me->nstealcnt = 0; + me->nprocyield = 0; + me->nosyield = 0; + me->nsleep = 0; +} + +// For testing from Go +// func parforiters(desc *ParFor, tid uintptr) (uintptr, uintptr) +void +runtime·parforiters(ParFor *desc, uintptr tid, uintptr start, uintptr end) +{ + start = (uint32)desc->thr[tid].pos; + end = (uint32)(desc->thr[tid].pos>>32); + FLUSH(&start); + FLUSH(&end); +} diff --git a/src/pkg/runtime/parfor_test.go b/src/pkg/runtime/parfor_test.go new file mode 100644 index 000000000..4c69a68ce --- /dev/null +++ b/src/pkg/runtime/parfor_test.go @@ -0,0 +1,144 @@ +// Copyright 2012 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// The race detector does not understand ParFor synchronization. +// +build !race + +package runtime_test + +import ( + . "runtime" + "testing" + "unsafe" +) + +var gdata []uint64 + +// Simple serial sanity test for parallelfor. +func TestParFor(t *testing.T) { + const P = 1 + const N = 20 + data := make([]uint64, N) + for i := uint64(0); i < N; i++ { + data[i] = i + } + desc := NewParFor(P) + // Avoid making func a closure: parfor cannot invoke them. + // Since it doesn't happen in the C code, it's not worth doing + // just for the test. + gdata = data + ParForSetup(desc, P, N, nil, true, func(desc *ParFor, i uint32) { + data := gdata + data[i] = data[i]*data[i] + 1 + }) + ParForDo(desc) + for i := uint64(0); i < N; i++ { + if data[i] != i*i+1 { + t.Fatalf("Wrong element %d: %d", i, data[i]) + } + } +} + +// Test that nonblocking parallelfor does not block. +func TestParFor2(t *testing.T) { + const P = 7 + const N = 1003 + data := make([]uint64, N) + for i := uint64(0); i < N; i++ { + data[i] = i + } + desc := NewParFor(P) + ParForSetup(desc, P, N, (*byte)(unsafe.Pointer(&data)), false, func(desc *ParFor, i uint32) { + d := *(*[]uint64)(unsafe.Pointer(desc.Ctx)) + d[i] = d[i]*d[i] + 1 + }) + for p := 0; p < P; p++ { + ParForDo(desc) + } + for i := uint64(0); i < N; i++ { + if data[i] != i*i+1 { + t.Fatalf("Wrong element %d: %d", i, data[i]) + } + } +} + +// Test that iterations are properly distributed. 
+func TestParForSetup(t *testing.T) { + const P = 11 + const N = 101 + desc := NewParFor(P) + for n := uint32(0); n < N; n++ { + for p := uint32(1); p <= P; p++ { + ParForSetup(desc, p, n, nil, true, func(desc *ParFor, i uint32) {}) + sum := uint32(0) + size0 := uint32(0) + end0 := uint32(0) + for i := uint32(0); i < p; i++ { + begin, end := ParForIters(desc, i) + size := end - begin + sum += size + if i == 0 { + size0 = size + if begin != 0 { + t.Fatalf("incorrect begin: %d (n=%d, p=%d)", begin, n, p) + } + } else { + if size != size0 && size != size0+1 { + t.Fatalf("incorrect size: %d/%d (n=%d, p=%d)", size, size0, n, p) + } + if begin != end0 { + t.Fatalf("incorrect begin/end: %d/%d (n=%d, p=%d)", begin, end0, n, p) + } + } + end0 = end + } + if sum != n { + t.Fatalf("incorrect sum: %d/%d (p=%d)", sum, n, p) + } + } + } +} + +// Test parallel parallelfor. +func TestParForParallel(t *testing.T) { + if GOARCH != "amd64" { + t.Log("temporarily disabled, see http://golang.org/issue/4155") + return + } + + N := uint64(1e7) + if testing.Short() { + N /= 10 + } + data := make([]uint64, N) + for i := uint64(0); i < N; i++ { + data[i] = i + } + P := GOMAXPROCS(-1) + c := make(chan bool, P) + desc := NewParFor(uint32(P)) + gdata = data + ParForSetup(desc, uint32(P), uint32(N), nil, false, func(desc *ParFor, i uint32) { + data := gdata + data[i] = data[i]*data[i] + 1 + }) + for p := 1; p < P; p++ { + go func() { + ParForDo(desc) + c <- true + }() + } + ParForDo(desc) + for p := 1; p < P; p++ { + <-c + } + for i := uint64(0); i < N; i++ { + if data[i] != i*i+1 { + t.Fatalf("Wrong element %d: %d", i, data[i]) + } + } + + data, desc = nil, nil + GC() +} diff --git a/src/pkg/runtime/pprof/pprof.go b/src/pkg/runtime/pprof/pprof.go index f67e8a8f9..32c1098b9 100644 --- a/src/pkg/runtime/pprof/pprof.go +++ b/src/pkg/runtime/pprof/pprof.go @@ -36,8 +36,9 @@ import ( // goroutine - stack traces of all current goroutines // heap - a sampling of all heap allocations // threadcreate - stack traces that led to the creation of new OS threads +// block - stack traces that led to blocking on synchronization primitives // -// These predefine profiles maintain themselves and panic on an explicit +// These predefined profiles maintain themselves and panic on an explicit // Add or Remove method call. // // The CPU profile is not available as a Profile. It has a special API, @@ -76,6 +77,12 @@ var heapProfile = &Profile{ write: writeHeap, } +var blockProfile = &Profile{ + name: "block", + count: countBlock, + write: writeBlock, +} + func lockProfiles() { profiles.mu.Lock() if profiles.m == nil { @@ -84,6 +91,7 @@ func lockProfiles() { "goroutine": goroutineProfile, "threadcreate": threadcreateProfile, "heap": heapProfile, + "block": blockProfile, } } } @@ -310,21 +318,33 @@ func printCountProfile(w io.Writer, debug int, name string, p countProfile) erro // for a single stack trace. func printStackRecord(w io.Writer, stk []uintptr, allFrames bool) { show := allFrames - for _, pc := range stk { + wasPanic := false + for i, pc := range stk { f := runtime.FuncForPC(pc) if f == nil { show = true fmt.Fprintf(w, "#\t%#x\n", pc) + wasPanic = false } else { - file, line := f.FileLine(pc) + tracepc := pc + // Back up to call instruction. + if i > 0 && pc > f.Entry() && !wasPanic { + if runtime.GOARCH == "386" || runtime.GOARCH == "amd64" { + tracepc-- + } else { + tracepc -= 4 // arm, etc + } + } + file, line := f.FileLine(tracepc) name := f.Name() // Hide runtime.goexit and any runtime functions at the beginning. 
// This is useful mainly for allocation traces. + wasPanic = name == "runtime.panic" if name == "runtime.goexit" || !show && strings.HasPrefix(name, "runtime.") { continue } show = true - fmt.Fprintf(w, "#\t%#x\t%s+%#x\t%s:%d\n", pc, f.Name(), pc-f.Entry(), file, line) + fmt.Fprintf(w, "#\t%#x\t%s+%#x\t%s:%d\n", pc, name, pc-f.Entry(), file, line) } } if !show { @@ -352,26 +372,26 @@ func WriteHeapProfile(w io.Writer) error { // countHeap returns the number of records in the heap profile. func countHeap() int { - n, _ := runtime.MemProfile(nil, false) + n, _ := runtime.MemProfile(nil, true) return n } -// writeHeapProfile writes the current runtime heap profile to w. +// writeHeap writes the current runtime heap profile to w. func writeHeap(w io.Writer, debug int) error { - // Find out how many records there are (MemProfile(nil, false)), + // Find out how many records there are (MemProfile(nil, true)), // allocate that many records, and get the data. // There's a race—more records might be added between // the two calls—so allocate a few extra records for safety // and also try again if we're very unlucky. // The loop should only execute one iteration in the common case. var p []runtime.MemProfileRecord - n, ok := runtime.MemProfile(nil, false) + n, ok := runtime.MemProfile(nil, true) for { // Allocate room for a slightly bigger profile, // in case a few more entries have been added // since the call to MemProfile. p = make([]runtime.MemProfileRecord, n+50) - n, ok = runtime.MemProfile(p, false) + n, ok = runtime.MemProfile(p, true) if ok { p = p[0:n] break @@ -431,11 +451,14 @@ func writeHeap(w io.Writer, debug int) error { fmt.Fprintf(w, "# Sys = %d\n", s.Sys) fmt.Fprintf(w, "# Lookups = %d\n", s.Lookups) fmt.Fprintf(w, "# Mallocs = %d\n", s.Mallocs) + fmt.Fprintf(w, "# Frees = %d\n", s.Frees) fmt.Fprintf(w, "# HeapAlloc = %d\n", s.HeapAlloc) fmt.Fprintf(w, "# HeapSys = %d\n", s.HeapSys) fmt.Fprintf(w, "# HeapIdle = %d\n", s.HeapIdle) fmt.Fprintf(w, "# HeapInuse = %d\n", s.HeapInuse) + fmt.Fprintf(w, "# HeapReleased = %d\n", s.HeapReleased) + fmt.Fprintf(w, "# HeapObjects = %d\n", s.HeapObjects) fmt.Fprintf(w, "# Stack = %d / %d\n", s.StackInuse, s.StackSys) fmt.Fprintf(w, "# MSpan = %d / %d\n", s.MSpanInuse, s.MSpanSys) @@ -597,3 +620,60 @@ func StopCPUProfile() { runtime.SetCPUProfileRate(0) <-cpu.done } + +type byCycles []runtime.BlockProfileRecord + +func (x byCycles) Len() int { return len(x) } +func (x byCycles) Swap(i, j int) { x[i], x[j] = x[j], x[i] } +func (x byCycles) Less(i, j int) bool { return x[i].Cycles > x[j].Cycles } + +// countBlock returns the number of records in the blocking profile. +func countBlock() int { + n, _ := runtime.BlockProfile(nil) + return n +} + +// writeBlock writes the current blocking profile to w. 
+func writeBlock(w io.Writer, debug int) error { + var p []runtime.BlockProfileRecord + n, ok := runtime.BlockProfile(nil) + for { + p = make([]runtime.BlockProfileRecord, n+50) + n, ok = runtime.BlockProfile(p) + if ok { + p = p[:n] + break + } + } + + sort.Sort(byCycles(p)) + + b := bufio.NewWriter(w) + var tw *tabwriter.Writer + w = b + if debug > 0 { + tw = tabwriter.NewWriter(w, 1, 8, 1, '\t', 0) + w = tw + } + + fmt.Fprintf(w, "--- contention:\n") + fmt.Fprintf(w, "cycles/second=%v\n", runtime_cyclesPerSecond()) + for i := range p { + r := &p[i] + fmt.Fprintf(w, "%v %v @", r.Cycles, r.Count) + for _, pc := range r.Stack() { + fmt.Fprintf(w, " %#x", pc) + } + fmt.Fprint(w, "\n") + if debug > 0 { + printStackRecord(w, r.Stack(), false) + } + } + + if tw != nil { + tw.Flush() + } + return b.Flush() +} + +func runtime_cyclesPerSecond() int64 diff --git a/src/pkg/runtime/pprof/pprof_test.go b/src/pkg/runtime/pprof/pprof_test.go index 82bb2a292..6d5764f4a 100644 --- a/src/pkg/runtime/pprof/pprof_test.go +++ b/src/pkg/runtime/pprof/pprof_test.go @@ -26,8 +26,7 @@ func TestCPUProfile(t *testing.T) { t.Logf("uname -a: %v", vers) // Lion uses "Darwin Kernel Version 11". if strings.Contains(vers, "Darwin Kernel Version 10") && strings.Contains(vers, "RELEASE_X86_64") { - t.Logf("skipping test on known-broken kernel (64-bit Leopard / Snow Leopard)") - return + t.Skip("skipping test on known-broken kernel (64-bit Leopard / Snow Leopard)") } case "plan9": // unimplemented @@ -49,19 +48,25 @@ func TestCPUProfile(t *testing.T) { // Convert []byte to []uintptr. bytes := prof.Bytes() + l := len(bytes) / int(unsafe.Sizeof(uintptr(0))) val := *(*[]uintptr)(unsafe.Pointer(&bytes)) - val = val[:len(bytes)/int(unsafe.Sizeof(uintptr(0)))] + val = val[:l] - if len(val) < 10 { + if l < 13 { t.Fatalf("profile too short: %#x", val) } - if val[0] != 0 || val[1] != 3 || val[2] != 0 || val[3] != 1e6/100 || val[4] != 0 { - t.Fatalf("unexpected header %#x", val[:5]) + + hd, val, tl := val[:5], val[5:l-3], val[l-3:] + if hd[0] != 0 || hd[1] != 3 || hd[2] != 0 || hd[3] != 1e6/100 || hd[4] != 0 { + t.Fatalf("unexpected header %#x", hd) + } + + if tl[0] != 0 || tl[1] != 1 || tl[2] != 0 { + t.Fatalf("malformed end-of-data marker %#x", tl) } // Check that profile is well formed and contains ChecksumIEEE. found := false - val = val[5:] for len(val) > 0 { if len(val) < 2 || val[0] < 1 || val[1] < 1 || uintptr(len(val)) < 2+val[1] { t.Fatalf("malformed profile. 
leftover: %#x", val) diff --git a/src/pkg/runtime/print.c b/src/pkg/runtime/print.c index 6702c3cde..5b601599b 100644 --- a/src/pkg/runtime/print.c +++ b/src/pkg/runtime/print.c @@ -18,10 +18,10 @@ gwrite(void *v, int32 n) runtime·write(2, v, n); return; } - + if(g->writenbuf == 0) return; - + if(n > g->writenbuf) n = g->writenbuf; runtime·memmove(g->writebuf, v, n); @@ -84,40 +84,41 @@ vprintf(int8 *s, byte *base) narg = 0; switch(*p) { case 't': + case 'c': narg = arg + 1; break; case 'd': // 32-bit case 'x': - arg = runtime·rnd(arg, 4); + arg = ROUND(arg, 4); narg = arg + 4; break; case 'D': // 64-bit case 'U': case 'X': case 'f': - arg = runtime·rnd(arg, sizeof(uintptr)); + arg = ROUND(arg, sizeof(uintptr)); narg = arg + 8; break; case 'C': - arg = runtime·rnd(arg, sizeof(uintptr)); + arg = ROUND(arg, sizeof(uintptr)); narg = arg + 16; break; case 'p': // pointer-sized case 's': - arg = runtime·rnd(arg, sizeof(uintptr)); + arg = ROUND(arg, sizeof(uintptr)); narg = arg + sizeof(uintptr); break; case 'S': // pointer-aligned but bigger - arg = runtime·rnd(arg, sizeof(uintptr)); + arg = ROUND(arg, sizeof(uintptr)); narg = arg + sizeof(String); break; case 'a': // pointer-aligned but bigger - arg = runtime·rnd(arg, sizeof(uintptr)); + arg = ROUND(arg, sizeof(uintptr)); narg = arg + sizeof(Slice); break; case 'i': // pointer-aligned but bigger case 'e': - arg = runtime·rnd(arg, sizeof(uintptr)); + arg = ROUND(arg, sizeof(uintptr)); narg = arg + sizeof(Eface); break; } @@ -126,6 +127,9 @@ vprintf(int8 *s, byte *base) case 'a': runtime·printslice(*(Slice*)v); break; + case 'c': + runtime·printbyte(*(int8*)v); + break; case 'd': runtime·printint(*(int32*)v); break; @@ -203,21 +207,27 @@ runtime·printbool(bool v) } void +runtime·printbyte(int8 c) +{ + gwrite(&c, 1); +} + +void runtime·printfloat(float64 v) { byte buf[20]; int32 e, s, i, n; float64 h; - if(runtime·isNaN(v)) { + if(ISNAN(v)) { gwrite("NaN", 3); return; } - if(runtime·isInf(v, 1)) { + if(v == runtime·posinf) { gwrite("+Inf", 4); return; } - if(runtime·isInf(v, -1)) { + if(v == runtime·neginf) { gwrite("-Inf", 4); return; } @@ -343,7 +353,7 @@ runtime·printstring(String v) extern uint32 runtime·maxstring; if(v.len > runtime·maxstring) { - gwrite("[invalid string]", 16); + gwrite("[string too long]", 17); return; } if(v.len > 0) diff --git a/src/pkg/runtime/proc.c b/src/pkg/runtime/proc.c index 04a992628..4ce0a718c 100644 --- a/src/pkg/runtime/proc.c +++ b/src/pkg/runtime/proc.c @@ -4,169 +4,112 @@ #include "runtime.h" #include "arch_GOARCH.h" -#include "defs_GOOS_GOARCH.h" #include "malloc.h" -#include "os_GOOS.h" #include "stack.h" +#include "race.h" +#include "type.h" -bool runtime·iscgo; - -static void unwindstack(G*, byte*); -static void schedule(G*); - -typedef struct Sched Sched; - -M runtime·m0; -G runtime·g0; // idle goroutine for m0 - -static int32 debug = 0; - -int32 runtime·gcwaiting; - -// Go scheduler -// -// The go scheduler's job is to match ready-to-run goroutines (`g's) -// with waiting-for-work schedulers (`m's). If there are ready g's -// and no waiting m's, ready() will start a new m running in a new -// OS thread, so that all ready g's can run simultaneously, up to a limit. -// For now, m's never go away. -// -// By default, Go keeps only one kernel thread (m) running user code -// at a single time; other threads may be blocked in the operating system. -// Setting the environment variable $GOMAXPROCS or calling -// runtime.GOMAXPROCS() will change the number of user threads -// allowed to execute simultaneously. 
$GOMAXPROCS is thus an -// approximation of the maximum number of cores to use. +// Goroutine scheduler +// The scheduler's job is to distribute ready-to-run goroutines over worker threads. // -// Even a program that can run without deadlock in a single process -// might use more m's if given the chance. For example, the prime -// sieve will use as many m's as there are primes (up to runtime·sched.mmax), -// allowing different stages of the pipeline to execute in parallel. -// We could revisit this choice, only kicking off new m's for blocking -// system calls, but that would limit the amount of parallel computation -// that go would try to do. +// The main concepts are: +// G - goroutine. +// M - worker thread, or machine. +// P - processor, a resource that is required to execute Go code. +// M must have an associated P to execute Go code, however it can be +// blocked or in a syscall w/o an associated P. // -// In general, one could imagine all sorts of refinements to the -// scheduler, but the goal now is just to get something working on -// Linux and OS X. +// Design doc at http://golang.org/s/go11sched. +typedef struct Sched Sched; struct Sched { Lock; - G *gfree; // available g's (status == Gdead) - int32 goidgen; + uint64 goidgen; - G *ghead; // g's waiting to run - G *gtail; - int32 gwait; // number of g's waiting to run - int32 gcount; // number of g's that are alive - int32 grunning; // number of g's running on cpu or in syscall + M* midle; // idle m's waiting for work + int32 nmidle; // number of idle m's waiting for work + int32 mlocked; // number of locked m's waiting for work + int32 mcount; // number of m's that have been created - M *mhead; // m's waiting for work - int32 mwait; // number of m's waiting for work - int32 mcount; // number of m's that have been created + P* pidle; // idle P's + uint32 npidle; + uint32 nmspinning; - volatile uint32 atomic; // atomic scheduling word (see below) + // Global runnable queue. + G* runqhead; + G* runqtail; + int32 runqsize; - int32 profilehz; // cpu profiling rate + // Global cache of dead G's. + Lock gflock; + G* gfree; - bool init; // running initialization - bool lockmain; // init called runtime.LockOSThread + int32 stopwait; + Note stopnote; + uint32 sysmonwait; + Note sysmonnote; - Note stopped; // one g can set waitstop and wait here for m's to stop + int32 profilehz; // cpu profiling rate }; -// The atomic word in sched is an atomic uint32 that -// holds these fields. -// -// [15 bits] mcpu number of m's executing on cpu -// [15 bits] mcpumax max number of m's allowed on cpu -// [1 bit] waitstop some g is waiting on stopped -// [1 bit] gwaiting gwait != 0 -// -// These fields are the information needed by entersyscall -// and exitsyscall to decide whether to coordinate with the -// scheduler. Packing them into a single machine word lets -// them use a fast path with a single atomic read/write and -// no lock/unlock. This greatly reduces contention in -// syscall- or cgo-heavy multithreaded programs. -// -// Except for entersyscall and exitsyscall, the manipulations -// to these fields only happen while holding the schedlock, -// so the routines holding schedlock only need to worry about -// what entersyscall and exitsyscall do, not the other routines -// (which also use the schedlock). -// -// In particular, entersyscall and exitsyscall only read mcpumax, -// waitstop, and gwaiting. They never write them. Thus, writes to those -// fields can be done (holding schedlock) without fear of write conflicts. 
-// There may still be logic conflicts: for example, the set of waitstop must -// be conditioned on mcpu >= mcpumax or else the wait may be a -// spurious sleep. The Promela model in proc.p verifies these accesses. -enum { - mcpuWidth = 15, - mcpuMask = (1<<mcpuWidth) - 1, - mcpuShift = 0, - mcpumaxShift = mcpuShift + mcpuWidth, - waitstopShift = mcpumaxShift + mcpuWidth, - gwaitingShift = waitstopShift+1, - - // The max value of GOMAXPROCS is constrained - // by the max value we can store in the bit fields - // of the atomic word. Reserve a few high values - // so that we can detect accidental decrement - // beyond zero. - maxgomaxprocs = mcpuMask - 10, -}; - -#define atomic_mcpu(v) (((v)>>mcpuShift)&mcpuMask) -#define atomic_mcpumax(v) (((v)>>mcpumaxShift)&mcpuMask) -#define atomic_waitstop(v) (((v)>>waitstopShift)&1) -#define atomic_gwaiting(v) (((v)>>gwaitingShift)&1) - -Sched runtime·sched; -int32 runtime·gomaxprocs; -bool runtime·singleproc; - -static bool canaddmcpu(void); - -// An m that is waiting for notewakeup(&m->havenextg). This may -// only be accessed while the scheduler lock is held. This is used to -// minimize the number of times we call notewakeup while the scheduler -// lock is held, since the m will normally move quickly to lock the -// scheduler itself, producing lock contention. -static M* mwakeup; - -// Scheduling helpers. Sched must be locked. -static void gput(G*); // put/get on ghead/gtail -static G* gget(void); -static void mput(M*); // put/get on mhead -static M* mget(G*); -static void gfput(G*); // put/get on gfree -static G* gfget(void); -static void matchmg(void); // match m's to g's -static void readylocked(G*); // ready, but sched is locked -static void mnextg(M*, G*); -static void mcommoninit(M*); - -void -setmcpumax(uint32 n) -{ - uint32 v, w; - - for(;;) { - v = runtime·sched.atomic; - w = v; - w &= ~(mcpuMask<<mcpumaxShift); - w |= n<<mcpumaxShift; - if(runtime·cas(&runtime·sched.atomic, v, w)) - break; - } -} +// The max value of GOMAXPROCS. +// There are no fundamental restrictions on the value. +enum { MaxGomaxprocs = 1<<8 }; +Sched runtime·sched; +int32 runtime·gomaxprocs; +bool runtime·singleproc; +bool runtime·iscgo; +uint32 runtime·gcwaiting; +M runtime·m0; +G runtime·g0; // idle goroutine for m0 +G* runtime·allg; +G* runtime·lastg; +M* runtime·allm; +M* runtime·extram; +int8* runtime·goos; +int32 runtime·ncpu; +static int32 newprocs; // Keep trace of scavenger's goroutine for deadlock detection. 
static G *scvg; +void runtime·mstart(void); +static void runqput(P*, G*); +static G* runqget(P*); +static void runqgrow(P*); +static G* runqsteal(P*, P*); +static void mput(M*); +static M* mget(void); +static void mcommoninit(M*); +static void schedule(void); +static void procresize(int32); +static void acquirep(P*); +static P* releasep(void); +static void newm(void(*)(void), P*); +static void goidle(void); +static void stopm(void); +static void startm(P*, bool); +static void handoffp(P*); +static void wakep(void); +static void stoplockedm(void); +static void startlockedm(G*); +static void sysmon(void); +static uint32 retake(uint32*); +static void inclocked(int32); +static void checkdead(void); +static void exitsyscall0(G*); +static void park0(G*); +static void gosched0(G*); +static void goexit0(G*); +static void gfput(P*, G*); +static G* gfget(P*); +static void gfpurge(P*); +static void globrunqput(G*); +static G* globrunqget(P*); +static P* pidleget(void); +static void pidleput(P*); + // The bootstrap sequence is: // // call osinit @@ -178,10 +121,11 @@ static G *scvg; void runtime·schedinit(void) { - int32 n; + int32 n, procs; byte *p; m->nomemprof++; + runtime·mprofinit(); runtime·mallocinit(); mcommoninit(m); @@ -193,93 +137,70 @@ runtime·schedinit(void) // so that we don't need to call malloc when we crash. // runtime·findfunc(0); - runtime·gomaxprocs = 1; + procs = 1; p = runtime·getenv("GOMAXPROCS"); - if(p != nil && (n = runtime·atoi(p)) != 0) { - if(n > maxgomaxprocs) - n = maxgomaxprocs; - runtime·gomaxprocs = n; + if(p != nil && (n = runtime·atoi(p)) > 0) { + if(n > MaxGomaxprocs) + n = MaxGomaxprocs; + procs = n; } - // wait for the main goroutine to start before taking - // GOMAXPROCS into account. - setmcpumax(1); - runtime·singleproc = runtime·gomaxprocs == 1; - - canaddmcpu(); // mcpu++ to account for bootstrap m - m->helpgc = 1; // flag to tell schedule() to mcpu-- - runtime·sched.grunning++; + runtime·allp = runtime·malloc((MaxGomaxprocs+1)*sizeof(runtime·allp[0])); + procresize(procs); mstats.enablegc = 1; m->nomemprof--; + + if(raceenabled) + g->racectx = runtime·raceinit(); } extern void main·init(void); extern void main·main(void); +static FuncVal scavenger = {runtime·MHeap_Scavenger}; + // The main goroutine. void runtime·main(void) { + newm(sysmon, nil); + // Lock the main goroutine onto this, the main OS thread, // during initialization. Most programs won't care, but a few // do require certain calls to be made by the main thread. // Those can arrange for main.main to run in the main thread // by calling runtime.LockOSThread during initialization // to preserve the lock. - runtime·LockOSThread(); - // From now on, newgoroutines may use non-main threads. - setmcpumax(runtime·gomaxprocs); - runtime·sched.init = true; - scvg = runtime·newproc1((byte*)runtime·MHeap_Scavenger, nil, 0, 0, runtime·main); + runtime·lockOSThread(); + if(m != &runtime·m0) + runtime·throw("runtime·main not on m0"); + scvg = runtime·newproc1(&scavenger, nil, 0, 0, runtime·main); + scvg->issystem = true; main·init(); - runtime·sched.init = false; - if(!runtime·sched.lockmain) - runtime·UnlockOSThread(); - - // The deadlock detection has false negatives. - // Let scvg start up, to eliminate the false negative - // for the trivial program func main() { select{} }. 
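runtime·schedinit above reads $GOMAXPROCS once at startup and clamps it to MaxGomaxprocs (1<<8); later changes go through runtime.GOMAXPROCS, which resizes the P set via procresize. From the Go side:

package main

import (
	"fmt"
	"runtime"
)

func main() {
	fmt.Println("initial:", runtime.GOMAXPROCS(0)) // 0 queries without changing
	old := runtime.GOMAXPROCS(4)                   // takes the procresize path
	fmt.Println("was", old, "now", runtime.GOMAXPROCS(0))
}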
- runtime·gosched(); + runtime·unlockOSThread(); main·main(); + if(raceenabled) + runtime·racefini(); + + // Make racy client program work: if panicking on + // another goroutine at the same time as main returns, + // let the other goroutine finish printing the panic trace. + // Once it does, it will exit. See issue 3934. + if(runtime·panicking) + runtime·park(nil, nil, "panicwait"); + runtime·exit(0); for(;;) *(int32*)runtime·main = 0; } -// Lock the scheduler. -static void -schedlock(void) -{ - runtime·lock(&runtime·sched); -} - -// Unlock the scheduler. -static void -schedunlock(void) -{ - M *m; - - m = mwakeup; - mwakeup = nil; - runtime·unlock(&runtime·sched); - if(m != nil) - runtime·notewakeup(&m->havenextg); -} - -void -runtime·goexit(void) -{ - g->status = Gmoribund; - runtime·gosched(); -} - void -runtime·goroutineheader(G *g) +runtime·goroutineheader(G *gp) { int8 *status; - switch(g->status) { + switch(gp->status) { case Gidle: status = "idle"; break; @@ -293,609 +214,748 @@ runtime·goroutineheader(G *g) status = "syscall"; break; case Gwaiting: - if(g->waitreason) - status = g->waitreason; + if(gp->waitreason) + status = gp->waitreason; else status = "waiting"; break; - case Gmoribund: - status = "moribund"; - break; default: status = "???"; break; } - runtime·printf("goroutine %d [%s]:\n", g->goid, status); + runtime·printf("goroutine %D [%s]:\n", gp->goid, status); } void runtime·tracebackothers(G *me) { - G *g; + G *gp; + int32 traceback; - for(g = runtime·allg; g != nil; g = g->alllink) { - if(g == me || g->status == Gdead) + traceback = runtime·gotraceback(); + for(gp = runtime·allg; gp != nil; gp = gp->alllink) { + if(gp == me || gp->status == Gdead) + continue; + if(gp->issystem && traceback < 2) continue; runtime·printf("\n"); - runtime·goroutineheader(g); - runtime·traceback(g->sched.pc, g->sched.sp, 0, g); + runtime·goroutineheader(gp); + runtime·traceback(gp->sched.pc, (byte*)gp->sched.sp, 0, gp); } } -// Mark this g as m's idle goroutine. -// This functionality might be used in environments where programs -// are limited to a single thread, to simulate a select-driven -// network server. It is not exposed via the standard runtime API. -void -runtime·idlegoroutine(void) -{ - if(g->idlem != nil) - runtime·throw("g is already an idle goroutine"); - g->idlem = m; -} - static void -mcommoninit(M *m) +mcommoninit(M *mp) { - m->id = runtime·sched.mcount++; - m->fastrand = 0x49f6428aUL + m->id + runtime·cputicks(); - m->stackalloc = runtime·malloc(sizeof(*m->stackalloc)); - runtime·FixAlloc_Init(m->stackalloc, FixedStack, runtime·SysAlloc, nil, nil); + // If there is no mcache runtime·callers() will crash, + // and we are most likely in sysmon thread so the stack is senseless anyway. + if(m->mcache) + runtime·callers(1, mp->createstack, nelem(mp->createstack)); - if(m->mcache == nil) - m->mcache = runtime·allocmcache(); + mp->fastrand = 0x49f6428aUL + mp->id + runtime·cputicks(); - runtime·callers(1, m->createstack, nelem(m->createstack)); + runtime·lock(&runtime·sched); + mp->id = runtime·sched.mcount++; + + runtime·mpreinit(mp); // Add to runtime·allm so garbage collector doesn't free m // when it is just in a register or thread-local storage. - m->alllink = runtime·allm; + mp->alllink = runtime·allm; // runtime·NumCgoCall() iterates over allm w/o schedlock, // so we need to publish it safely. - runtime·atomicstorep(&runtime·allm, m); + runtime·atomicstorep(&runtime·allm, mp); + runtime·unlock(&runtime·sched); } -// Try to increment mcpu. Report whether succeeded. 
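runtime·goroutineheader and runtime·tracebackothers above format the per-goroutine headers and stacks in crash output; the same walk over allg is reachable from user code through runtime.Stack with all=true. For example:

package main

import (
	"fmt"
	"runtime"
	"time"
)

func main() {
	go func() { select {} }() // parks forever in a select wait state
	time.Sleep(10 * time.Millisecond)
	buf := make([]byte, 1<<16)
	n := runtime.Stack(buf, true) // true: include all goroutines
	fmt.Printf("%s", buf[:n])
}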
-static bool -canaddmcpu(void) +// Mark gp ready to run. +void +runtime·ready(G *gp) { - uint32 v; - - for(;;) { - v = runtime·sched.atomic; - if(atomic_mcpu(v) >= atomic_mcpumax(v)) - return 0; - if(runtime·cas(&runtime·sched.atomic, v, v+(1<<mcpuShift))) - return 1; + // Mark runnable. + if(gp->status != Gwaiting) { + runtime·printf("goroutine %D has status %d\n", gp->goid, gp->status); + runtime·throw("bad g->status in ready"); } + gp->status = Grunnable; + runqput(m->p, gp); + if(runtime·atomicload(&runtime·sched.npidle) != 0 && runtime·atomicload(&runtime·sched.nmspinning) == 0) // TODO: fast atomic + wakep(); } -// Put on `g' queue. Sched must be locked. -static void -gput(G *g) +int32 +runtime·gcprocs(void) { - M *m; - - // If g is wired, hand it off directly. - if((m = g->lockedm) != nil && canaddmcpu()) { - mnextg(m, g); - return; - } + int32 n; - // If g is the idle goroutine for an m, hand it off. - if(g->idlem != nil) { - if(g->idlem->idleg != nil) { - runtime·printf("m%d idle out of sync: g%d g%d\n", - g->idlem->id, - g->idlem->idleg->goid, g->goid); - runtime·throw("runtime: double idle"); - } - g->idlem->idleg = g; - return; - } + // Figure out how many CPUs to use during GC. + // Limited by gomaxprocs, number of actual CPUs, and MaxGcproc. + runtime·lock(&runtime·sched); + n = runtime·gomaxprocs; + if(n > runtime·ncpu) + n = runtime·ncpu; + if(n > MaxGcproc) + n = MaxGcproc; + if(n > runtime·sched.nmidle+1) // one M is currently running + n = runtime·sched.nmidle+1; + runtime·unlock(&runtime·sched); + return n; +} - g->schedlink = nil; - if(runtime·sched.ghead == nil) - runtime·sched.ghead = g; - else - runtime·sched.gtail->schedlink = g; - runtime·sched.gtail = g; +static bool +needaddgcproc(void) +{ + int32 n; - // increment gwait. - // if it transitions to nonzero, set atomic gwaiting bit. - if(runtime·sched.gwait++ == 0) - runtime·xadd(&runtime·sched.atomic, 1<<gwaitingShift); + runtime·lock(&runtime·sched); + n = runtime·gomaxprocs; + if(n > runtime·ncpu) + n = runtime·ncpu; + if(n > MaxGcproc) + n = MaxGcproc; + n -= runtime·sched.nmidle+1; // one M is currently running + runtime·unlock(&runtime·sched); + return n > 0; } -// Report whether gget would return something. -static bool -haveg(void) +void +runtime·helpgc(int32 nproc) { - return runtime·sched.ghead != nil || m->idleg != nil; + M *mp; + int32 n, pos; + + runtime·lock(&runtime·sched); + pos = 0; + for(n = 1; n < nproc; n++) { // one M is currently running + if(runtime·allp[pos]->mcache == m->mcache) + pos++; + mp = mget(); + if(mp == nil) + runtime·throw("runtime·gcprocs inconsistency"); + mp->helpgc = 1; + mp->mcache = runtime·allp[pos]->mcache; + pos++; + runtime·notewakeup(&mp->park); + } + runtime·unlock(&runtime·sched); } -// Get from `g' queue. Sched must be locked. -static G* -gget(void) +void +runtime·stoptheworld(void) { - G *g; + int32 i; + uint32 s; + P *p; + bool wait; - g = runtime·sched.ghead; - if(g){ - runtime·sched.ghead = g->schedlink; - if(runtime·sched.ghead == nil) - runtime·sched.gtail = nil; - // decrement gwait. - // if it transitions to zero, clear atomic gwaiting bit. 
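runtime·gcprocs above settles how many threads help with garbage collection: the minimum of GOMAXPROCS, the CPU count, the MaxGcproc constant, and the idle M's plus the one already running. The clamping, extracted into plain Go (maxGcproc stands in for the runtime constant):

package main

import "fmt"

func gcprocs(gomaxprocs, ncpu, maxGcproc, nmidle int) int {
	n := gomaxprocs
	if n > ncpu {
		n = ncpu
	}
	if n > maxGcproc {
		n = maxGcproc
	}
	if n > nmidle+1 {
		n = nmidle + 1 // one M is already running
	}
	return n
}

func main() {
	fmt.Println(gcprocs(8, 8, 16, 1)) // 2: only one extra M is idle
}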
- if(--runtime·sched.gwait == 0) - runtime·xadd(&runtime·sched.atomic, -1<<gwaitingShift); - } else if(m->idleg != nil) { - g = m->idleg; - m->idleg = nil; + runtime·lock(&runtime·sched); + runtime·sched.stopwait = runtime·gomaxprocs; + runtime·atomicstore((uint32*)&runtime·gcwaiting, 1); + // stop current P + m->p->status = Pgcstop; + runtime·sched.stopwait--; + // try to retake all P's in Psyscall status + for(i = 0; i < runtime·gomaxprocs; i++) { + p = runtime·allp[i]; + s = p->status; + if(s == Psyscall && runtime·cas(&p->status, s, Pgcstop)) + runtime·sched.stopwait--; + } + // stop idle P's + while(p = pidleget()) { + p->status = Pgcstop; + runtime·sched.stopwait--; + } + wait = runtime·sched.stopwait > 0; + runtime·unlock(&runtime·sched); + + // wait for remaining P's to stop voluntary + if(wait) { + runtime·notesleep(&runtime·sched.stopnote); + runtime·noteclear(&runtime·sched.stopnote); + } + if(runtime·sched.stopwait) + runtime·throw("stoptheworld: not stopped"); + for(i = 0; i < runtime·gomaxprocs; i++) { + p = runtime·allp[i]; + if(p->status != Pgcstop) + runtime·throw("stoptheworld: not stopped"); } - return g; } -// Put on `m' list. Sched must be locked. static void -mput(M *m) +mhelpgc(void) { - m->schedlink = runtime·sched.mhead; - runtime·sched.mhead = m; - runtime·sched.mwait++; + m->helpgc = 1; } -// Get an `m' to run `g'. Sched must be locked. -static M* -mget(G *g) +void +runtime·starttheworld(void) { - M *m; + P *p; + M *mp; + bool add; - // if g has its own m, use it. - if(g && (m = g->lockedm) != nil) - return m; + add = needaddgcproc(); + runtime·lock(&runtime·sched); + if(newprocs) { + procresize(newprocs); + newprocs = 0; + } else + procresize(runtime·gomaxprocs); + runtime·gcwaiting = 0; - // otherwise use general m pool. - if((m = runtime·sched.mhead) != nil){ - runtime·sched.mhead = m->schedlink; - runtime·sched.mwait--; + while(p = pidleget()) { + // procresize() puts p's with work at the beginning of the list. + // Once we reach a p without a run queue, the rest don't have one either. + if(p->runqhead == p->runqtail) { + pidleput(p); + break; + } + mp = mget(); + if(mp == nil) { + pidleput(p); + break; + } + if(mp->nextp) + runtime·throw("starttheworld: inconsistent mp->nextp"); + mp->nextp = p; + runtime·notewakeup(&mp->park); + } + if(runtime·sched.sysmonwait) { + runtime·sched.sysmonwait = false; + runtime·notewakeup(&runtime·sched.sysmonnote); + } + runtime·unlock(&runtime·sched); + + if(add) { + // If GC could have used another helper proc, start one now, + // in the hope that it will be available next time. + // It would have been even better to start it before the collection, + // but doing so requires allocating memory, so it's tricky to + // coordinate. This lazy approach works out in practice: + // we don't mind if the first couple gc rounds don't have quite + // the maximum number of procs. + newm(mhelpgc, nil); } - return m; } -// Mark g ready to run. +// Called to start an M. void -runtime·ready(G *g) +runtime·mstart(void) { - schedlock(); - readylocked(g); - schedunlock(); -} + // It is used by windows-386 only. Unfortunately, seh needs + // to be located on os stack, and mstart runs on os stack + // for both m0 and m. + SEH seh; -// Mark g ready to run. Sched is already locked. -// G might be running already and about to stop. -// The sched lock protects g->status from changing underfoot. -static void -readylocked(G *g) -{ - if(g->m){ - // Running on another machine. - // Ready it when it stops. 
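runtime·stoptheworld above claims P's three ways: the current P directly, idle P's off the pidle list, and P's parked in syscalls by CAS'ing Psyscall to Pgcstop, so a concurrently returning syscall cannot reclaim them. A toy of that last step (the status values here are illustrative, not the runtime's):

package main

import (
	"fmt"
	"sync/atomic"
)

const (
	Psyscall uint32 = 2 // illustrative values
	Pgcstop  uint32 = 3
)

func main() {
	status := []uint32{Psyscall, Psyscall, Pgcstop}
	stopwait := len(status)
	for i := range status {
		// CAS, not a plain store: a concurrent exitsyscall doing its
		// own CAS on the same word loses the race cleanly.
		if atomic.CompareAndSwapUint32(&status[i], Psyscall, Pgcstop) {
			stopwait--
		}
	}
	fmt.Println("P's left to stop on their own:", stopwait)
}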
- g->readyonstop = 1; - return; + if(g != m->g0) + runtime·throw("bad runtime·mstart"); + + // Record top of stack for use by mcall. + // Once we call schedule we're never coming back, + // so other calls can reuse this stack space. + runtime·gosave(&m->g0->sched); + m->g0->sched.pc = (void*)-1; // make sure it is never used + m->seh = &seh; + runtime·asminit(); + runtime·minit(); + + // Install signal handlers; after minit so that minit can + // prepare the thread to be able to handle the signals. + if(m == &runtime·m0) { + runtime·initsig(); + if(runtime·iscgo) + runtime·newextram(); } + + if(m->mstartfn) + m->mstartfn(); - // Mark runnable. - if(g->status == Grunnable || g->status == Grunning) { - runtime·printf("goroutine %d has status %d\n", g->goid, g->status); - runtime·throw("bad g->status in ready"); + if(m->helpgc) { + m->helpgc = false; + stopm(); + } else if(m != &runtime·m0) { + acquirep(m->nextp); + m->nextp = nil; } - g->status = Grunnable; + schedule(); - gput(g); - matchmg(); + // TODO(brainman): This point is never reached, because scheduler + // does not release os threads at the moment. But once this path + // is enabled, we must remove our seh here. } -static void -nop(void) -{ -} +// When running with cgo, we call _cgo_thread_start +// to start threads for us so that we can play nicely with +// foreign code. +void (*_cgo_thread_start)(void*); -// Same as readylocked but a different symbol so that -// debuggers can set a breakpoint here and catch all -// new goroutines. -static void -newprocreadylocked(G *g) +typedef struct CgoThreadStart CgoThreadStart; +struct CgoThreadStart { - nop(); // avoid inlining in 6l - readylocked(g); -} + M *m; + G *g; + void (*fn)(void); +}; -// Pass g to m for running. -// Caller has already incremented mcpu. -static void -mnextg(M *m, G *g) +// Allocate a new m unassociated with any thread. +// Can use p for allocation context if needed. +M* +runtime·allocm(P *p) { - runtime·sched.grunning++; - m->nextg = g; - if(m->waitnextg) { - m->waitnextg = 0; - if(mwakeup != nil) - runtime·notewakeup(&mwakeup->havenextg); - mwakeup = m; + M *mp; + static Type *mtype; // The Go type M + + m->locks++; // disable GC because it can be called from sysmon + if(m->p == nil) + acquirep(p); // temporarily borrow p for mallocs in this function + if(mtype == nil) { + Eface e; + runtime·gc_m_ptr(&e); + mtype = ((PtrType*)e.type)->elem; } -} -// Get the next goroutine that m should run. -// Sched must be locked on entry, is unlocked on exit. -// Makes sure that at most $GOMAXPROCS g's are -// running on cpus (not in system calls) at any given time. -static G* -nextgandunlock(void) -{ - G *gp; - uint32 v; + mp = runtime·cnew(mtype); + mcommoninit(mp); -top: - if(atomic_mcpu(runtime·sched.atomic) >= maxgomaxprocs) - runtime·throw("negative mcpu"); - - // If there is a g waiting as m->nextg, the mcpu++ - // happened before it was passed to mnextg. - if(m->nextg != nil) { - gp = m->nextg; - m->nextg = nil; - schedunlock(); - return gp; - } + // In case of cgo, pthread_create will make us a stack. + // Windows will layout sched stack on OS stack. + if(runtime·iscgo || Windows) + mp->g0 = runtime·malg(-1); + else + mp->g0 = runtime·malg(8192); - if(m->lockedg != nil) { - // We can only run one g, and it's not available. - // Make sure some other cpu is running to handle - // the ordinary run queue. - if(runtime·sched.gwait != 0) { - matchmg(); - // m->lockedg might have been on the queue. 
- if(m->nextg != nil) { - gp = m->nextg; - m->nextg = nil; - schedunlock(); - return gp; - } - } - } else { - // Look for work on global queue. - while(haveg() && canaddmcpu()) { - gp = gget(); - if(gp == nil) - runtime·throw("gget inconsistency"); - - if(gp->lockedm) { - mnextg(gp->lockedm, gp); - continue; - } - runtime·sched.grunning++; - schedunlock(); - return gp; - } + if(p == m->p) + releasep(); + m->locks--; - // The while loop ended either because the g queue is empty - // or because we have maxed out our m procs running go - // code (mcpu >= mcpumax). We need to check that - // concurrent actions by entersyscall/exitsyscall cannot - // invalidate the decision to end the loop. - // - // We hold the sched lock, so no one else is manipulating the - // g queue or changing mcpumax. Entersyscall can decrement - // mcpu, but if does so when there is something on the g queue, - // the gwait bit will be set, so entersyscall will take the slow path - // and use the sched lock. So it cannot invalidate our decision. - // - // Wait on global m queue. - mput(m); - } - - // Look for deadlock situation. - // There is a race with the scavenger that causes false negatives: - // if the scavenger is just starting, then we have - // scvg != nil && grunning == 0 && gwait == 0 - // and we do not detect a deadlock. It is possible that we should - // add that case to the if statement here, but it is too close to Go 1 - // to make such a subtle change. Instead, we work around the - // false negative in trivial programs by calling runtime.gosched - // from the main goroutine just before main.main. - // See runtime·main above. - // - // On a related note, it is also possible that the scvg == nil case is - // wrong and should include gwait, but that does not happen in - // standard Go programs, which all start the scavenger. - // - if((scvg == nil && runtime·sched.grunning == 0) || - (scvg != nil && runtime·sched.grunning == 1 && runtime·sched.gwait == 0 && - (scvg->status == Grunning || scvg->status == Gsyscall))) { - runtime·throw("all goroutines are asleep - deadlock!"); - } - - m->nextg = nil; - m->waitnextg = 1; - runtime·noteclear(&m->havenextg); - - // Stoptheworld is waiting for all but its cpu to go to stop. - // Entersyscall might have decremented mcpu too, but if so - // it will see the waitstop and take the slow path. - // Exitsyscall never increments mcpu beyond mcpumax. - v = runtime·atomicload(&runtime·sched.atomic); - if(atomic_waitstop(v) && atomic_mcpu(v) <= atomic_mcpumax(v)) { - // set waitstop = 0 (known to be 1) - runtime·xadd(&runtime·sched.atomic, -1<<waitstopShift); - runtime·notewakeup(&runtime·sched.stopped); - } - schedunlock(); - - runtime·notesleep(&m->havenextg); - if(m->helpgc) { - runtime·gchelper(); - m->helpgc = 0; - runtime·lock(&runtime·sched); - goto top; - } - if((gp = m->nextg) == nil) - runtime·throw("bad m->nextg in nextgoroutine"); - m->nextg = nil; - return gp; + return mp; } -int32 -runtime·helpgc(bool *extra) +static M* lockextra(bool nilokay); +static void unlockextra(M*); + +// needm is called when a cgo callback happens on a +// thread without an m (a thread not created by Go). +// In this case, needm is expected to find an m to use +// and return with m, g initialized correctly. +// Since m and g are not set now (likely nil, but see below) +// needm is limited in what routines it can call. In particular +// it can only call nosplit functions (textflag 7) and cannot +// do any scheduling that requires an m. 
+// +// In order to avoid needing heavy lifting here, we adopt +// the following strategy: there is a stack of available m's +// that can be stolen. Using compare-and-swap +// to pop from the stack has ABA races, so we simulate +// a lock by doing an exchange (via casp) to steal the stack +// head and replace the top pointer with MLOCKED (1). +// This serves as a simple spin lock that we can use even +// without an m. The thread that locks the stack in this way +// unlocks the stack by storing a valid stack head pointer. +// +// In order to make sure that there is always an m structure +// available to be stolen, we maintain the invariant that there +// is always one more than needed. At the beginning of the +// program (if cgo is in use) the list is seeded with a single m. +// If needm finds that it has taken the last m off the list, its job +// is - once it has installed its own m so that it can do things like +// allocate memory - to create a spare m and put it on the list. +// +// Each of these extra m's also has a g0 and a curg that are +// pressed into service as the scheduling stack and current +// goroutine for the duration of the cgo callback. +// +// When the callback is done with the m, it calls dropm to +// put the m back on the list. +#pragma textflag 7 +void +runtime·needm(byte x) { M *mp; - int32 n, max; - // Figure out how many CPUs to use. - // Limited by gomaxprocs, number of actual CPUs, and MaxGcproc. - max = runtime·gomaxprocs; - if(max > runtime·ncpu) - max = runtime·ncpu; - if(max > MaxGcproc) - max = MaxGcproc; + // Lock extra list, take head, unlock popped list. + // nilokay=false is safe here because of the invariant above, + // that the extra list always contains or will soon contain + // at least one m. + mp = lockextra(false); + + // Set needextram when we've just emptied the list, + // so that the eventual call into cgocallbackg will + // allocate a new m for the extra list. We delay the + // allocation until then so that it can be done + // after exitsyscall makes sure it is okay to be + // running at all (that is, there's no garbage collection + // running right now). + mp->needextram = mp->schedlink == nil; + unlockextra(mp->schedlink); + + // Install m and g (= m->g0) and set the stack bounds + // to match the current stack. We don't actually know + // how big the stack is, like we don't know how big any + // scheduling stack is, but we assume there's at least 32 kB, + // which is more than enough for us. + runtime·setmg(mp, mp->g0); + g->stackbase = (uintptr)(&x + 1024); + g->stackguard = (uintptr)(&x - 32*1024); + + // On windows/386, we need to put an SEH frame (two words) + // somewhere on the current stack. We are called + // from needm, and we know there is some available + // space one word into the argument frame. Use that. + m->seh = (SEH*)((uintptr*)&x + 1); + + // Initialize this thread to use the m. + runtime·asminit(); + runtime·minit(); +} - // We're going to use one CPU no matter what. - // Figure out the max number of additional CPUs. - max--; +// newextram allocates an m and puts it on the extra list. +// It is called with a working local m, so that it can do things +// like call schedlock and allocate. +void +runtime·newextram(void) +{ + M *mp, *mnext; + G *gp; + // Create extra goroutine locked to extra m. + // The goroutine is the context in which the cgo callback will run. + // The sched.pc will never be returned to, but setting it to + // runtime.goexit makes clear to the traceback routines where + // the goroutine stack ends. 
+ mp = runtime·allocm(nil); + gp = runtime·malg(4096); + gp->sched.pc = (void*)runtime·goexit; + gp->sched.sp = gp->stackbase; + gp->sched.g = gp; + gp->status = Gsyscall; + mp->curg = gp; + mp->locked = LockInternal; + mp->lockedg = gp; + gp->lockedm = mp; + // put on allg for garbage collector runtime·lock(&runtime·sched); - n = 0; - while(n < max && (mp = mget(nil)) != nil) { - n++; - mp->helpgc = 1; - mp->waitnextg = 0; - runtime·notewakeup(&mp->havenextg); - } + if(runtime·lastg == nil) + runtime·allg = gp; + else + runtime·lastg->alllink = gp; + runtime·lastg = gp; runtime·unlock(&runtime·sched); - if(extra) - *extra = n != max; - return n; + gp->goid = runtime·xadd64(&runtime·sched.goidgen, 1); + if(raceenabled) + gp->racectx = runtime·racegostart(runtime·newextram); + + // Add m to the extra list. + mnext = lockextra(true); + mp->schedlink = mnext; + unlockextra(mp); } +// dropm is called when a cgo callback has called needm but is now +// done with the callback and returning back into the non-Go thread. +// It puts the current m back onto the extra list. +// +// The main expense here is the call to signalstack to release the +// m's signal stack, and then the call to needm on the next callback +// from this thread. It is tempting to try to save the m for next time, +// which would eliminate both these costs, but there might not be +// a next time: the current thread (which Go does not control) might exit. +// If we saved the m for that thread, there would be an m leak each time +// such a thread exited. Instead, we acquire and release an m on each +// call. These should typically not be scheduling operations, just a few +// atomics, so the cost should be small. +// +// TODO(rsc): An alternative would be to allocate a dummy pthread per-thread +// variable using pthread_key_create. Unlike the pthread keys we already use +// on OS X, this dummy key would never be read by Go code. It would exist +// only so that we could register at thread-exit-time destructor. +// That destructor would put the m back onto the extra list. +// This is purely a performance optimization. The current version, +// in which dropm happens on each cgo call, is still correct too. +// We may have to keep the current version on systems with cgo +// but without pthreads, like Windows. void -runtime·stoptheworld(void) +runtime·dropm(void) { - uint32 v; + M *mp, *mnext; - schedlock(); - runtime·gcwaiting = 1; + // Undo whatever initialization minit did during needm. + runtime·unminit(); - setmcpumax(1); + // Clear m and g, and return m to the extra list. + // After the call to setmg we can only call nosplit functions. + mp = m; + runtime·setmg(nil, nil); - // while mcpu > 1 - for(;;) { - v = runtime·sched.atomic; - if(atomic_mcpu(v) <= 1) - break; + mnext = lockextra(true); + mp->schedlink = mnext; + unlockextra(mp); +} - // It would be unsafe for multiple threads to be using - // the stopped note at once, but there is only - // ever one thread doing garbage collection. - runtime·noteclear(&runtime·sched.stopped); - if(atomic_waitstop(v)) - runtime·throw("invalid waitstop"); +#define MLOCKED ((M*)1) - // atomic { waitstop = 1 }, predicated on mcpu <= 1 check above - // still being true. - if(!runtime·cas(&runtime·sched.atomic, v, v+(1<<waitstopShift))) - continue; +// lockextra locks the extra list and returns the list head. +// The caller must unlock the list by storing a new list head +// to runtime.extram. If nilokay is true, then lockextra will +// return a nil list head if that's what it finds. 
If nilokay is false, +// lockextra will keep waiting until the list head is no longer nil. +#pragma textflag 7 +static M* +lockextra(bool nilokay) +{ + M *mp; + void (*yield)(void); - schedunlock(); - runtime·notesleep(&runtime·sched.stopped); - schedlock(); + for(;;) { + mp = runtime·atomicloadp(&runtime·extram); + if(mp == MLOCKED) { + yield = runtime·osyield; + yield(); + continue; + } + if(mp == nil && !nilokay) { + runtime·usleep(1); + continue; + } + if(!runtime·casp(&runtime·extram, mp, MLOCKED)) { + yield = runtime·osyield; + yield(); + continue; + } + break; } - runtime·singleproc = runtime·gomaxprocs == 1; - schedunlock(); + return mp; } -void -runtime·starttheworld(bool extra) +#pragma textflag 7 +static void +unlockextra(M *mp) { - M *m; - - schedlock(); - runtime·gcwaiting = 0; - setmcpumax(runtime·gomaxprocs); - matchmg(); - if(extra && canaddmcpu()) { - // Start a new m that will (we hope) be idle - // and so available to help when the next - // garbage collection happens. - // canaddmcpu above did mcpu++ - // (necessary, because m will be doing various - // initialization work so is definitely running), - // but m is not running a specific goroutine, - // so set the helpgc flag as a signal to m's - // first schedule(nil) to mcpu-- and grunning--. - m = runtime·newm(); - m->helpgc = 1; - runtime·sched.grunning++; - } - schedunlock(); + runtime·atomicstorep(&runtime·extram, mp); } -// Called to start an M. -void -runtime·mstart(void) + +// Create a new m. It will start off with a call to fn, or else the scheduler. +static void +newm(void(*fn)(void), P *p) { - if(g != m->g0) - runtime·throw("bad runtime·mstart"); + M *mp; - // Record top of stack for use by mcall. - // Once we call schedule we're never coming back, - // so other calls can reuse this stack space. - runtime·gosave(&m->g0->sched); - m->g0->sched.pc = (void*)-1; // make sure it is never used - runtime·asminit(); - runtime·minit(); + mp = runtime·allocm(p); + mp->nextp = p; + mp->mstartfn = fn; - // Install signal handlers; after minit so that minit can - // prepare the thread to be able to handle the signals. - if(m == &runtime·m0) - runtime·initsig(); + if(runtime·iscgo) { + CgoThreadStart ts; - schedule(nil); + if(_cgo_thread_start == nil) + runtime·throw("_cgo_thread_start missing"); + ts.m = mp; + ts.g = mp->g0; + ts.fn = runtime·mstart; + runtime·asmcgocall(_cgo_thread_start, &ts); + return; + } + runtime·newosproc(mp, (byte*)mp->g0->stackbase); } -// When running with cgo, we call libcgo_thread_start -// to start threads for us so that we can play nicely with -// foreign code. -void (*libcgo_thread_start)(void*); +// Stops execution of the current m until new work is available. +// Returns with acquired P. +static void +stopm(void) +{ + if(m->locks) + runtime·throw("stopm holding locks"); + if(m->p) + runtime·throw("stopm holding p"); + if(m->spinning) { + m->spinning = false; + runtime·xadd(&runtime·sched.nmspinning, -1); + } -typedef struct CgoThreadStart CgoThreadStart; -struct CgoThreadStart +retry: + runtime·lock(&runtime·sched); + mput(m); + runtime·unlock(&runtime·sched); + runtime·notesleep(&m->park); + runtime·noteclear(&m->park); + if(m->helpgc) { + m->helpgc = 0; + runtime·gchelper(); + m->mcache = nil; + goto retry; + } + acquirep(m->nextp); + m->nextp = nil; +} + +static void +mspinning(void) { - M *m; - G *g; - void (*fn)(void); -}; + m->spinning = true; +} -// Kick off new m's as needed (up to mcpumax). -// Sched is locked. +// Schedules some M to run the p (creates an M if necessary). 
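stopm above and startm just below form a park/unpark pair: an idle M sleeps on its private note (m->park) and receives its next P through m->nextp. A sketch with a channel standing in for both the note and the handoff; a 1-buffered channel, like a note, tolerates the wakeup arriving before the sleep. All names here are invented, and the sched lock around the idle list is elided:

	package main

	import "fmt"

	type pp struct{ id int }

	// Each worker owns a park channel; buffered size 1 models the
	// notesleep/notewakeup pair, and the value models m->nextp.
	type worker struct {
		park chan *pp
	}

	var idle []*worker // like runtime·sched.midle (lock elided)

	// stopWorker: sleep until some P is handed over.
	func stopWorker(w *worker) *pp {
		return <-w.park // notesleep(&m->park), then acquirep(m->nextp)
	}

	// startWorker: wake an idle worker with p; false means the caller
	// would have to create a fresh worker (newm) instead.
	func startWorker(p *pp) bool {
		if len(idle) == 0 {
			return false
		}
		w := idle[len(idle)-1]
		idle = idle[:len(idle)-1]
		w.park <- p // mp->nextp = p; notewakeup(&mp->park)
		return true
	}

	func main() {
		w := &worker{park: make(chan *pp, 1)}
		idle = append(idle, w)  // mput: register as idle
		startWorker(&pp{id: 1}) // the wakeup may even precede the sleep
		fmt.Println("woke with P", stopWorker(w).id)
	}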
+// If p==nil, tries to get an idle P, if no idle P's returns false. static void -matchmg(void) +startm(P *p, bool spinning) { - G *gp; M *mp; + void (*fn)(void); - if(m->mallocing || m->gcing) + runtime·lock(&runtime·sched); + if(p == nil) { + p = pidleget(); + if(p == nil) { + runtime·unlock(&runtime·sched); + if(spinning) + runtime·xadd(&runtime·sched.nmspinning, -1); + return; + } + } + mp = mget(); + runtime·unlock(&runtime·sched); + if(mp == nil) { + fn = nil; + if(spinning) + fn = mspinning; + newm(fn, p); return; - - while(haveg() && canaddmcpu()) { - gp = gget(); - if(gp == nil) - runtime·throw("gget inconsistency"); - - // Find the m that will run gp. - if((mp = mget(gp)) == nil) - mp = runtime·newm(); - mnextg(mp, gp); } + if(mp->spinning) + runtime·throw("startm: m is spinning"); + if(mp->nextp) + runtime·throw("startm: m has p"); + mp->spinning = spinning; + mp->nextp = p; + runtime·notewakeup(&mp->park); } -// Create a new m. It will start off with a call to runtime·mstart. -M* -runtime·newm(void) +// Hands off P from syscall or locked M. +static void +handoffp(P *p) { - M *m; + // if it has local work, start it straight away + if(p->runqhead != p->runqtail || runtime·sched.runqsize) { + startm(p, false); + return; + } + // no local work, check that there are no spinning/idle M's, + // otherwise our help is not required + if(runtime·atomicload(&runtime·sched.nmspinning) + runtime·atomicload(&runtime·sched.npidle) == 0 && // TODO: fast atomic + runtime·cas(&runtime·sched.nmspinning, 0, 1)) { + startm(p, true); + return; + } + runtime·lock(&runtime·sched); + if(runtime·gcwaiting) { + p->status = Pgcstop; + if(--runtime·sched.stopwait == 0) + runtime·notewakeup(&runtime·sched.stopnote); + runtime·unlock(&runtime·sched); + return; + } + if(runtime·sched.runqsize) { + runtime·unlock(&runtime·sched); + startm(p, false); + return; + } + pidleput(p); + runtime·unlock(&runtime·sched); +} - m = runtime·malloc(sizeof(M)); - mcommoninit(m); +// Tries to add one more P to execute G's. +// Called when a G is made runnable (newproc, ready). +static void +wakep(void) +{ + // be conservative about spinning threads + if(!runtime·cas(&runtime·sched.nmspinning, 0, 1)) + return; + startm(nil, true); +} - if(runtime·iscgo) { - CgoThreadStart ts; +// Stops execution of the current m that is locked to a g until the g is runnable again. +// Returns with acquired P. +static void +stoplockedm(void) +{ + P *p; - if(libcgo_thread_start == nil) - runtime·throw("libcgo_thread_start missing"); - // pthread_create will make us a stack. - m->g0 = runtime·malg(-1); - ts.m = m; - ts.g = m->g0; - ts.fn = runtime·mstart; - runtime·asmcgocall(libcgo_thread_start, &ts); - } else { - if(Windows) - // windows will layout sched stack on os stack - m->g0 = runtime·malg(-1); - else - m->g0 = runtime·malg(8192); - runtime·newosproc(m, m->g0, m->g0->stackbase, runtime·mstart); + if(m->lockedg == nil || m->lockedg->lockedm != m) + runtime·throw("stoplockedm: inconsistent locking"); + if(m->p) { + // Schedule another M to run this p. + p = releasep(); + handoffp(p); } + inclocked(1); + // Wait until another thread schedules lockedg again. + runtime·notesleep(&m->park); + runtime·noteclear(&m->park); + if(m->lockedg->status != Grunnable) + runtime·throw("stoplockedm: not runnable"); + acquirep(m->nextp); + m->nextp = nil; +} + +// Schedules the locked m to run the locked gp. 
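wakep above is deliberately conservative: a single CAS on nmspinning ensures at most one new spinning M is started per wave of wakeups, so a burst of newly runnable goroutines cannot start a thundering herd of threads. The same throttle in miniature (illustrative names, not the runtime's types):

	package main

	import (
		"fmt"
		"sync"
		"sync/atomic"
	)

	var nmspinning int32 // workers currently hunting for work

	// wakeWorker mirrors wakep: start a spinner only if none exists.
	func wakeWorker(wg *sync.WaitGroup) {
		if !atomic.CompareAndSwapInt32(&nmspinning, 0, 1) {
			return // somebody is already spinning; it will find the work
		}
		wg.Add(1)
		go func() {
			defer wg.Done()
			// ...look for work; the runtime clears the spinning flag
			// before executing, allowing the next wakep to succeed.
			atomic.AddInt32(&nmspinning, -1)
			fmt.Println("spinner ran")
		}()
	}

	func main() {
		var wg sync.WaitGroup
		for i := 0; i < 100; i++ {
			wakeWorker(&wg) // many wakeups, at most one spinner at a time
		}
		wg.Wait()
	}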
+static void +startlockedm(G *gp) +{ + M *mp; + P *p; + + mp = gp->lockedm; + if(mp == m) + runtime·throw("startlockedm: locked to me"); + if(mp->nextp) + runtime·throw("startlockedm: m has p"); + // directly handoff current P to the locked m + inclocked(-1); + p = releasep(); + mp->nextp = p; + runtime·notewakeup(&mp->park); + stopm(); +} + +// Stops the current m for stoptheworld. +// Returns when the world is restarted. +static void +gcstopm(void) +{ + P *p; - return m; + if(!runtime·gcwaiting) + runtime·throw("gcstopm: not waiting for gc"); + if(m->spinning) { + m->spinning = false; + runtime·xadd(&runtime·sched.nmspinning, -1); + } + p = releasep(); + runtime·lock(&runtime·sched); + p->status = Pgcstop; + if(--runtime·sched.stopwait == 0) + runtime·notewakeup(&runtime·sched.stopnote); + runtime·unlock(&runtime·sched); + stopm(); } -// One round of scheduler: find a goroutine and run it. -// The argument is the goroutine that was running before -// schedule was called, or nil if this is the first call. +// Schedules gp to run on the current M. // Never returns. static void -schedule(G *gp) +execute(G *gp) { int32 hz; - uint32 v; - - schedlock(); - if(gp != nil) { - // Just finished running gp. - gp->m = nil; - runtime·sched.grunning--; - - // atomic { mcpu-- } - v = runtime·xadd(&runtime·sched.atomic, -1<<mcpuShift); - if(atomic_mcpu(v) > maxgomaxprocs) - runtime·throw("negative mcpu in scheduler"); - - switch(gp->status){ - case Grunnable: - case Gdead: - // Shouldn't have been running! - runtime·throw("bad gp->status in sched"); - case Grunning: - gp->status = Grunnable; - gput(gp); - break; - case Gmoribund: - gp->status = Gdead; - if(gp->lockedm) { - gp->lockedm = nil; - m->lockedg = nil; - } - gp->idlem = nil; - unwindstack(gp, nil); - gfput(gp); - if(--runtime·sched.gcount == 0) - runtime·exit(0); - break; - } - if(gp->readyonstop){ - gp->readyonstop = 0; - readylocked(gp); - } - } else if(m->helpgc) { - // Bootstrap m or new m started by starttheworld. - // atomic { mcpu-- } - v = runtime·xadd(&runtime·sched.atomic, -1<<mcpuShift); - if(atomic_mcpu(v) > maxgomaxprocs) - runtime·throw("negative mcpu in scheduler"); - // Compensate for increment in starttheworld(). - runtime·sched.grunning--; - m->helpgc = 0; - } else if(m->nextg != nil) { - // New m started by matchmg. - } else { - runtime·throw("invalid m state in scheduler"); - } - // Find (or wait for) g to run. Unlocks runtime·sched. - gp = nextgandunlock(); - gp->readyonstop = 0; + if(gp->status != Grunnable) { + runtime·printf("execute: bad g status %d\n", gp->status); + runtime·throw("execute: bad g status"); + } gp->status = Grunning; + m->p->tick++; m->curg = gp; gp->m = m; @@ -904,27 +964,209 @@ schedule(G *gp) if(m->profilehz != hz) runtime·resetcpuprofiler(hz); - if(gp->sched.pc == (byte*)runtime·goexit) { // kickoff - runtime·gogocall(&gp->sched, (void(*)(void))gp->entry); - } + if(gp->sched.pc == (byte*)runtime·goexit) // kickoff + runtime·gogocallfn(&gp->sched, gp->fnstart); runtime·gogo(&gp->sched, 0); } -// Enter scheduler. If g->status is Grunning, -// re-queues g and runs everyone else who is waiting -// before running g again. If g->status is Gmoribund, -// kills off g. -// Cannot split stack because it is called from exitsyscall. -// See comment below. -#pragma textflag 7 +// Finds a runnable goroutine to execute. +// Tries to steal from other P's and get g from global queue. 
+static G* +findrunnable(void) +{ + G *gp; + P *p; + int32 i; + +top: + if(runtime·gcwaiting) { + gcstopm(); + goto top; + } + // local runq + gp = runqget(m->p); + if(gp) + return gp; + // global runq + if(runtime·sched.runqsize) { + runtime·lock(&runtime·sched); + gp = globrunqget(m->p); + runtime·unlock(&runtime·sched); + if(gp) + return gp; + } + // If number of spinning M's >= number of busy P's, block. + // This is necessary to prevent excessive CPU consumption + // when GOMAXPROCS>>1 but the program parallelism is low. + if(!m->spinning && 2 * runtime·atomicload(&runtime·sched.nmspinning) >= runtime·gomaxprocs - runtime·atomicload(&runtime·sched.npidle)) // TODO: fast atomic + goto stop; + if(!m->spinning) { + m->spinning = true; + runtime·xadd(&runtime·sched.nmspinning, 1); + } + // random steal from other P's + for(i = 0; i < 2*runtime·gomaxprocs; i++) { + if(runtime·gcwaiting) + goto top; + p = runtime·allp[runtime·fastrand1()%runtime·gomaxprocs]; + if(p == m->p) + gp = runqget(p); + else + gp = runqsteal(m->p, p); + if(gp) + return gp; + } +stop: + // return P and block + runtime·lock(&runtime·sched); + if(runtime·gcwaiting) { + runtime·unlock(&runtime·sched); + goto top; + } + if(runtime·sched.runqsize) { + gp = globrunqget(m->p); + runtime·unlock(&runtime·sched); + return gp; + } + p = releasep(); + pidleput(p); + runtime·unlock(&runtime·sched); + if(m->spinning) { + m->spinning = false; + runtime·xadd(&runtime·sched.nmspinning, -1); + } + // check all runqueues once again + for(i = 0; i < runtime·gomaxprocs; i++) { + p = runtime·allp[i]; + if(p && p->runqhead != p->runqtail) { + runtime·lock(&runtime·sched); + p = pidleget(); + runtime·unlock(&runtime·sched); + if(p) { + acquirep(p); + goto top; + } + break; + } + } + stopm(); + goto top; +} + +// One round of scheduler: find a runnable goroutine and execute it. +// Never returns. +static void +schedule(void) +{ + G *gp; + + if(m->locks) + runtime·throw("schedule: holding locks"); + +top: + if(runtime·gcwaiting) { + gcstopm(); + goto top; + } + + gp = runqget(m->p); + if(gp == nil) + gp = findrunnable(); + + if(m->spinning) { + m->spinning = false; + runtime·xadd(&runtime·sched.nmspinning, -1); + } + + // M wakeup policy is deliberately somewhat conservative (see nmspinning handling), + // so see if we need to wakeup another M here. + if (m->p->runqhead != m->p->runqtail && + runtime·atomicload(&runtime·sched.nmspinning) == 0 && + runtime·atomicload(&runtime·sched.npidle) > 0) // TODO: fast atomic + wakep(); + + if(gp->lockedm) { + startlockedm(gp); + goto top; + } + + execute(gp); +} + +// Puts the current goroutine into a waiting state and unlocks the lock. +// The goroutine can be made runnable again by calling runtime·ready(gp). +void +runtime·park(void(*unlockf)(Lock*), Lock *lock, int8 *reason) +{ + m->waitlock = lock; + m->waitunlockf = unlockf; + g->waitreason = reason; + runtime·mcall(park0); +} + +// runtime·park continuation on g0. +static void +park0(G *gp) +{ + gp->status = Gwaiting; + gp->m = nil; + m->curg = nil; + if(m->waitunlockf) { + m->waitunlockf(m->waitlock); + m->waitunlockf = nil; + } + if(m->lockedg) { + stoplockedm(); + execute(gp); // Never returns. + } + schedule(); +} + +// Scheduler yield. void runtime·gosched(void) { - if(m->locks != 0) - runtime·throw("gosched holding locks"); - if(g == m->g0) - runtime·throw("gosched of g0"); - runtime·mcall(schedule); + runtime·mcall(gosched0); +} + +// runtime·gosched continuation on g0. 
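findrunnable above fixes the search order: the local run queue, then the global queue, then up to 2*GOMAXPROCS random steal attempts before the M gives up its P and parks. A single-threaded model of that order (the real code guards every queue with locks and atomics; all names are illustrative):

	package main

	import (
		"fmt"
		"math/rand"
	)

	type gg struct{ id int }

	type pq struct{ runq []*gg }

	var (
		allp    []*pq
		globalq []*gg
	)

	// findRunnable models the hunt in findrunnable:
	// 1. own run queue, 2. global queue, 3. random victims.
	func findRunnable(self *pq) *gg {
		if len(self.runq) > 0 {
			gp := self.runq[0]
			self.runq = self.runq[1:]
			return gp
		}
		if len(globalq) > 0 {
			gp := globalq[0]
			globalq = globalq[1:]
			return gp
		}
		for i := 0; i < 2*len(allp); i++ {
			victim := allp[rand.Intn(len(allp))]
			if victim == self || len(victim.runq) == 0 {
				continue
			}
			// steal roughly half, as runqsteal does
			half := (len(victim.runq) + 1) / 2
			stolen := victim.runq[:half]
			victim.runq = victim.runq[half:]
			gp := stolen[0]
			self.runq = append(self.runq, stolen[1:]...)
			return gp
		}
		return nil // the real code parks the M (stopm) here
	}

	func main() {
		p0 := &pq{}
		p1 := &pq{runq: []*gg{{1}, {2}, {3}, {4}}}
		allp = []*pq{p0, p1}
		if gp := findRunnable(p0); gp != nil {
			fmt.Println("stole g", gp.id, "leaving", len(p1.runq), "behind")
		} else {
			fmt.Println("would park the M")
		}
	}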
+static void +gosched0(G *gp) +{ + gp->status = Grunnable; + gp->m = nil; + m->curg = nil; + runtime·lock(&runtime·sched); + globrunqput(gp); + runtime·unlock(&runtime·sched); + if(m->lockedg) { + stoplockedm(); + execute(gp); // Never returns. + } + schedule(); +} + +// Finishes execution of the current goroutine. +void +runtime·goexit(void) +{ + if(raceenabled) + runtime·racegoend(); + runtime·mcall(goexit0); +} + +// runtime·goexit continuation on g0. +static void +goexit0(G *gp) +{ + gp->status = Gdead; + gp->m = nil; + gp->lockedm = nil; + m->curg = nil; + m->lockedg = nil; + runtime·unwindstack(gp, nil); + gfput(m->p, gp); + schedule(); } // The goroutine g is about to enter a system call. @@ -935,21 +1177,19 @@ runtime·gosched(void) // Entersyscall cannot split the stack: the runtime·gosave must // make g->sched refer to the caller's stack segment, because // entersyscall is going to return immediately after. -// It's okay to call matchmg and notewakeup even after -// decrementing mcpu, because we haven't released the -// sched lock yet, so the garbage collector cannot be running. #pragma textflag 7 void -runtime·entersyscall(void) +·entersyscall(int32 dummy) { - uint32 v; - if(m->profilehz > 0) runtime·setprof(false); // Leave SP around for gc and traceback. - runtime·gosave(&g->sched); + g->sched.sp = (uintptr)runtime·getcallersp(&dummy); + g->sched.pc = runtime·getcallerpc(&dummy); + g->sched.g = g; g->gcsp = g->sched.sp; + g->gcpc = g->sched.pc; g->gcstack = g->stackbase; g->gcguard = g->stackguard; g->status = Gsyscall; @@ -959,34 +1199,61 @@ runtime·entersyscall(void) runtime·throw("entersyscall"); } - // Fast path. - // The slow path inside the schedlock/schedunlock will get - // through without stopping if it does: - // mcpu-- - // gwait not true - // waitstop && mcpu <= mcpumax not true - // If we can do the same with a single atomic add, - // then we can skip the locks. - v = runtime·xadd(&runtime·sched.atomic, -1<<mcpuShift); - if(!atomic_gwaiting(v) && (!atomic_waitstop(v) || atomic_mcpu(v) > atomic_mcpumax(v))) - return; - - schedlock(); - v = runtime·atomicload(&runtime·sched.atomic); - if(atomic_gwaiting(v)) { - matchmg(); - v = runtime·atomicload(&runtime·sched.atomic); + if(runtime·atomicload(&runtime·sched.sysmonwait)) { // TODO: fast atomic + runtime·lock(&runtime·sched); + if(runtime·atomicload(&runtime·sched.sysmonwait)) { + runtime·atomicstore(&runtime·sched.sysmonwait, 0); + runtime·notewakeup(&runtime·sched.sysmonnote); + } + runtime·unlock(&runtime·sched); + runtime·gosave(&g->sched); // re-save for traceback } - if(atomic_waitstop(v) && atomic_mcpu(v) <= atomic_mcpumax(v)) { - runtime·xadd(&runtime·sched.atomic, -1<<waitstopShift); - runtime·notewakeup(&runtime·sched.stopped); + + m->mcache = nil; + m->p->tick++; + m->p->m = nil; + runtime·atomicstore(&m->p->status, Psyscall); + if(runtime·gcwaiting) { + runtime·lock(&runtime·sched); + if (runtime·sched.stopwait > 0 && runtime·cas(&m->p->status, Psyscall, Pgcstop)) { + if(--runtime·sched.stopwait == 0) + runtime·notewakeup(&runtime·sched.stopnote); + } + runtime·unlock(&runtime·sched); + runtime·gosave(&g->sched); // re-save for traceback } +} - // Re-save sched in case one of the calls - // (notewakeup, matchmg) triggered something using it. - runtime·gosave(&g->sched); +// The same as runtime·entersyscall(), but with a hint that the syscall is blocking. 
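gosched0 above parks the yielding goroutine on the back of the global run queue and reenters schedule, which is what lets a polling loop make progress even with a single P; testYieldProgress later in this patch exercises exactly this shape:

	package main

	import (
		"fmt"
		"runtime"
	)

	func main() {
		c := make(chan bool)
		go func() { c <- true }()
		for {
			select {
			case <-c:
				fmt.Println("progress")
				return
			default:
				runtime.Gosched() // to the back of the global queue
			}
		}
	}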
+#pragma textflag 7 +void +·entersyscallblock(int32 dummy) +{ + P *p; - schedunlock(); + if(m->profilehz > 0) + runtime·setprof(false); + + // Leave SP around for gc and traceback. + g->sched.sp = (uintptr)runtime·getcallersp(&dummy); + g->sched.pc = runtime·getcallerpc(&dummy); + g->sched.g = g; + g->gcsp = g->sched.sp; + g->gcpc = g->sched.pc; + g->gcstack = g->stackbase; + g->gcguard = g->stackguard; + g->status = Gsyscall; + if(g->gcsp < g->gcguard-StackGuard || g->gcstack < g->gcsp) { + // runtime·printf("entersyscallblock inconsistent %p [%p,%p]\n", + // g->gcsp, g->gcguard-StackGuard, g->gcstack); + runtime·throw("entersyscallblock"); + } + + p = releasep(); + handoffp(p); + if(g == scvg) // do not consider blocked scavenger for deadlock detection + inclocked(1); + runtime·gosave(&g->sched); // re-save for traceback } // The goroutine g exited its system call. @@ -996,177 +1263,81 @@ runtime·entersyscall(void) void runtime·exitsyscall(void) { - uint32 v; + P *p; - // Fast path. - // If we can do the mcpu++ bookkeeping and - // find that we still have mcpu <= mcpumax, then we can - // start executing Go code immediately, without having to - // schedlock/schedunlock. - v = runtime·xadd(&runtime·sched.atomic, (1<<mcpuShift)); - if(m->profilehz == runtime·sched.profilehz && atomic_mcpu(v) <= atomic_mcpumax(v)) { + // Check whether the profiler needs to be turned on. + if(m->profilehz > 0) + runtime·setprof(true); + + // Try to re-acquire the last P. + if(m->p && m->p->status == Psyscall && runtime·cas(&m->p->status, Psyscall, Prunning)) { // There's a cpu for us, so we can run. + m->mcache = m->p->mcache; + m->p->m = m; + m->p->tick++; g->status = Grunning; // Garbage collector isn't running (since we are), - // so okay to clear gcstack. - g->gcstack = nil; - - if(m->profilehz > 0) - runtime·setprof(true); + // so okay to clear gcstack and gcsp. + g->gcstack = (uintptr)nil; + g->gcsp = (uintptr)nil; return; } - // Tell scheduler to put g back on the run queue: - // mostly equivalent to g->status = Grunning, - // but keeps the garbage collector from thinking - // that g is running right now, which it's not. - g->readyonstop = 1; + if(g == scvg) // do not consider blocked scavenger for deadlock detection + inclocked(-1); + // Try to get any other idle P. + m->p = nil; + if(runtime·sched.pidle) { + runtime·lock(&runtime·sched); + p = pidleget(); + runtime·unlock(&runtime·sched); + if(p) { + acquirep(p); + g->gcstack = (uintptr)nil; + g->gcsp = (uintptr)nil; + return; + } + } - // All the cpus are taken. - // The scheduler will ready g and put this m to sleep. - // When the scheduler takes g away from m, - // it will undo the runtime·sched.mcpu++ above. - runtime·gosched(); + // Call the scheduler. + runtime·mcall(exitsyscall0); - // Gosched returned, so we're allowed to run now. + // Scheduler returned, so we're allowed to run now. // Delete the gcstack information that we left for // the garbage collector during the system call. // Must wait until now because until gosched returns // we don't know for sure that the garbage collector // is not running. - g->gcstack = nil; + g->gcstack = (uintptr)nil; + g->gcsp = (uintptr)nil; } -// Called from runtime·lessstack when returning from a function which -// allocated a new stack segment. The function's return value is in -// m->cret. -void -runtime·oldstack(void) +// runtime·exitsyscall slow path on g0. +// Failed to acquire P, enqueue gp as runnable. 
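entersyscallblock above releases its P immediately (releasep plus handoffp) because the caller has hinted that the syscall will block, whereas plain entersyscall leaves the P in Psyscall for sysmon to retake later. The handoff can be modeled as a token semaphore, with one token per P (all names invented for the sketch):

	package main

	import (
		"fmt"
		"time"
	)

	// One token per P; a worker holds a token while running Go code.
	var ptokens = make(chan struct{}, 2)

	func init() {
		for i := 0; i < cap(ptokens); i++ {
			ptokens <- struct{}{}
		}
	}

	// blockingSyscall models entersyscallblock/exitsyscall: hand the P
	// back before blocking, reacquire one (maybe a different one) after.
	func blockingSyscall(d time.Duration) {
		ptokens <- struct{}{} // handoffp: another M can run Go code now
		time.Sleep(d)         // the blocking call itself
		<-ptokens             // exitsyscall: wait for a free P
	}

	func main() {
		<-ptokens // acquire a P before running "Go code"
		blockingSyscall(time.Millisecond)
		fmt.Println("back on a P")
		ptokens <- struct{}{}
	}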
+static void +exitsyscall0(G *gp) { - Stktop *top, old; - uint32 argsize; - uintptr cret; - byte *sp; - G *g1; - int32 goid; - -//printf("oldstack m->cret=%p\n", m->cret); - - g1 = m->curg; - top = (Stktop*)g1->stackbase; - sp = (byte*)top; - old = *top; - argsize = old.argsize; - if(argsize > 0) { - sp -= argsize; - runtime·memmove(top->argp, sp, argsize); - } - goid = old.gobuf.g->goid; // fault if g is bad, before gogo - USED(goid); - - if(old.free != 0) - runtime·stackfree(g1->stackguard - StackGuard, old.free); - g1->stackbase = old.stackbase; - g1->stackguard = old.stackguard; - - cret = m->cret; - m->cret = 0; // drop reference - runtime·gogo(&old.gobuf, cret); -} - -// Called from reflect·call or from runtime·morestack when a new -// stack segment is needed. Allocate a new stack big enough for -// m->moreframesize bytes, copy m->moreargsize bytes to the new frame, -// and then act as though runtime·lessstack called the function at -// m->morepc. -void -runtime·newstack(void) -{ - int32 framesize, argsize; - Stktop *top; - byte *stk, *sp; - G *g1; - Gobuf label; - bool reflectcall; - uintptr free; - - framesize = m->moreframesize; - argsize = m->moreargsize; - g1 = m->curg; - - if(m->morebuf.sp < g1->stackguard - StackGuard) { - runtime·printf("runtime: split stack overflow: %p < %p\n", m->morebuf.sp, g1->stackguard - StackGuard); - runtime·throw("runtime: split stack overflow"); - } - if(argsize % sizeof(uintptr) != 0) { - runtime·printf("runtime: stack split with misaligned argsize %d\n", argsize); - runtime·throw("runtime: stack split argsize"); - } - - reflectcall = framesize==1; - if(reflectcall) - framesize = 0; - - if(reflectcall && m->morebuf.sp - sizeof(Stktop) - argsize - 32 > g1->stackguard) { - // special case: called from reflect.call (framesize==1) - // to call code with an arbitrary argument size, - // and we have enough space on the current stack. - // the new Stktop* is necessary to unwind, but - // we don't need to create a new segment. - top = (Stktop*)(m->morebuf.sp - sizeof(*top)); - stk = g1->stackguard - StackGuard; - free = 0; - } else { - // allocate new segment. - framesize += argsize; - framesize += StackExtra; // room for more functions, Stktop. - if(framesize < StackMin) - framesize = StackMin; - framesize += StackSystem; - stk = runtime·stackalloc(framesize); - top = (Stktop*)(stk+framesize-sizeof(*top)); - free = framesize; - } - -//runtime·printf("newstack framesize=%d argsize=%d morepc=%p moreargp=%p gobuf=%p, %p top=%p old=%p\n", -//framesize, argsize, m->morepc, m->moreargp, m->morebuf.pc, m->morebuf.sp, top, g1->stackbase); - - top->stackbase = g1->stackbase; - top->stackguard = g1->stackguard; - top->gobuf = m->morebuf; - top->argp = m->moreargp; - top->argsize = argsize; - top->free = free; - m->moreargp = nil; - m->morebuf.pc = nil; - m->morebuf.sp = nil; - - // copy flag from panic - top->panic = g1->ispanic; - g1->ispanic = false; - - g1->stackbase = (byte*)top; - g1->stackguard = stk + StackGuard; - - sp = (byte*)top; - if(argsize > 0) { - sp -= argsize; - runtime·memmove(sp, top->argp, argsize); + P *p; + + gp->status = Grunnable; + gp->m = nil; + m->curg = nil; + runtime·lock(&runtime·sched); + p = pidleget(); + if(p == nil) + globrunqput(gp); + runtime·unlock(&runtime·sched); + if(p) { + acquirep(p); + execute(gp); // Never returns. } - if(thechar == '5') { - // caller would have saved its LR below args. - sp -= sizeof(void*); - *(void**)sp = nil; + if(m->lockedg) { + // Wait until another thread schedules gp and so m again. 
+ stoplockedm(); + execute(gp); // Never returns. } - - // Continue as if lessstack had just called m->morepc - // (the PC that decided to grow the stack). - label.sp = sp; - label.pc = (byte*)runtime·lessstack; - label.g = m->curg; - runtime·gogocall(&label, m->morepc); - - *(int32*)345 = 123; // never return + stopm(); + schedule(); // Never returns. } // Hook used by runtime·malg to call runtime·stackalloc on the @@ -1204,10 +1375,10 @@ runtime·malg(int32 stacksize) stk = g->param; g->param = nil; } - newg->stack0 = stk; - newg->stackguard = stk + StackGuard; - newg->stackbase = stk + StackSystem + stacksize - sizeof(Stktop); - runtime·memclr(newg->stackbase, sizeof(Stktop)); + newg->stack0 = (uintptr)stk; + newg->stackguard = (uintptr)stk + StackGuard; + newg->stackbase = (uintptr)stk + StackSystem + stacksize - sizeof(Stktop); + runtime·memclr((byte*)newg->stackbase, sizeof(Stktop)); } return newg; } @@ -1221,7 +1392,7 @@ runtime·malg(int32 stacksize) // functions that split the stack. #pragma textflag 7 void -runtime·newproc(int32 siz, byte* fn, ...) +runtime·newproc(int32 siz, FuncVal* fn, ...) { byte *argp; @@ -1237,7 +1408,7 @@ runtime·newproc(int32 siz, byte* fn, ...) // address of the go statement that created this. The new g is put // on the queue of g's waiting to run. G* -runtime·newproc1(byte *fn, byte *argp, int32 narg, int32 nret, void *callerpc) +runtime·newproc1(FuncVal *fn, byte *argp, int32 narg, int32 nret, void *callerpc) { byte *sp; G *newg; @@ -1254,23 +1425,21 @@ runtime·newproc1(byte *fn, byte *argp, int32 narg, int32 nret, void *callerpc) if(siz > StackMin - 1024) runtime·throw("runtime.newproc: function arguments too large for new goroutine"); - schedlock(); - - if((newg = gfget()) != nil){ + if((newg = gfget(m->p)) != nil) { if(newg->stackguard - StackGuard != newg->stack0) runtime·throw("invalid stack in newg"); } else { newg = runtime·malg(StackMin); + runtime·lock(&runtime·sched); if(runtime·lastg == nil) runtime·allg = newg; else runtime·lastg->alllink = newg; runtime·lastg = newg; + runtime·unlock(&runtime·sched); } - newg->status = Gwaiting; - newg->waitreason = "new goroutine"; - sp = newg->stackbase; + sp = (byte*)newg->stackbase; sp -= siz; runtime·memmove(sp, argp, narg); if(thechar == '5') { @@ -1279,318 +1448,88 @@ runtime·newproc1(byte *fn, byte *argp, int32 narg, int32 nret, void *callerpc) *(void**)sp = nil; } - newg->sched.sp = sp; + newg->sched.sp = (uintptr)sp; newg->sched.pc = (byte*)runtime·goexit; newg->sched.g = newg; - newg->entry = fn; + newg->fnstart = fn; newg->gopc = (uintptr)callerpc; - - runtime·sched.gcount++; - runtime·sched.goidgen++; - newg->goid = runtime·sched.goidgen; - - newprocreadylocked(newg); - schedunlock(); - + newg->status = Grunnable; + newg->goid = runtime·xadd64(&runtime·sched.goidgen, 1); + if(raceenabled) + newg->racectx = runtime·racegostart(callerpc); + runqput(m->p, newg); + + if(runtime·atomicload(&runtime·sched.npidle) != 0 && runtime·atomicload(&runtime·sched.nmspinning) == 0 && fn->fn != runtime·main) // TODO: fast atomic + wakep(); return newg; -//printf(" goid=%d\n", newg->goid); } -// Create a new deferred function fn with siz bytes of arguments. -// The compiler turns a defer statement into a call to this. -// Cannot split the stack because it assumes that the arguments -// are available sequentially after &fn; they would not be -// copied if a stack split occurred. It's OK for this to call -// functions that split the stack. -#pragma textflag 7 -uintptr -runtime·deferproc(int32 siz, byte* fn, ...) 
-{ - Defer *d; - - d = runtime·malloc(sizeof(*d) + siz - sizeof(d->args)); - d->fn = fn; - d->siz = siz; - d->pc = runtime·getcallerpc(&siz); - if(thechar == '5') - d->argp = (byte*)(&fn+2); // skip caller's saved link register - else - d->argp = (byte*)(&fn+1); - runtime·memmove(d->args, d->argp, d->siz); - - d->link = g->defer; - g->defer = d; - - // deferproc returns 0 normally. - // a deferred func that stops a panic - // makes the deferproc return 1. - // the code the compiler generates always - // checks the return value and jumps to the - // end of the function if deferproc returns != 0. - return 0; -} - -// Run a deferred function if there is one. -// The compiler inserts a call to this at the end of any -// function which calls defer. -// If there is a deferred function, this will call runtime·jmpdefer, -// which will jump to the deferred function such that it appears -// to have been called by the caller of deferreturn at the point -// just before deferreturn was called. The effect is that deferreturn -// is called again and again until there are no more deferred functions. -// Cannot split the stack because we reuse the caller's frame to -// call the deferred function. -#pragma textflag 7 -void -runtime·deferreturn(uintptr arg0) -{ - Defer *d; - byte *argp, *fn; - - d = g->defer; - if(d == nil) - return; - argp = (byte*)&arg0; - if(d->argp != argp) - return; - runtime·memmove(argp, d->args, d->siz); - g->defer = d->link; - fn = d->fn; - if(!d->nofree) - runtime·free(d); - runtime·jmpdefer(fn, argp); -} - -// Run all deferred functions for the current goroutine. +// Put on gfree list. +// If local list is too long, transfer a batch to the global list. static void -rundefer(void) +gfput(P *p, G *gp) { - Defer *d; - - while((d = g->defer) != nil) { - g->defer = d->link; - reflect·call(d->fn, d->args, d->siz); - if(!d->nofree) - runtime·free(d); + if(gp->stackguard - StackGuard != gp->stack0) + runtime·throw("invalid stack in gfput"); + gp->schedlink = p->gfree; + p->gfree = gp; + p->gfreecnt++; + if(p->gfreecnt >= 64) { + runtime·lock(&runtime·sched.gflock); + while(p->gfreecnt >= 32) { + p->gfreecnt--; + gp = p->gfree; + p->gfree = gp->schedlink; + gp->schedlink = runtime·sched.gfree; + runtime·sched.gfree = gp; + } + runtime·unlock(&runtime·sched.gflock); } } -// Free stack frames until we hit the last one -// or until we find the one that contains the argp. -static void -unwindstack(G *gp, byte *sp) +// Get from gfree list. +// If local list is empty, grab a batch from global list. +static G* +gfget(P *p) { - Stktop *top; - byte *stk; - - // Must be called from a different goroutine, usually m->g0. 
- if(g == gp) - runtime·throw("unwindstack on self"); + G *gp; - while((top = (Stktop*)gp->stackbase) != nil && top->stackbase != nil) { - stk = gp->stackguard - StackGuard; - if(stk <= sp && sp < gp->stackbase) - break; - gp->stackbase = top->stackbase; - gp->stackguard = top->stackguard; - if(top->free != 0) - runtime·stackfree(stk, top->free); +retry: + gp = p->gfree; + if(gp == nil && runtime·sched.gfree) { + runtime·lock(&runtime·sched.gflock); + while(p->gfreecnt < 32 && runtime·sched.gfree) { + p->gfreecnt++; + gp = runtime·sched.gfree; + runtime·sched.gfree = gp->schedlink; + gp->schedlink = p->gfree; + p->gfree = gp; + } + runtime·unlock(&runtime·sched.gflock); + goto retry; } - - if(sp != nil && (sp < gp->stackguard - StackGuard || gp->stackbase < sp)) { - runtime·printf("recover: %p not in [%p, %p]\n", sp, gp->stackguard - StackGuard, gp->stackbase); - runtime·throw("bad unwindstack"); + if(gp) { + p->gfree = gp->schedlink; + p->gfreecnt--; } + return gp; } -// Print all currently active panics. Used when crashing. +// Purge all cached G's from gfree list to the global list. static void -printpanics(Panic *p) -{ - if(p->link) { - printpanics(p->link); - runtime·printf("\t"); - } - runtime·printf("panic: "); - runtime·printany(p->arg); - if(p->recovered) - runtime·printf(" [recovered]"); - runtime·printf("\n"); -} - -static void recovery(G*); - -// The implementation of the predeclared function panic. -void -runtime·panic(Eface e) +gfpurge(P *p) { - Defer *d; - Panic *p; - - p = runtime·mal(sizeof *p); - p->arg = e; - p->link = g->panic; - p->stackbase = g->stackbase; - g->panic = p; + G *gp; - for(;;) { - d = g->defer; - if(d == nil) - break; - // take defer off list in case of recursive panic - g->defer = d->link; - g->ispanic = true; // rock for newstack, where reflect.call ends up - reflect·call(d->fn, d->args, d->siz); - if(p->recovered) { - g->panic = p->link; - if(g->panic == nil) // must be done with signal - g->sig = 0; - runtime·free(p); - // put recovering defer back on list - // for scheduler to find. - d->link = g->defer; - g->defer = d; - runtime·mcall(recovery); - runtime·throw("recovery failed"); // mcall should not return - } - if(!d->nofree) - runtime·free(d); + runtime·lock(&runtime·sched.gflock); + while(p->gfreecnt) { + p->gfreecnt--; + gp = p->gfree; + p->gfree = gp->schedlink; + gp->schedlink = runtime·sched.gfree; + runtime·sched.gfree = gp; } - - // ran out of deferred calls - old-school panic now - runtime·startpanic(); - printpanics(g->panic); - runtime·dopanic(0); -} - -// Unwind the stack after a deferred function calls recover -// after a panic. Then arrange to continue running as though -// the caller of the deferred function returned normally. -static void -recovery(G *gp) -{ - Defer *d; - - // Rewind gp's stack; we're running on m->g0's stack. - d = gp->defer; - gp->defer = d->link; - - // Unwind to the stack frame with d's arguments in it. - unwindstack(gp, d->argp); - - // Make the deferproc for this d return again, - // this time returning 1. The calling function will - // jump to the standard return epilogue. - // The -2*sizeof(uintptr) makes up for the - // two extra words that are on the stack at - // each call to deferproc. - // (The pc we're returning to does pop pop - // before it tests the return value.) - // On the arm there are 2 saved LRs mixed in too. 
- if(thechar == '5') - gp->sched.sp = (byte*)d->argp - 4*sizeof(uintptr); - else - gp->sched.sp = (byte*)d->argp - 2*sizeof(uintptr); - gp->sched.pc = d->pc; - if(!d->nofree) - runtime·free(d); - runtime·gogo(&gp->sched, 1); -} - -// The implementation of the predeclared function recover. -// Cannot split the stack because it needs to reliably -// find the stack segment of its caller. -#pragma textflag 7 -void -runtime·recover(byte *argp, Eface ret) -{ - Stktop *top, *oldtop; - Panic *p; - - // Must be a panic going on. - if((p = g->panic) == nil || p->recovered) - goto nomatch; - - // Frame must be at the top of the stack segment, - // because each deferred call starts a new stack - // segment as a side effect of using reflect.call. - // (There has to be some way to remember the - // variable argument frame size, and the segment - // code already takes care of that for us, so we - // reuse it.) - // - // As usual closures complicate things: the fp that - // the closure implementation function claims to have - // is where the explicit arguments start, after the - // implicit pointer arguments and PC slot. - // If we're on the first new segment for a closure, - // then fp == top - top->args is correct, but if - // the closure has its own big argument frame and - // allocated a second segment (see below), - // the fp is slightly above top - top->args. - // That condition can't happen normally though - // (stack pointers go down, not up), so we can accept - // any fp between top and top - top->args as - // indicating the top of the segment. - top = (Stktop*)g->stackbase; - if(argp < (byte*)top - top->argsize || (byte*)top < argp) - goto nomatch; - - // The deferred call makes a new segment big enough - // for the argument frame but not necessarily big - // enough for the function's local frame (size unknown - // at the time of the call), so the function might have - // made its own segment immediately. If that's the - // case, back top up to the older one, the one that - // reflect.call would have made for the panic. - // - // The fp comparison here checks that the argument - // frame that was copied during the split (the top->args - // bytes above top->fp) abuts the old top of stack. - // This is a correct test for both closure and non-closure code. - oldtop = (Stktop*)top->stackbase; - if(oldtop != nil && top->argp == (byte*)oldtop - top->argsize) - top = oldtop; - - // Now we have the segment that was created to - // run this call. It must have been marked as a panic segment. - if(!top->panic) - goto nomatch; - - // Okay, this is the top frame of a deferred call - // in response to a panic. It can see the panic argument. - p->recovered = 1; - ret = p->arg; - FLUSH(&ret); - return; - -nomatch: - ret.type = nil; - ret.data = nil; - FLUSH(&ret); -} - - -// Put on gfree list. Sched must be locked. -static void -gfput(G *g) -{ - if(g->stackguard - StackGuard != g->stack0) - runtime·throw("invalid stack in gfput"); - g->schedlink = runtime·sched.gfree; - runtime·sched.gfree = g; -} - -// Get from gfree list. Sched must be locked. -static G* -gfget(void) -{ - G *g; - - g = runtime·sched.gfree; - if(g) - runtime·sched.gfree = g->schedlink; - return g; + runtime·unlock(&runtime·sched.gflock); } void @@ -1600,80 +1539,85 @@ runtime·Breakpoint(void) } void -runtime·Goexit(void) -{ - rundefer(); - runtime·goexit(); -} - -void runtime·Gosched(void) { runtime·gosched(); } // Implementation of runtime.GOMAXPROCS. 
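The deferproc/recover machinery removed from proc.c above implements the language-level contract: a deferred call that recovers makes its deferproc return 1, steering the panicking frame to its normal return epilogue. What that contract guarantees to user code:

	package main

	import "fmt"

	// safe runs f, converting a panic into an error via the
	// deferred-recover path described above.
	func safe(f func()) (err error) {
		defer func() {
			if r := recover(); r != nil {
				err = fmt.Errorf("recovered: %v", r)
			}
		}()
		f()
		return nil
	}

	func main() {
		fmt.Println(safe(func() { panic("boom") })) // recovered: boom
	}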
-// delete when scheduler is stronger +// delete when scheduler is even stronger int32 runtime·gomaxprocsfunc(int32 n) { int32 ret; - uint32 v; - schedlock(); + if(n > MaxGomaxprocs) + n = MaxGomaxprocs; + runtime·lock(&runtime·sched); ret = runtime·gomaxprocs; - if(n <= 0) - n = ret; - if(n > maxgomaxprocs) - n = maxgomaxprocs; - runtime·gomaxprocs = n; - if(runtime·gomaxprocs > 1) - runtime·singleproc = false; - if(runtime·gcwaiting != 0) { - if(atomic_mcpumax(runtime·sched.atomic) != 1) - runtime·throw("invalid mcpumax during gc"); - schedunlock(); + if(n <= 0 || n == ret) { + runtime·unlock(&runtime·sched); return ret; } + runtime·unlock(&runtime·sched); - setmcpumax(n); + runtime·semacquire(&runtime·worldsema); + m->gcing = 1; + runtime·stoptheworld(); + newprocs = n; + m->gcing = 0; + runtime·semrelease(&runtime·worldsema); + runtime·starttheworld(); - // If there are now fewer allowed procs - // than procs running, stop. - v = runtime·atomicload(&runtime·sched.atomic); - if(atomic_mcpu(v) > n) { - schedunlock(); - runtime·gosched(); - return ret; - } - // handle more procs - matchmg(); - schedunlock(); return ret; } -void -runtime·LockOSThread(void) +static void +LockOSThread(void) { - if(m == &runtime·m0 && runtime·sched.init) { - runtime·sched.lockmain = true; - return; - } m->lockedg = g; g->lockedm = m; } void -runtime·UnlockOSThread(void) +runtime·LockOSThread(void) { - if(m == &runtime·m0 && runtime·sched.init) { - runtime·sched.lockmain = false; + m->locked |= LockExternal; + LockOSThread(); +} + +void +runtime·lockOSThread(void) +{ + m->locked += LockInternal; + LockOSThread(); +} + +static void +UnlockOSThread(void) +{ + if(m->locked != 0) return; - } m->lockedg = nil; g->lockedm = nil; } +void +runtime·UnlockOSThread(void) +{ + m->locked &= ~LockExternal; + UnlockOSThread(); +} + +void +runtime·unlockOSThread(void) +{ + if(m->locked < LockInternal) + runtime·throw("runtime: internal error: misuse of lockOSThread/unlockOSThread"); + m->locked -= LockInternal; + UnlockOSThread(); +} + bool runtime·lockedOSThread(void) { @@ -1697,16 +1641,31 @@ runtime·mid(uint32 ret) } void -runtime·NumGoroutine(int32 ret) +runtime·NumGoroutine(intgo ret) { - ret = runtime·sched.gcount; + ret = runtime·gcount(); FLUSH(&ret); } int32 runtime·gcount(void) { - return runtime·sched.gcount; + G *gp; + int32 n, s; + + n = 0; + runtime·lock(&runtime·sched); + // TODO(dvyukov): runtime.NumGoroutine() is O(N). + // We do not want to increment/decrement centralized counter in newproc/goexit, + // just to make runtime.NumGoroutine() faster. + // Compromise solution is to introduce per-P counters of active goroutines. + for(gp = runtime·allg; gp; gp = gp->alllink) { + s = gp->status; + if(s == Grunnable || s == Grunning || s == Gsyscall || s == Gwaiting) + n++; + } + runtime·unlock(&runtime·sched); + return n; } int32 @@ -1740,6 +1699,9 @@ runtime·sigprof(uint8 *pc, uint8 *sp, uint8 *lr, G *gp) { int32 n; + // Windows does profiling in a dedicated thread w/o m. + if(!Windows && (m == nil || m->mcache == nil)) + return; if(prof.fn == nil || prof.hz == 0) return; @@ -1783,27 +1745,533 @@ runtime·setcpuprofilerate(void (*fn)(uintptr*, int32), int32 hz) runtime·resetcpuprofiler(hz); } -void (*libcgo_setenv)(byte**); +// Change number of processors. The world is stopped, sched is locked. 
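gomaxprocsfunc above now answers n <= 0 (and n equal to the current value) without any resize, while a real change stops the world and lets procresize below rebuild the P array. From user code, that makes the query form free and the set form expensive:

	package main

	import (
		"fmt"
		"runtime"
	)

	func main() {
		cur := runtime.GOMAXPROCS(0) // n <= 0: pure query, no stop-the-world
		fmt.Println("current:", cur)

		old := runtime.GOMAXPROCS(2)  // stops the world, procresize(2), restarts
		defer runtime.GOMAXPROCS(old) // restore on the way out (another resize)
		// ...section that wants exactly 2 P's...
	}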
+static void +procresize(int32 new) +{ + int32 i, old; + G *gp; + P *p; + + old = runtime·gomaxprocs; + if(old < 0 || old > MaxGomaxprocs || new <= 0 || new >MaxGomaxprocs) + runtime·throw("procresize: invalid arg"); + // initialize new P's + for(i = 0; i < new; i++) { + p = runtime·allp[i]; + if(p == nil) { + p = (P*)runtime·mallocgc(sizeof(*p), 0, 0, 1); + p->status = Pgcstop; + runtime·atomicstorep(&runtime·allp[i], p); + } + if(p->mcache == nil) { + if(old==0 && i==0) + p->mcache = m->mcache; // bootstrap + else + p->mcache = runtime·allocmcache(); + } + if(p->runq == nil) { + p->runqsize = 128; + p->runq = (G**)runtime·mallocgc(p->runqsize*sizeof(G*), 0, 0, 1); + } + } + + // redistribute runnable G's evenly + for(i = 0; i < old; i++) { + p = runtime·allp[i]; + while(gp = runqget(p)) + globrunqput(gp); + } + // start at 1 because current M already executes some G and will acquire allp[0] below, + // so if we have a spare G we want to put it into allp[1]. + for(i = 1; runtime·sched.runqhead; i++) { + gp = runtime·sched.runqhead; + runtime·sched.runqhead = gp->schedlink; + runqput(runtime·allp[i%new], gp); + } + runtime·sched.runqtail = nil; + runtime·sched.runqsize = 0; + + // free unused P's + for(i = new; i < old; i++) { + p = runtime·allp[i]; + runtime·freemcache(p->mcache); + p->mcache = nil; + gfpurge(p); + p->status = Pdead; + // can't free P itself because it can be referenced by an M in syscall + } + + if(m->p) + m->p->m = nil; + m->p = nil; + m->mcache = nil; + p = runtime·allp[0]; + p->m = nil; + p->status = Pidle; + acquirep(p); + for(i = new-1; i > 0; i--) { + p = runtime·allp[i]; + p->status = Pidle; + pidleput(p); + } + runtime·singleproc = new == 1; + runtime·atomicstore((uint32*)&runtime·gomaxprocs, new); +} + +// Associate p and the current m. +static void +acquirep(P *p) +{ + if(m->p || m->mcache) + runtime·throw("acquirep: already in go"); + if(p->m || p->status != Pidle) { + runtime·printf("acquirep: p->m=%p(%d) p->status=%d\n", p->m, p->m ? p->m->id : 0, p->status); + runtime·throw("acquirep: invalid p state"); + } + m->mcache = p->mcache; + m->p = p; + p->m = m; + p->status = Prunning; +} -// Update the C environment if cgo is loaded. -// Called from syscall.Setenv. -void -syscall·setenv_c(String k, String v) +// Disassociate p and the current m. +static P* +releasep(void) { - byte *arg[2]; + P *p; + + if(m->p == nil || m->mcache == nil) + runtime·throw("releasep: invalid arg"); + p = m->p; + if(p->m != m || p->mcache != m->mcache || p->status != Prunning) { + runtime·printf("releasep: m=%p m->p=%p p->m=%p m->mcache=%p p->mcache=%p p->status=%d\n", + m, m->p, p->m, m->mcache, p->mcache, p->status); + runtime·throw("releasep: invalid p state"); + } + m->p = nil; + m->mcache = nil; + p->m = nil; + p->status = Pidle; + return p; +} - if(libcgo_setenv == nil) +static void +inclocked(int32 v) +{ + runtime·lock(&runtime·sched); + runtime·sched.mlocked += v; + if(v > 0) + checkdead(); + runtime·unlock(&runtime·sched); +} + +// Check for deadlock situation. +// The check is based on number of running M's, if 0 -> deadlock. 
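checkdead below counts runnable and running work (excluding sysmon, locked M's, and the scavenger) and throws when nothing can ever run again. The smallest program that trips it:

	// Every goroutine is blocked, so checkdead throws:
	//
	//	fatal error: all goroutines are asleep - deadlock!
	package main

	func main() {
		c := make(chan int)
		<-c // no sender can ever exist
	}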
+static void +checkdead(void) +{ + G *gp; + int32 run, grunning, s; + + // -1 for sysmon + run = runtime·sched.mcount - runtime·sched.nmidle - runtime·sched.mlocked - 1; + if(run > 0) return; + if(run < 0) { + runtime·printf("checkdead: nmidle=%d mlocked=%d mcount=%d\n", + runtime·sched.nmidle, runtime·sched.mlocked, runtime·sched.mcount); + runtime·throw("checkdead: inconsistent counts"); + } + grunning = 0; + for(gp = runtime·allg; gp; gp = gp->alllink) { + if(gp == scvg) + continue; + s = gp->status; + if(s == Gwaiting) + grunning++; + else if(s == Grunnable || s == Grunning || s == Gsyscall) { + runtime·printf("checkdead: find g %D in status %d\n", gp->goid, s); + runtime·throw("checkdead: runnable g"); + } + } + if(grunning == 0) // possible if main goroutine calls runtime·Goexit() + runtime·exit(0); + m->throwing = -1; // do not dump full stacks + runtime·throw("all goroutines are asleep - deadlock!"); +} + +static void +sysmon(void) +{ + uint32 idle, delay; + uint32 ticks[MaxGomaxprocs]; + + idle = 0; // how many cycles in succession we had not wokeup somebody + delay = 0; + for(;;) { + if(idle == 0) // start with 20us sleep... + delay = 20; + else if(idle > 50) // start doubling the sleep after 1ms... + delay *= 2; + if(delay > 10*1000) // up to 10ms + delay = 10*1000; + runtime·usleep(delay); + if(runtime·gcwaiting || runtime·atomicload(&runtime·sched.npidle) == runtime·gomaxprocs) { // TODO: fast atomic + runtime·lock(&runtime·sched); + if(runtime·atomicload(&runtime·gcwaiting) || runtime·atomicload(&runtime·sched.npidle) == runtime·gomaxprocs) { + runtime·atomicstore(&runtime·sched.sysmonwait, 1); + runtime·unlock(&runtime·sched); + runtime·notesleep(&runtime·sched.sysmonnote); + runtime·noteclear(&runtime·sched.sysmonnote); + idle = 0; + delay = 20; + } else + runtime·unlock(&runtime·sched); + } + if(retake(ticks)) + idle = 0; + else + idle++; + } +} + +static uint32 +retake(uint32 *ticks) +{ + uint32 i, s, n; + int64 t; + P *p; + + n = 0; + for(i = 0; i < runtime·gomaxprocs; i++) { + p = runtime·allp[i]; + if(p==nil) + continue; + t = p->tick; + if(ticks[i] != t) { + ticks[i] = t; + continue; + } + s = p->status; + if(s != Psyscall) + continue; + if(p->runqhead == p->runqtail && runtime·atomicload(&runtime·sched.nmspinning) + runtime·atomicload(&runtime·sched.npidle) > 0) // TODO: fast atomic + continue; + // Need to increment number of locked M's before the CAS. + // Otherwise the M from which we retake can exit the syscall, + // increment nmidle and report deadlock. + inclocked(-1); + if(runtime·cas(&p->status, s, Pidle)) { + n++; + handoffp(p); + } + inclocked(1); + } + return n; +} + +// Put mp on midle list. +// Sched must be locked. +static void +mput(M *mp) +{ + mp->schedlink = runtime·sched.midle; + runtime·sched.midle = mp; + runtime·sched.nmidle++; + checkdead(); +} + +// Try to get an m from midle list. +// Sched must be locked. +static M* +mget(void) +{ + M *mp; + + if((mp = runtime·sched.midle) != nil){ + runtime·sched.midle = mp->schedlink; + runtime·sched.nmidle--; + } + return mp; +} + +// Put gp on the global runnable queue. +// Sched must be locked. +static void +globrunqput(G *gp) +{ + gp->schedlink = nil; + if(runtime·sched.runqtail) + runtime·sched.runqtail->schedlink = gp; + else + runtime·sched.runqhead = gp; + runtime·sched.runqtail = gp; + runtime·sched.runqsize++; +} + +// Try get a batch of G's from the global runnable queue. +// Sched must be locked. 
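sysmon above polls on an exponential schedule: 20µs while retake keeps finding work, doubling per round once it has been idle for more than 50 rounds, capped at 10ms, and parking outright when all P's are idle. The backoff schedule in isolation:

	package main

	import (
		"fmt"
		"time"
	)

	// backoff reproduces sysmon's sleep schedule: 20us while work is
	// being retaken, doubling after 50 idle rounds, capped at 10ms.
	func backoff(retook func() bool, rounds int) {
		idle := 0
		var delay time.Duration
		for i := 0; i < rounds; i++ {
			if idle == 0 {
				delay = 20 * time.Microsecond
			} else if idle > 50 {
				delay *= 2
			}
			if delay > 10*time.Millisecond {
				delay = 10 * time.Millisecond
			}
			time.Sleep(delay)
			if retook() {
				idle = 0
			} else {
				idle++
			}
		}
		fmt.Println("final delay:", delay)
	}

	func main() {
		backoff(func() bool { return false }, 100) // never retakes: delay grows
	}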
+static G* +globrunqget(P *p) +{ + G *gp, *gp1; + int32 n; + + if(runtime·sched.runqsize == 0) + return nil; + n = runtime·sched.runqsize/runtime·gomaxprocs+1; + if(n > runtime·sched.runqsize) + n = runtime·sched.runqsize; + runtime·sched.runqsize -= n; + if(runtime·sched.runqsize == 0) + runtime·sched.runqtail = nil; + gp = runtime·sched.runqhead; + runtime·sched.runqhead = gp->schedlink; + n--; + while(n--) { + gp1 = runtime·sched.runqhead; + runtime·sched.runqhead = gp1->schedlink; + runqput(p, gp1); + } + return gp; +} + +// Put p to on pidle list. +// Sched must be locked. +static void +pidleput(P *p) +{ + p->link = runtime·sched.pidle; + runtime·sched.pidle = p; + runtime·xadd(&runtime·sched.npidle, 1); // TODO: fast atomic +} + +// Try get a p from pidle list. +// Sched must be locked. +static P* +pidleget(void) +{ + P *p; - arg[0] = runtime·malloc(k.len + 1); - runtime·memmove(arg[0], k.str, k.len); - arg[0][k.len] = 0; + p = runtime·sched.pidle; + if(p) { + runtime·sched.pidle = p->link; + runtime·xadd(&runtime·sched.npidle, -1); // TODO: fast atomic + } + return p; +} - arg[1] = runtime·malloc(v.len + 1); - runtime·memmove(arg[1], v.str, v.len); - arg[1][v.len] = 0; +// Put g on local runnable queue. +// TODO(dvyukov): consider using lock-free queue. +static void +runqput(P *p, G *gp) +{ + int32 h, t, s; + + runtime·lock(p); +retry: + h = p->runqhead; + t = p->runqtail; + s = p->runqsize; + if(t == h-1 || (h == 0 && t == s-1)) { + runqgrow(p); + goto retry; + } + p->runq[t++] = gp; + if(t == s) + t = 0; + p->runqtail = t; + runtime·unlock(p); +} + +// Get g from local runnable queue. +static G* +runqget(P *p) +{ + G *gp; + int32 t, h, s; + + if(p->runqhead == p->runqtail) + return nil; + runtime·lock(p); + h = p->runqhead; + t = p->runqtail; + s = p->runqsize; + if(t == h) { + runtime·unlock(p); + return nil; + } + gp = p->runq[h++]; + if(h == s) + h = 0; + p->runqhead = h; + runtime·unlock(p); + return gp; +} - runtime·asmcgocall((void*)libcgo_setenv, arg); - runtime·free(arg[0]); - runtime·free(arg[1]); +// Grow local runnable queue. +// TODO(dvyukov): consider using fixed-size array +// and transfer excess to the global list (local queue can grow way too big). +static void +runqgrow(P *p) +{ + G **q; + int32 s, t, h, t2; + + h = p->runqhead; + t = p->runqtail; + s = p->runqsize; + t2 = 0; + q = runtime·malloc(2*s*sizeof(*q)); + while(t != h) { + q[t2++] = p->runq[h++]; + if(h == s) + h = 0; + } + runtime·free(p->runq); + p->runq = q; + p->runqhead = 0; + p->runqtail = t2; + p->runqsize = 2*s; } + +// Steal half of elements from local runnable queue of p2 +// and put onto local runnable queue of p. +// Returns one of the stolen elements (or nil if failed). +static G* +runqsteal(P *p, P *p2) +{ + G *gp, *gp1; + int32 t, h, s, t2, h2, s2, c, i; + + if(p2->runqhead == p2->runqtail) + return nil; + // sort locks to prevent deadlocks + if(p < p2) + runtime·lock(p); + runtime·lock(p2); + if(p2->runqhead == p2->runqtail) { + runtime·unlock(p2); + if(p < p2) + runtime·unlock(p); + return nil; + } + if(p >= p2) + runtime·lock(p); + // now we've locked both queues and know the victim is not empty + h = p->runqhead; + t = p->runqtail; + s = p->runqsize; + h2 = p2->runqhead; + t2 = p2->runqtail; + s2 = p2->runqsize; + gp = p2->runq[h2++]; // return value + if(h2 == s2) + h2 = 0; + // steal roughly half + if(t2 > h2) + c = (t2 - h2) / 2; + else + c = (s2 - h2 + t2) / 2; + // copy + for(i = 0; i != c; i++) { + // the target queue is full? 
+ if(t == h-1 || (h == 0 && t == s-1)) + break; + // the victim queue is empty? + if(t2 == h2) + break; + gp1 = p2->runq[h2++]; + if(h2 == s2) + h2 = 0; + p->runq[t++] = gp1; + if(t == s) + t = 0; + } + p->runqtail = t; + p2->runqhead = h2; + runtime·unlock(p2); + runtime·unlock(p); + return gp; +} + +void +runtime·testSchedLocalQueue(void) +{ + P p; + G gs[1000]; + int32 i, j; + + runtime·memclr((byte*)&p, sizeof(p)); + p.runqsize = 1; + p.runqhead = 0; + p.runqtail = 0; + p.runq = runtime·malloc(p.runqsize*sizeof(*p.runq)); + + for(i = 0; i < nelem(gs); i++) { + if(runqget(&p) != nil) + runtime·throw("runq is not empty initially"); + for(j = 0; j < i; j++) + runqput(&p, &gs[i]); + for(j = 0; j < i; j++) { + if(runqget(&p) != &gs[i]) { + runtime·printf("bad element at iter %d/%d\n", i, j); + runtime·throw("bad element"); + } + } + if(runqget(&p) != nil) + runtime·throw("runq is not empty afterwards"); + } +} + +void +runtime·testSchedLocalQueueSteal(void) +{ + P p1, p2; + G gs[1000], *gp; + int32 i, j, s; + + runtime·memclr((byte*)&p1, sizeof(p1)); + p1.runqsize = 1; + p1.runqhead = 0; + p1.runqtail = 0; + p1.runq = runtime·malloc(p1.runqsize*sizeof(*p1.runq)); + + runtime·memclr((byte*)&p2, sizeof(p2)); + p2.runqsize = nelem(gs); + p2.runqhead = 0; + p2.runqtail = 0; + p2.runq = runtime·malloc(p2.runqsize*sizeof(*p2.runq)); + + for(i = 0; i < nelem(gs); i++) { + for(j = 0; j < i; j++) { + gs[j].sig = 0; + runqput(&p1, &gs[j]); + } + gp = runqsteal(&p2, &p1); + s = 0; + if(gp) { + s++; + gp->sig++; + } + while(gp = runqget(&p2)) { + s++; + gp->sig++; + } + while(gp = runqget(&p1)) + gp->sig++; + for(j = 0; j < i; j++) { + if(gs[j].sig != 1) { + runtime·printf("bad element %d(%d) at iter %d\n", j, gs[j].sig, i); + runtime·throw("bad element"); + } + } + if(s != i/2 && s != i/2+1) { + runtime·printf("bad steal %d, want %d or %d, iter %d\n", + s, i/2, i/2+1, i); + runtime·throw("bad steal"); + } + } +} + diff --git a/src/pkg/runtime/proc_test.go b/src/pkg/runtime/proc_test.go index 32111080a..21fb9c2f7 100644 --- a/src/pkg/runtime/proc_test.go +++ b/src/pkg/runtime/proc_test.go @@ -5,9 +5,11 @@ package runtime_test import ( + "math" "runtime" "sync/atomic" "testing" + "time" ) var stop = make(chan bool, 1) @@ -22,8 +24,7 @@ func perpetuumMobile() { func TestStopTheWorldDeadlock(t *testing.T) { if testing.Short() { - t.Logf("skipping during short test") - return + t.Skip("skipping during short test") } maxprocs := runtime.GOMAXPROCS(3) compl := make(chan bool, 2) @@ -46,6 +47,66 @@ func TestStopTheWorldDeadlock(t *testing.T) { runtime.GOMAXPROCS(maxprocs) } +func TestYieldProgress(t *testing.T) { + testYieldProgress(t, false) +} + +func TestYieldLockedProgress(t *testing.T) { + testYieldProgress(t, true) +} + +func testYieldProgress(t *testing.T, locked bool) { + c := make(chan bool) + cack := make(chan bool) + go func() { + if locked { + runtime.LockOSThread() + } + for { + select { + case <-c: + cack <- true + return + default: + runtime.Gosched() + } + } + }() + time.Sleep(10 * time.Millisecond) + c <- true + <-cack +} + +func TestYieldLocked(t *testing.T) { + const N = 10 + c := make(chan bool) + go func() { + runtime.LockOSThread() + for i := 0; i < N; i++ { + runtime.Gosched() + time.Sleep(time.Millisecond) + } + c <- true + // runtime.UnlockOSThread() is deliberately omitted + }() + <-c +} + +func TestBlockLocked(t *testing.T) { + const N = 10 + c := make(chan bool) + go func() { + runtime.LockOSThread() + for i := 0; i < N; i++ { + c <- true + } + runtime.UnlockOSThread() + }() + for 
i := 0; i < N; i++ { + <-c + } +} + func stackGrowthRecursive(i int) { var pad [128]uint64 if i != 0 && pad[0] == 0 { @@ -53,7 +114,15 @@ func stackGrowthRecursive(i int) { } } -func BenchmarkStackGrowth(b *testing.B) { +func TestSchedLocalQueue(t *testing.T) { + runtime.TestSchedLocalQueue1() +} + +func TestSchedLocalQueueSteal(t *testing.T) { + runtime.TestSchedLocalQueueSteal1() +} + +func benchmarkStackGrowth(b *testing.B, rec int) { const CallsPerSched = 1000 procs := runtime.GOMAXPROCS(-1) N := int32(b.N / CallsPerSched) @@ -63,7 +132,7 @@ func BenchmarkStackGrowth(b *testing.B) { for atomic.AddInt32(&N, -1) >= 0 { runtime.Gosched() for g := 0; g < CallsPerSched; g++ { - stackGrowthRecursive(10) + stackGrowthRecursive(rec) } } c <- true @@ -74,32 +143,33 @@ func BenchmarkStackGrowth(b *testing.B) { } } +func BenchmarkStackGrowth(b *testing.B) { + benchmarkStackGrowth(b, 10) +} + +func BenchmarkStackGrowthDeep(b *testing.B) { + benchmarkStackGrowth(b, 1024) +} + func BenchmarkSyscall(b *testing.B) { - const CallsPerSched = 1000 - procs := runtime.GOMAXPROCS(-1) - N := int32(b.N / CallsPerSched) - c := make(chan bool, procs) - for p := 0; p < procs; p++ { - go func() { - for atomic.AddInt32(&N, -1) >= 0 { - runtime.Gosched() - for g := 0; g < CallsPerSched; g++ { - runtime.Entersyscall() - runtime.Exitsyscall() - } - } - c <- true - }() - } - for p := 0; p < procs; p++ { - <-c - } + benchmarkSyscall(b, 0, 1) } func BenchmarkSyscallWork(b *testing.B) { + benchmarkSyscall(b, 100, 1) +} + +func BenchmarkSyscallExcess(b *testing.B) { + benchmarkSyscall(b, 0, 4) +} + +func BenchmarkSyscallExcessWork(b *testing.B) { + benchmarkSyscall(b, 100, 4) +} + +func benchmarkSyscall(b *testing.B, work, excess int) { const CallsPerSched = 1000 - const LocalWork = 100 - procs := runtime.GOMAXPROCS(-1) + procs := runtime.GOMAXPROCS(-1) * excess N := int32(b.N / CallsPerSched) c := make(chan bool, procs) for p := 0; p < procs; p++ { @@ -109,7 +179,7 @@ func BenchmarkSyscallWork(b *testing.B) { runtime.Gosched() for g := 0; g < CallsPerSched; g++ { runtime.Entersyscall() - for i := 0; i < LocalWork; i++ { + for i := 0; i < work; i++ { foo *= 2 foo /= 2 } @@ -123,3 +193,93 @@ func BenchmarkSyscallWork(b *testing.B) { <-c } } + +func BenchmarkCreateGoroutines(b *testing.B) { + benchmarkCreateGoroutines(b, 1) +} + +func BenchmarkCreateGoroutinesParallel(b *testing.B) { + benchmarkCreateGoroutines(b, runtime.GOMAXPROCS(-1)) +} + +func benchmarkCreateGoroutines(b *testing.B, procs int) { + c := make(chan bool) + var f func(n int) + f = func(n int) { + if n == 0 { + c <- true + return + } + go f(n - 1) + } + for i := 0; i < procs; i++ { + go f(b.N / procs) + } + for i := 0; i < procs; i++ { + <-c + } +} + +type Matrix [][]float64 + +func BenchmarkMatmult(b *testing.B) { + b.StopTimer() + // matmult is O(N**3) but testing expects O(b.N), + // so we need to take cube root of b.N + n := int(math.Cbrt(float64(b.N))) + 1 + A := makeMatrix(n) + B := makeMatrix(n) + C := makeMatrix(n) + b.StartTimer() + matmult(nil, A, B, C, 0, n, 0, n, 0, n, 8) +} + +func makeMatrix(n int) Matrix { + m := make(Matrix, n) + for i := 0; i < n; i++ { + m[i] = make([]float64, n) + for j := 0; j < n; j++ { + m[i][j] = float64(i*n + j) + } + } + return m +} + +func matmult(done chan<- struct{}, A, B, C Matrix, i0, i1, j0, j1, k0, k1, threshold int) { + di := i1 - i0 + dj := j1 - j0 + dk := k1 - k0 + if di >= dj && di >= dk && di >= threshold { + // divide in two by y axis + mi := i0 + di/2 + done1 := make(chan struct{}, 1) + go 
matmult(done1, A, B, C, i0, mi, j0, j1, k0, k1, threshold) + matmult(nil, A, B, C, mi, i1, j0, j1, k0, k1, threshold) + <-done1 + } else if dj >= dk && dj >= threshold { + // divide in two by x axis + mj := j0 + dj/2 + done1 := make(chan struct{}, 1) + go matmult(done1, A, B, C, i0, i1, j0, mj, k0, k1, threshold) + matmult(nil, A, B, C, i0, i1, mj, j1, k0, k1, threshold) + <-done1 + } else if dk >= threshold { + // divide in two by "k" axis + // deliberately not parallel because of data races + mk := k0 + dk/2 + matmult(nil, A, B, C, i0, i1, j0, j1, k0, mk, threshold) + matmult(nil, A, B, C, i0, i1, j0, j1, mk, k1, threshold) + } else { + // the matrices are small enough, compute directly + for i := i0; i < i1; i++ { + for j := j0; j < j1; j++ { + for k := k0; k < k1; k++ { + C[i][j] += A[i][k] * B[k][j] + } + } + } + } + if done != nil { + done <- struct{}{} + } +} diff --git a/src/pkg/runtime/race.c b/src/pkg/runtime/race.c new file mode 100644 index 000000000..cfd97041a --- /dev/null +++ b/src/pkg/runtime/race.c @@ -0,0 +1,350 @@ +// Copyright 2011 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// Implementation of the race detector API. +// +build race + +#include "runtime.h" +#include "arch_GOARCH.h" +#include "malloc.h" +#include "race.h" + +void runtime∕race·Initialize(uintptr *racectx); +void runtime∕race·MapShadow(void *addr, uintptr size); +void runtime∕race·Finalize(void); +void runtime∕race·FinalizerGoroutine(uintptr racectx); +void runtime∕race·Read(uintptr racectx, void *addr, void *pc); +void runtime∕race·Write(uintptr racectx, void *addr, void *pc); +void runtime∕race·ReadRange(uintptr racectx, void *addr, uintptr sz, uintptr step, void *pc); +void runtime∕race·WriteRange(uintptr racectx, void *addr, uintptr sz, uintptr step, void *pc); +void runtime∕race·FuncEnter(uintptr racectx, void *pc); +void runtime∕race·FuncExit(uintptr racectx); +void runtime∕race·Malloc(uintptr racectx, void *p, uintptr sz, void *pc); +void runtime∕race·Free(void *p); +void runtime∕race·GoStart(uintptr racectx, uintptr *chracectx, void *pc); +void runtime∕race·GoEnd(uintptr racectx); +void runtime∕race·Acquire(uintptr racectx, void *addr); +void runtime∕race·Release(uintptr racectx, void *addr); +void runtime∕race·ReleaseMerge(uintptr racectx, void *addr); + +extern byte noptrdata[]; +extern byte enoptrbss[]; + +static bool onstack(uintptr argp); + +uintptr +runtime·raceinit(void) +{ + uintptr racectx; + + m->racecall = true; + runtime∕race·Initialize(&racectx); + runtime∕race·MapShadow(noptrdata, enoptrbss - noptrdata); + m->racecall = false; + return racectx; +} + +void +runtime·racefini(void) +{ + m->racecall = true; + runtime∕race·Finalize(); + m->racecall = false; +} + +void +runtime·racemapshadow(void *addr, uintptr size) +{ + m->racecall = true; + runtime∕race·MapShadow(addr, size); + m->racecall = false; +} + +// Called from instrumented code. +// If we split stack, getcallerpc() can return runtime·lessstack(). +#pragma textflag 7 +void +runtime·racewrite(uintptr addr) +{ + if(!onstack(addr)) { + m->racecall = true; + runtime∕race·Write(g->racectx, (void*)addr, runtime·getcallerpc(&addr)); + m->racecall = false; + } +} + +// Called from instrumented code. +// If we split stack, getcallerpc() can return runtime·lessstack(). 
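Both of these handlers, runtime·racewrite above and runtime·raceread below, share one shape: reject stack addresses via onstack(), bracket the ThreadSanitizer callout with m->racecall, and hand over the faulting address plus the caller PC. The flag is what keeps the scheme from eating itself: runtime code that may execute beneath a callout checks it and skips its own instrumentation. A loose Go rendering of that guard follows; racecall, tsanReport, and noteAlloc are toy names for illustration, not real runtime identifiers.

package main

import "fmt"

// Toy stand-in: the real flag lives on the per-thread M.
var racecall bool

func raceAccess(addr uintptr, write bool) {
	racecall = true // entering the detector
	tsanReport(addr, write)
	racecall = false
}

func tsanReport(addr uintptr, write bool) {
	noteAlloc(16) // the callout may itself hit instrumented paths
	fmt.Printf("tsan: addr=%#x write=%v\n", addr, write)
}

// Runtime paths that can run beneath a callout consult the flag and
// bail out instead of recursing back into the detector.
func noteAlloc(size uintptr) {
	if racecall {
		return
	}
	fmt.Println("would report allocation of", size, "bytes")
}

func main() {
	raceAccess(0x1000, true)
}

The bracketing discipline is the same in every wrapper in this file; only the detector entry point being called changes.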
+#pragma textflag 7 +void +runtime·raceread(uintptr addr) +{ + if(!onstack(addr)) { + m->racecall = true; + runtime∕race·Read(g->racectx, (void*)addr, runtime·getcallerpc(&addr)); + m->racecall = false; + } +} + +// Called from runtime·racefuncenter (assembly). +#pragma textflag 7 +void +runtime·racefuncenter1(uintptr pc) +{ + // If the caller PC is lessstack, use slower runtime·callers + // to walk across the stack split to find the real caller. + if(pc == (uintptr)runtime·lessstack) + runtime·callers(2, &pc, 1); + + m->racecall = true; + runtime∕race·FuncEnter(g->racectx, (void*)pc); + m->racecall = false; +} + +// Called from instrumented code. +#pragma textflag 7 +void +runtime·racefuncexit(void) +{ + m->racecall = true; + runtime∕race·FuncExit(g->racectx); + m->racecall = false; +} + +void +runtime·racemalloc(void *p, uintptr sz, void *pc) +{ + // use m->curg because runtime·stackalloc() is called from g0 + if(m->curg == nil) + return; + m->racecall = true; + runtime∕race·Malloc(m->curg->racectx, p, sz, pc); + m->racecall = false; +} + +void +runtime·racefree(void *p) +{ + m->racecall = true; + runtime∕race·Free(p); + m->racecall = false; +} + +uintptr +runtime·racegostart(void *pc) +{ + uintptr racectx; + + m->racecall = true; + runtime∕race·GoStart(g->racectx, &racectx, pc); + m->racecall = false; + return racectx; +} + +void +runtime·racegoend(void) +{ + m->racecall = true; + runtime∕race·GoEnd(g->racectx); + m->racecall = false; +} + +static void +memoryaccess(void *addr, uintptr callpc, uintptr pc, bool write) +{ + uintptr racectx; + + if(!onstack((uintptr)addr)) { + m->racecall = true; + racectx = g->racectx; + if(callpc) { + if(callpc == (uintptr)runtime·lessstack) + runtime·callers(3, &callpc, 1); + runtime∕race·FuncEnter(racectx, (void*)callpc); + } + if(write) + runtime∕race·Write(racectx, addr, (void*)pc); + else + runtime∕race·Read(racectx, addr, (void*)pc); + if(callpc) + runtime∕race·FuncExit(racectx); + m->racecall = false; + } +} + +void +runtime·racewritepc(void *addr, void *callpc, void *pc) +{ + memoryaccess(addr, (uintptr)callpc, (uintptr)pc, true); +} + +void +runtime·racereadpc(void *addr, void *callpc, void *pc) +{ + memoryaccess(addr, (uintptr)callpc, (uintptr)pc, false); +} + +static void +rangeaccess(void *addr, uintptr size, uintptr step, uintptr callpc, uintptr pc, bool write) +{ + uintptr racectx; + + if(!onstack((uintptr)addr)) { + m->racecall = true; + racectx = g->racectx; + if(callpc) { + if(callpc == (uintptr)runtime·lessstack) + runtime·callers(3, &callpc, 1); + runtime∕race·FuncEnter(racectx, (void*)callpc); + } + if(write) + runtime∕race·WriteRange(racectx, addr, size, step, (void*)pc); + else + runtime∕race·ReadRange(racectx, addr, size, step, (void*)pc); + if(callpc) + runtime∕race·FuncExit(racectx); + m->racecall = false; + } +} + +void +runtime·racewriterangepc(void *addr, uintptr sz, uintptr step, void *callpc, void *pc) +{ + rangeaccess(addr, sz, step, (uintptr)callpc, (uintptr)pc, true); +} + +void +runtime·racereadrangepc(void *addr, uintptr sz, uintptr step, void *callpc, void *pc) +{ + rangeaccess(addr, sz, step, (uintptr)callpc, (uintptr)pc, false); +} + +void +runtime·raceacquire(void *addr) +{ + runtime·raceacquireg(g, addr); +} + +void +runtime·raceacquireg(G *gp, void *addr) +{ + if(g->raceignore) + return; + m->racecall = true; + runtime∕race·Acquire(gp->racectx, addr); + m->racecall = false; +} + +void +runtime·racerelease(void *addr) +{ + runtime·racereleaseg(g, addr); +} + +void +runtime·racereleaseg(G *gp, void *addr) +{ + 
if(g->raceignore) + return; + m->racecall = true; + runtime∕race·Release(gp->racectx, addr); + m->racecall = false; +} + +void +runtime·racereleasemerge(void *addr) +{ + runtime·racereleasemergeg(g, addr); +} + +void +runtime·racereleasemergeg(G *gp, void *addr) +{ + if(g->raceignore) + return; + m->racecall = true; + runtime∕race·ReleaseMerge(gp->racectx, addr); + m->racecall = false; +} + +void +runtime·racefingo(void) +{ + m->racecall = true; + runtime∕race·FinalizerGoroutine(g->racectx); + m->racecall = false; +} + +// func RaceAcquire(addr unsafe.Pointer) +void +runtime·RaceAcquire(void *addr) +{ + runtime·raceacquire(addr); +} + +// func RaceRelease(addr unsafe.Pointer) +void +runtime·RaceRelease(void *addr) +{ + runtime·racerelease(addr); +} + +// func RaceReleaseMerge(addr unsafe.Pointer) +void +runtime·RaceReleaseMerge(void *addr) +{ + runtime·racereleasemerge(addr); +} + +// func RaceSemacquire(s *uint32) +void runtime·RaceSemacquire(uint32 *s) +{ + runtime·semacquire(s); +} + +// func RaceSemrelease(s *uint32) +void runtime·RaceSemrelease(uint32 *s) +{ + runtime·semrelease(s); +} + +// func RaceRead(addr unsafe.Pointer) +#pragma textflag 7 +void +runtime·RaceRead(void *addr) +{ + memoryaccess(addr, 0, (uintptr)runtime·getcallerpc(&addr), false); +} + +// func RaceWrite(addr unsafe.Pointer) +#pragma textflag 7 +void +runtime·RaceWrite(void *addr) +{ + memoryaccess(addr, 0, (uintptr)runtime·getcallerpc(&addr), true); +} + +// func RaceDisable() +void runtime·RaceDisable(void) +{ + g->raceignore++; +} + +// func RaceEnable() +void runtime·RaceEnable(void) +{ + g->raceignore--; +} + +static bool +onstack(uintptr argp) +{ + // noptrdata, data, bss, noptrbss + // the layout is in ../../cmd/ld/data.c + if((byte*)argp >= noptrdata && (byte*)argp < enoptrbss) + return false; + if((byte*)argp >= runtime·mheap->arena_start && (byte*)argp < runtime·mheap->arena_used) + return false; + return true; +} diff --git a/src/pkg/runtime/race.go b/src/pkg/runtime/race.go new file mode 100644 index 000000000..1d64ba389 --- /dev/null +++ b/src/pkg/runtime/race.go @@ -0,0 +1,29 @@ +// Copyright 2012 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// +build race + +// Public race detection API, present iff build with -race. + +package runtime + +import ( + "unsafe" +) + +// RaceDisable disables handling of race events in the current goroutine. +func RaceDisable() + +// RaceEnable re-enables handling of race events in the current goroutine. +func RaceEnable() + +func RaceAcquire(addr unsafe.Pointer) +func RaceRelease(addr unsafe.Pointer) +func RaceReleaseMerge(addr unsafe.Pointer) + +func RaceRead(addr unsafe.Pointer) +func RaceWrite(addr unsafe.Pointer) + +func RaceSemacquire(s *uint32) +func RaceSemrelease(s *uint32) diff --git a/src/pkg/runtime/race.h b/src/pkg/runtime/race.h new file mode 100644 index 000000000..432a8a97d --- /dev/null +++ b/src/pkg/runtime/race.h @@ -0,0 +1,33 @@ +// Copyright 2012 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// Definitions related to data race detection. + +#ifdef RACE +enum { raceenabled = 1 }; +#else +enum { raceenabled = 0 }; +#endif + +// Initialize race detection subsystem. +uintptr runtime·raceinit(void); +// Finalize race detection subsystem, does not return. 
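The Go-visible half of this, declared in race.go above, is deliberately small: acquire/release annotations plus a per-goroutine on/off switch. A minimal sketch of applying it, assuming a toolchain built with -race (without that build tag these functions do not exist). For sync/atomic the detector already models the edge itself, so the annotations below are belt-and-braces; they earn their keep for synchronization the detector cannot see, such as ordering established through cgo or shared memory.

package main

import (
	"runtime"
	"sync/atomic"
	"unsafe"
)

var (
	ready int32
	data  int
)

func producer() {
	data = 42 // plain write to be published
	// Tell the detector: everything before this point happens before
	// any RaceAcquire on the same address. Requires a -race build.
	runtime.RaceRelease(unsafe.Pointer(&ready))
	atomic.StoreInt32(&ready, 1)
}

func consumer() int {
	for atomic.LoadInt32(&ready) == 0 {
		runtime.Gosched()
	}
	runtime.RaceAcquire(unsafe.Pointer(&ready)) // pairs with the release
	return data
}

func main() {
	go producer()
	_ = consumer()
}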
+void runtime·racefini(void); + +void runtime·racemapshadow(void *addr, uintptr size); +void runtime·racemalloc(void *p, uintptr sz, void *pc); +void runtime·racefree(void *p); +uintptr runtime·racegostart(void *pc); +void runtime·racegoend(void); +void runtime·racewritepc(void *addr, void *callpc, void *pc); +void runtime·racereadpc(void *addr, void *callpc, void *pc); +void runtime·racewriterangepc(void *addr, uintptr sz, uintptr step, void *callpc, void *pc); +void runtime·racereadrangepc(void *addr, uintptr sz, uintptr step, void *callpc, void *pc); +void runtime·racefingo(void); +void runtime·raceacquire(void *addr); +void runtime·raceacquireg(G *gp, void *addr); +void runtime·racerelease(void *addr); +void runtime·racereleaseg(G *gp, void *addr); +void runtime·racereleasemerge(void *addr); +void runtime·racereleasemergeg(G *gp, void *addr); diff --git a/src/pkg/runtime/race/README b/src/pkg/runtime/race/README new file mode 100644 index 000000000..8bedb09cd --- /dev/null +++ b/src/pkg/runtime/race/README @@ -0,0 +1,11 @@ +runtime/race package contains the data race detector runtime library. +It is based on ThreadSanitizer race detector, that is currently a part of +the LLVM project. + +To update the .syso files you need to: +$ svn co http://llvm.org/svn/llvm-project/compiler-rt/trunk +$ cd compiler-rt/lib/tsan/go +$ ./buildgo.sh + +Tested with gcc 4.6.1 and 4.7.0. On Windows it's built with 64-bit MinGW. + diff --git a/src/pkg/runtime/race/race.go b/src/pkg/runtime/race/race.go new file mode 100644 index 000000000..b0a5c9a50 --- /dev/null +++ b/src/pkg/runtime/race/race.go @@ -0,0 +1,122 @@ +// Copyright 2012 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// +build race,linux,amd64 race,darwin,amd64 race,windows,amd64 + +// Package race provides low-level facilities for data race detection. 
+package race + +/* +void __tsan_init(void **racectx); +void __tsan_fini(void); +void __tsan_map_shadow(void *addr, void *size); +void __tsan_go_start(void *racectx, void **chracectx, void *pc); +void __tsan_go_end(void *racectx); +void __tsan_read(void *racectx, void *addr, void *pc); +void __tsan_write(void *racectx, void *addr, void *pc); +void __tsan_read_range(void *racectx, void *addr, long sz, long step, void *pc); +void __tsan_write_range(void *racectx, void *addr, long sz, long step, void *pc); +void __tsan_func_enter(void *racectx, void *pc); +void __tsan_func_exit(void *racectx); +void __tsan_malloc(void *racectx, void *p, long sz, void *pc); +void __tsan_free(void *p); +void __tsan_acquire(void *racectx, void *addr); +void __tsan_release(void *racectx, void *addr); +void __tsan_release_merge(void *racectx, void *addr); +void __tsan_finalizer_goroutine(void *racectx); +*/ +import "C" + +import ( + "runtime" + "unsafe" +) + +func Initialize(racectx *uintptr) { + C.__tsan_init((*unsafe.Pointer)(unsafe.Pointer(racectx))) +} + +func Finalize() { + C.__tsan_fini() +} + +func MapShadow(addr, size uintptr) { + C.__tsan_map_shadow(unsafe.Pointer(addr), unsafe.Pointer(size)) +} + +func FinalizerGoroutine(racectx uintptr) { + C.__tsan_finalizer_goroutine(unsafe.Pointer(racectx)) +} + +func Read(racectx uintptr, addr, pc uintptr) { + C.__tsan_read(unsafe.Pointer(racectx), unsafe.Pointer(addr), unsafe.Pointer(pc)) +} + +func Write(racectx uintptr, addr, pc uintptr) { + C.__tsan_write(unsafe.Pointer(racectx), unsafe.Pointer(addr), unsafe.Pointer(pc)) +} + +func ReadRange(racectx uintptr, addr, sz, step, pc uintptr) { + C.__tsan_read_range(unsafe.Pointer(racectx), unsafe.Pointer(addr), + C.long(sz), C.long(step), unsafe.Pointer(pc)) +} + +func WriteRange(racectx uintptr, addr, sz, step, pc uintptr) { + C.__tsan_write_range(unsafe.Pointer(racectx), unsafe.Pointer(addr), + C.long(sz), C.long(step), unsafe.Pointer(pc)) +} + +func FuncEnter(racectx uintptr, pc uintptr) { + C.__tsan_func_enter(unsafe.Pointer(racectx), unsafe.Pointer(pc)) +} + +func FuncExit(racectx uintptr) { + C.__tsan_func_exit(unsafe.Pointer(racectx)) +} + +func Malloc(racectx uintptr, p, sz, pc uintptr) { + C.__tsan_malloc(unsafe.Pointer(racectx), unsafe.Pointer(p), C.long(sz), unsafe.Pointer(pc)) +} + +func Free(p uintptr) { + C.__tsan_free(unsafe.Pointer(p)) +} + +func GoStart(racectx uintptr, chracectx *uintptr, pc uintptr) { + C.__tsan_go_start(unsafe.Pointer(racectx), (*unsafe.Pointer)(unsafe.Pointer(chracectx)), unsafe.Pointer(pc)) +} + +func GoEnd(racectx uintptr) { + C.__tsan_go_end(unsafe.Pointer(racectx)) +} + +func Acquire(racectx uintptr, addr uintptr) { + C.__tsan_acquire(unsafe.Pointer(racectx), unsafe.Pointer(addr)) +} + +func Release(racectx uintptr, addr uintptr) { + C.__tsan_release(unsafe.Pointer(racectx), unsafe.Pointer(addr)) +} + +func ReleaseMerge(racectx uintptr, addr uintptr) { + C.__tsan_release_merge(unsafe.Pointer(racectx), unsafe.Pointer(addr)) +} + +//export __tsan_symbolize +func __tsan_symbolize(pc uintptr, fun, file **C.char, line, off *C.int) C.int { + f := runtime.FuncForPC(pc) + if f == nil { + *fun = C.CString("??") + *file = C.CString("-") + *line = 0 + *off = C.int(pc) + return 1 + } + fi, l := f.FileLine(pc) + *fun = C.CString(f.Name()) + *file = C.CString(fi) + *line = C.int(l) + *off = C.int(pc - f.Entry()) + return 1 +} diff --git a/src/pkg/runtime/race/race_darwin_amd64.syso b/src/pkg/runtime/race/race_darwin_amd64.syso Binary files differnew file mode 100644 index 
000000000..24a00497c --- /dev/null +++ b/src/pkg/runtime/race/race_darwin_amd64.syso diff --git a/src/pkg/runtime/race/race_linux_amd64.syso b/src/pkg/runtime/race/race_linux_amd64.syso Binary files differnew file mode 100644 index 000000000..b15091ba8 --- /dev/null +++ b/src/pkg/runtime/race/race_linux_amd64.syso diff --git a/src/pkg/runtime/race/race_test.go b/src/pkg/runtime/race/race_test.go new file mode 100644 index 000000000..c77569c37 --- /dev/null +++ b/src/pkg/runtime/race/race_test.go @@ -0,0 +1,157 @@ +// Copyright 2012 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// +build race + +// This program is used to verify the race detector +// by running the tests and parsing their output. +// It does not check stack correctness, completeness or anything else: +// it merely verifies that if a test is expected to be racy +// then the race is detected. +package race_test + +import ( + "bufio" + "bytes" + "fmt" + "io" + "log" + "os" + "os/exec" + "path/filepath" + "strings" + "testing" +) + +var ( + passedTests = 0 + totalTests = 0 + falsePos = 0 + falseNeg = 0 + failingPos = 0 + failingNeg = 0 + failed = false +) + +const ( + visibleLen = 40 + testPrefix = "=== RUN Test" +) + +func TestRace(t *testing.T) { + testOutput, err := runTests() + if err != nil { + t.Fatalf("Failed to run tests: %v\n%v", err, string(testOutput)) + } + reader := bufio.NewReader(bytes.NewBuffer(testOutput)) + + funcName := "" + var tsanLog []string + for { + s, err := nextLine(reader) + if err != nil { + fmt.Printf("%s\n", processLog(funcName, tsanLog)) + break + } + if strings.HasPrefix(s, testPrefix) { + fmt.Printf("%s\n", processLog(funcName, tsanLog)) + tsanLog = make([]string, 0, 100) + funcName = s[len(testPrefix):] + } else { + tsanLog = append(tsanLog, s) + } + } + + fmt.Printf("\nPassed %d of %d tests (%.02f%%, %d+, %d-)\n", + passedTests, totalTests, 100*float64(passedTests)/float64(totalTests), falsePos, falseNeg) + fmt.Printf("%d expected failures (%d has not fail)\n", failingPos+failingNeg, failingNeg) + if failed { + t.Fail() + } +} + +// nextLine is a wrapper around bufio.Reader.ReadString. +// It reads a line up to the next '\n' character. Error +// is non-nil if there are no lines left, and nil +// otherwise. +func nextLine(r *bufio.Reader) (string, error) { + s, err := r.ReadString('\n') + if err != nil { + if err != io.EOF { + log.Fatalf("nextLine: expected EOF, received %v", err) + } + return s, err + } + return s[:len(s)-1], nil +} + +// processLog verifies whether the given ThreadSanitizer's log +// contains a race report, checks this information against +// the name of the testcase and returns the result of this +// comparison. 
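The whole contract is carried by test names: a leading "Race" means a report is expected, "NoRace" means silence is expected, and a "Failing" infix marks a known limitation. A standalone sketch of that classification; classify is a hypothetical helper mirroring the decision in processLog below, minus the counters and column padding.

package main

import (
	"fmt"
	"strings"
)

// classify condenses processLog's decision. (The real code also flags
// a Failing test that unexpectedly passes.)
func classify(testName string, tsanLog []string) string {
	gotRace := false
	for _, line := range tsanLog {
		if strings.Contains(line, "DATA RACE") {
			gotRace = true
			break
		}
	}
	expRace := !strings.HasPrefix(testName, "No")
	switch {
	case expRace == gotRace:
		return "."
	case strings.Contains(testName, "Failing"):
		return "known failure"
	default:
		return "FAILED"
	}
}

func main() {
	fmt.Println(classify("RaceMapRW", []string{"WARNING: DATA RACE"}))
	fmt.Println(classify("NoRaceChanSync", nil))
}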
+func processLog(testName string, tsanLog []string) string { + if !strings.HasPrefix(testName, "Race") && !strings.HasPrefix(testName, "NoRace") { + return "" + } + gotRace := false + for _, s := range tsanLog { + if strings.Contains(s, "DATA RACE") { + gotRace = true + break + } + } + + failing := strings.Contains(testName, "Failing") + expRace := !strings.HasPrefix(testName, "No") + for len(testName) < visibleLen { + testName += " " + } + if expRace == gotRace { + passedTests++ + totalTests++ + if failing { + failed = true + failingNeg++ + } + return fmt.Sprintf("%s .", testName) + } + pos := "" + if expRace { + falseNeg++ + } else { + falsePos++ + pos = "+" + } + if failing { + failingPos++ + } else { + failed = true + } + totalTests++ + return fmt.Sprintf("%s %s%s", testName, "FAILED", pos) +} + +// runTests assures that the package and its dependencies is +// built with instrumentation enabled and returns the output of 'go test' +// which includes possible data race reports from ThreadSanitizer. +func runTests() ([]byte, error) { + tests, err := filepath.Glob("./testdata/*_test.go") + if err != nil { + return nil, err + } + args := []string{"test", "-race", "-v"} + args = append(args, tests...) + cmd := exec.Command("go", args...) + // The following flags turn off heuristics that suppress seemingly identical reports. + // It is required because the tests contain a lot of data races on the same addresses + // (the tests are simple and the memory is constantly reused). + for _, env := range os.Environ() { + if strings.HasPrefix(env, "GOMAXPROCS=") { + continue + } + cmd.Env = append(cmd.Env, env) + } + cmd.Env = append(cmd.Env, `GORACE="suppress_equal_stacks=0 suppress_equal_addresses=0 exitcode=0"`) + return cmd.CombinedOutput() +} diff --git a/src/pkg/runtime/race/race_windows_amd64.syso b/src/pkg/runtime/race/race_windows_amd64.syso Binary files differnew file mode 100644 index 000000000..0a3a58354 --- /dev/null +++ b/src/pkg/runtime/race/race_windows_amd64.syso diff --git a/src/pkg/runtime/race/testdata/atomic_test.go b/src/pkg/runtime/race/testdata/atomic_test.go new file mode 100644 index 000000000..0c5c2c008 --- /dev/null +++ b/src/pkg/runtime/race/testdata/atomic_test.go @@ -0,0 +1,271 @@ +// Copyright 2011 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. 
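Note the GORACE value runTests pins: suppress_equal_stacks and suppress_equal_addresses are turned off because the testdata files deliberately race on the same addresses over and over, and exitcode=0 keeps a detected race from aborting the run. A hedged sketch of driving the same thing by hand; the quoting differs from the snippet above because no shell is involved here.

package main

import (
	"fmt"
	"os"
	"os/exec"
)

func main() {
	cmd := exec.Command("go", "test", "-race", "-v", "./testdata/mop_test.go")
	cmd.Env = append(os.Environ(),
		"GORACE=suppress_equal_stacks=0 suppress_equal_addresses=0 exitcode=0")
	out, _ := cmd.CombinedOutput() // a detected race is expected output here
	fmt.Printf("%s", out)
}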
+ +package race_test + +import ( + "runtime" + "sync/atomic" + "testing" + "unsafe" +) + +func TestNoRaceAtomicAddInt64(t *testing.T) { + var x1, x2 int8 + var s int64 + ch := make(chan bool, 2) + go func() { + x1 = 1 + if atomic.AddInt64(&s, 1) == 2 { + x2 = 1 + } + ch <- true + }() + go func() { + x2 = 1 + if atomic.AddInt64(&s, 1) == 2 { + x1 = 1 + } + ch <- true + }() + <-ch + <-ch +} + +func TestRaceAtomicAddInt64(t *testing.T) { + var x1, x2 int8 + var s int64 + ch := make(chan bool, 2) + go func() { + x1 = 1 + if atomic.AddInt64(&s, 1) == 1 { + x2 = 1 + } + ch <- true + }() + go func() { + x2 = 1 + if atomic.AddInt64(&s, 1) == 1 { + x1 = 1 + } + ch <- true + }() + <-ch + <-ch +} + +func TestNoRaceAtomicAddInt32(t *testing.T) { + var x1, x2 int8 + var s int32 + ch := make(chan bool, 2) + go func() { + x1 = 1 + if atomic.AddInt32(&s, 1) == 2 { + x2 = 1 + } + ch <- true + }() + go func() { + x2 = 1 + if atomic.AddInt32(&s, 1) == 2 { + x1 = 1 + } + ch <- true + }() + <-ch + <-ch +} + +func TestNoRaceAtomicLoadAddInt32(t *testing.T) { + var x int64 + var s int32 + go func() { + x = 2 + atomic.AddInt32(&s, 1) + }() + for atomic.LoadInt32(&s) != 1 { + runtime.Gosched() + } + x = 1 +} + +func TestNoRaceAtomicLoadStoreInt32(t *testing.T) { + var x int64 + var s int32 + go func() { + x = 2 + atomic.StoreInt32(&s, 1) + }() + for atomic.LoadInt32(&s) != 1 { + runtime.Gosched() + } + x = 1 +} + +func TestNoRaceAtomicStoreCASInt32(t *testing.T) { + var x int64 + var s int32 + go func() { + x = 2 + atomic.StoreInt32(&s, 1) + }() + for !atomic.CompareAndSwapInt32(&s, 1, 0) { + runtime.Gosched() + } + x = 1 +} + +func TestNoRaceAtomicCASLoadInt32(t *testing.T) { + var x int64 + var s int32 + go func() { + x = 2 + if !atomic.CompareAndSwapInt32(&s, 0, 1) { + panic("") + } + }() + for atomic.LoadInt32(&s) != 1 { + runtime.Gosched() + } + x = 1 +} + +func TestNoRaceAtomicCASCASInt32(t *testing.T) { + var x int64 + var s int32 + go func() { + x = 2 + if !atomic.CompareAndSwapInt32(&s, 0, 1) { + panic("") + } + }() + for !atomic.CompareAndSwapInt32(&s, 1, 0) { + runtime.Gosched() + } + x = 1 +} + +func TestNoRaceAtomicCASCASInt32_2(t *testing.T) { + var x1, x2 int8 + var s int32 + ch := make(chan bool, 2) + go func() { + x1 = 1 + if !atomic.CompareAndSwapInt32(&s, 0, 1) { + x2 = 1 + } + ch <- true + }() + go func() { + x2 = 1 + if !atomic.CompareAndSwapInt32(&s, 0, 1) { + x1 = 1 + } + ch <- true + }() + <-ch + <-ch +} + +func TestNoRaceAtomicLoadInt64(t *testing.T) { + var x int32 + var s int64 + go func() { + x = 2 + atomic.AddInt64(&s, 1) + }() + for atomic.LoadInt64(&s) != 1 { + runtime.Gosched() + } + x = 1 +} + +func TestNoRaceAtomicCASCASUInt64(t *testing.T) { + var x int64 + var s uint64 + go func() { + x = 2 + if !atomic.CompareAndSwapUint64(&s, 0, 1) { + panic("") + } + }() + for !atomic.CompareAndSwapUint64(&s, 1, 0) { + runtime.Gosched() + } + x = 1 +} + +func TestNoRaceAtomicLoadStorePointer(t *testing.T) { + var x int64 + var s unsafe.Pointer + var y int = 2 + var p unsafe.Pointer = unsafe.Pointer(&y) + go func() { + x = 2 + atomic.StorePointer(&s, p) + }() + for atomic.LoadPointer(&s) != p { + runtime.Gosched() + } + x = 1 +} + +func TestNoRaceAtomicStoreCASUint64(t *testing.T) { + var x int64 + var s uint64 + go func() { + x = 2 + atomic.StoreUint64(&s, 1) + }() + for !atomic.CompareAndSwapUint64(&s, 1, 0) { + runtime.Gosched() + } + x = 1 +} + +// Races with non-atomic loads are not detected. 
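The limitation flagged here is easy to reproduce outside the harness. A minimal sketch, meant for go run -race on a toolchain of this vintage: the store is atomic, the read is plain, so the program genuinely races, yet the detector as shipped misses it, which is why the test below carries the Failing marker.

package main

import (
	"fmt"
	"sync/atomic"
)

func main() {
	var a uint64
	done := make(chan bool)
	go func() {
		atomic.StoreUint64(&a, 1) // atomic write...
		done <- true
	}()
	x := a // ...races with this plain read, but goes unreported here
	<-done
	fmt.Println(x)
}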
+func TestRaceFailingAtomicStoreLoad(t *testing.T) { + c := make(chan bool) + var a uint64 + go func() { + atomic.StoreUint64(&a, 1) + c <- true + }() + _ = a + <-c +} + +func TestRaceAtomicLoadStore(t *testing.T) { + c := make(chan bool) + var a uint64 + go func() { + _ = atomic.LoadUint64(&a) + c <- true + }() + a = 1 + <-c +} + +// Races with non-atomic loads are not detected. +func TestRaceFailingAtomicAddLoad(t *testing.T) { + c := make(chan bool) + var a uint64 + go func() { + atomic.AddUint64(&a, 1) + c <- true + }() + _ = a + <-c +} + +func TestRaceAtomicAddStore(t *testing.T) { + c := make(chan bool) + var a uint64 + go func() { + atomic.AddUint64(&a, 1) + c <- true + }() + a = 42 + <-c +} diff --git a/src/pkg/runtime/race/testdata/cgo_test.go b/src/pkg/runtime/race/testdata/cgo_test.go new file mode 100644 index 000000000..ba7e7b562 --- /dev/null +++ b/src/pkg/runtime/race/testdata/cgo_test.go @@ -0,0 +1,20 @@ +// Copyright 2012 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package race_test + +import ( + "os" + "os/exec" + "testing" +) + +func TestNoRaceCgoSync(t *testing.T) { + cmd := exec.Command("go", "run", "-race", "cgo_test_main.go") + cmd.Stdout = os.Stdout + cmd.Stderr = os.Stderr + if err := cmd.Run(); err != nil { + t.Fatalf("program exited with error: %v\n", err) + } +} diff --git a/src/pkg/runtime/race/testdata/cgo_test_main.go b/src/pkg/runtime/race/testdata/cgo_test_main.go new file mode 100644 index 000000000..620cea18b --- /dev/null +++ b/src/pkg/runtime/race/testdata/cgo_test_main.go @@ -0,0 +1,30 @@ +// Copyright 2012 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package main + +/* +int sync; + +void Notify(void) +{ + __sync_fetch_and_add(&sync, 1); +} + +void Wait(void) +{ + while(__sync_fetch_and_add(&sync, 0) == 0) {} +} +*/ +import "C" + +func main() { + data := 0 + go func() { + data = 1 + C.Notify() + }() + C.Wait() + _ = data +} diff --git a/src/pkg/runtime/race/testdata/chan_test.go b/src/pkg/runtime/race/testdata/chan_test.go new file mode 100644 index 000000000..2332f097e --- /dev/null +++ b/src/pkg/runtime/race/testdata/chan_test.go @@ -0,0 +1,457 @@ +// Copyright 2011 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. 
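The cgo program above gets its ordering from __sync_fetch_and_add, which the detector's interceptors understand. The pure-Go equivalent of that Notify/Wait handshake, sketched here with sync/atomic; the variable is named sync only to mirror the C side.

package main

import (
	"runtime"
	"sync/atomic"
)

func main() {
	var sync int32 // mirrors the C variable of the same name
	data := 0
	go func() {
		data = 1
		atomic.AddInt32(&sync, 1) // Notify
	}()
	for atomic.AddInt32(&sync, 0) == 0 { // Wait
		runtime.Gosched()
	}
	_ = data // ordered after the write: no report
}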
+ +package race_test + +import ( + "runtime" + "testing" + "time" +) + +func TestNoRaceChanSync(t *testing.T) { + v := 0 + c := make(chan int) + go func() { + v = 1 + c <- 0 + }() + <-c + v = 2 +} + +func TestNoRaceChanSyncRev(t *testing.T) { + v := 0 + c := make(chan int) + go func() { + c <- 0 + v = 2 + }() + v = 1 + <-c +} + +func TestNoRaceChanAsync(t *testing.T) { + v := 0 + c := make(chan int, 10) + go func() { + v = 1 + c <- 0 + }() + <-c + v = 2 +} + +func TestRaceChanAsyncRev(t *testing.T) { + v := 0 + c := make(chan int, 10) + go func() { + c <- 0 + v = 1 + }() + v = 2 + <-c +} + +func TestNoRaceChanAsyncCloseRecv(t *testing.T) { + v := 0 + c := make(chan int, 10) + go func() { + v = 1 + close(c) + }() + func() { + defer func() { + recover() + v = 2 + }() + <-c + }() +} + +func TestNoRaceChanAsyncCloseRecv2(t *testing.T) { + v := 0 + c := make(chan int, 10) + go func() { + v = 1 + close(c) + }() + _, _ = <-c + v = 2 +} + +func TestNoRaceChanAsyncCloseRecv3(t *testing.T) { + v := 0 + c := make(chan int, 10) + go func() { + v = 1 + close(c) + }() + for _ = range c { + } + v = 2 +} + +func TestNoRaceChanSyncCloseRecv(t *testing.T) { + v := 0 + c := make(chan int) + go func() { + v = 1 + close(c) + }() + func() { + defer func() { + recover() + v = 2 + }() + <-c + }() +} + +func TestNoRaceChanSyncCloseRecv2(t *testing.T) { + v := 0 + c := make(chan int) + go func() { + v = 1 + close(c) + }() + _, _ = <-c + v = 2 +} + +func TestNoRaceChanSyncCloseRecv3(t *testing.T) { + v := 0 + c := make(chan int) + go func() { + v = 1 + close(c) + }() + for _ = range c { + } + v = 2 +} + +func TestRaceChanSyncCloseSend(t *testing.T) { + v := 0 + c := make(chan int) + go func() { + v = 1 + close(c) + }() + func() { + defer func() { + recover() + }() + c <- 0 + }() + v = 2 +} + +func TestRaceChanAsyncCloseSend(t *testing.T) { + v := 0 + c := make(chan int, 10) + go func() { + v = 1 + close(c) + }() + func() { + defer func() { + recover() + }() + for { + c <- 0 + } + }() + v = 2 +} + +func TestRaceChanCloseClose(t *testing.T) { + compl := make(chan bool, 2) + v1 := 0 + v2 := 0 + c := make(chan int) + go func() { + defer func() { + if recover() != nil { + v2 = 2 + } + compl <- true + }() + v1 = 1 + close(c) + }() + go func() { + defer func() { + if recover() != nil { + v1 = 2 + } + compl <- true + }() + v2 = 1 + close(c) + }() + <-compl + <-compl +} + +func TestRaceChanSendLen(t *testing.T) { + v := 0 + c := make(chan int, 10) + go func() { + v = 1 + c <- 1 + }() + for len(c) == 0 { + runtime.Gosched() + } + v = 2 +} + +func TestRaceChanRecvLen(t *testing.T) { + v := 0 + c := make(chan int, 10) + c <- 1 + go func() { + v = 1 + <-c + }() + for len(c) != 0 { + runtime.Gosched() + } + v = 2 +} + +func TestRaceChanSendSend(t *testing.T) { + compl := make(chan bool, 2) + v1 := 0 + v2 := 0 + c := make(chan int, 1) + go func() { + v1 = 1 + select { + case c <- 1: + default: + v2 = 2 + } + compl <- true + }() + go func() { + v2 = 1 + select { + case c <- 1: + default: + v1 = 2 + } + compl <- true + }() + <-compl + <-compl +} + +func TestNoRaceChanPtr(t *testing.T) { + type msg struct { + x int + } + c := make(chan *msg) + go func() { + c <- &msg{1} + }() + m := <-c + m.x = 2 +} + +func TestRaceChanWrongSend(t *testing.T) { + v1 := 0 + v2 := 0 + c := make(chan int, 2) + go func() { + v1 = 1 + c <- 1 + }() + go func() { + v2 = 2 + c <- 2 + }() + time.Sleep(1e7) + if <-c == 1 { + v2 = 3 + } else { + v1 = 3 + } +} + +func TestRaceChanWrongClose(t *testing.T) { + v1 := 0 + v2 := 0 + c := make(chan int, 1) + go func() 
{ + defer func() { + recover() + }() + v1 = 1 + c <- 1 + }() + go func() { + time.Sleep(1e7) + v2 = 2 + close(c) + }() + time.Sleep(2e7) + if _, who := <-c; who { + v2 = 2 + } else { + v1 = 2 + } +} + +func TestRaceChanSendClose(t *testing.T) { + compl := make(chan bool, 2) + c := make(chan int, 1) + go func() { + defer func() { + recover() + }() + c <- 1 + compl <- true + }() + go func() { + time.Sleep(1e7) + close(c) + compl <- true + }() + <-compl + <-compl +} + +func TestNoRaceProducerConsumerUnbuffered(t *testing.T) { + type Task struct { + f func() + done chan bool + } + + queue := make(chan Task) + + go func() { + t := <-queue + t.f() + t.done <- true + }() + + doit := func(f func()) { + done := make(chan bool, 1) + queue <- Task{f, done} + <-done + } + + x := 0 + doit(func() { + x = 1 + }) + _ = x +} + +func TestRaceChanItselfSend(t *testing.T) { + compl := make(chan bool, 1) + c := make(chan int, 10) + go func() { + c <- 0 + compl <- true + }() + c = make(chan int, 20) + <-compl +} + +func TestRaceChanItselfRecv(t *testing.T) { + compl := make(chan bool, 1) + c := make(chan int, 10) + c <- 1 + go func() { + <-c + compl <- true + }() + time.Sleep(1e7) + c = make(chan int, 20) + <-compl +} + +func TestRaceChanItselfNil(t *testing.T) { + c := make(chan int, 10) + go func() { + c <- 0 + }() + time.Sleep(1e7) + c = nil + _ = c +} + +func TestRaceChanItselfClose(t *testing.T) { + compl := make(chan bool, 1) + c := make(chan int) + go func() { + close(c) + compl <- true + }() + c = make(chan int) + <-compl +} + +func TestRaceChanItselfLen(t *testing.T) { + compl := make(chan bool, 1) + c := make(chan int) + go func() { + _ = len(c) + compl <- true + }() + c = make(chan int) + <-compl +} + +func TestRaceChanItselfCap(t *testing.T) { + compl := make(chan bool, 1) + c := make(chan int) + go func() { + _ = cap(c) + compl <- true + }() + c = make(chan int) + <-compl +} + +func TestRaceChanCloseLen(t *testing.T) { + v := 0 + c := make(chan int, 10) + c <- 0 + go func() { + v = 1 + close(c) + }() + time.Sleep(1e7) + _ = len(c) + v = 2 +} + +func TestRaceChanSameCell(t *testing.T) { + c := make(chan int, 1) + v := 0 + go func() { + v = 1 + c <- 42 + <-c + }() + time.Sleep(1e7) + c <- 43 + <-c + _ = v +} + +func TestRaceChanCloseSend(t *testing.T) { + compl := make(chan bool, 1) + c := make(chan int, 10) + go func() { + close(c) + compl <- true + }() + c <- 0 + <-compl +} diff --git a/src/pkg/runtime/race/testdata/comp_test.go b/src/pkg/runtime/race/testdata/comp_test.go new file mode 100644 index 000000000..754e4db6d --- /dev/null +++ b/src/pkg/runtime/race/testdata/comp_test.go @@ -0,0 +1,132 @@ +// Copyright 2012 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. 
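Before moving on to composite types, the rule the channel tests above keep exercising is worth stating once: a send happens before the corresponding receive completes, so anything written before the send may be read after the receive without a report. A minimal sketch, essentially TestNoRaceChanSync as a standalone program:

package main

import "fmt"

func main() {
	v := 0
	c := make(chan int)
	go func() {
		v = 1  // write...
		c <- 0 // ...sequenced before the send
	}()
	<-c            // the send happens before this receive completes
	fmt.Println(v) // so this read is ordered after the write
}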
+ +package race_test + +import ( + "testing" +) + +type P struct { + x, y int +} + +type S struct { + s1, s2 P +} + +func TestNoRaceComp(t *testing.T) { + c := make(chan bool, 1) + var s S + go func() { + s.s2.x = 1 + c <- true + }() + s.s2.y = 2 + <-c +} + +func TestNoRaceComp2(t *testing.T) { + c := make(chan bool, 1) + var s S + go func() { + s.s1.x = 1 + c <- true + }() + s.s1.y = 2 + <-c +} + +func TestRaceComp(t *testing.T) { + c := make(chan bool, 1) + var s S + go func() { + s.s2.y = 1 + c <- true + }() + s.s2.y = 2 + <-c +} + +func TestRaceComp2(t *testing.T) { + c := make(chan bool, 1) + var s S + go func() { + s.s1.x = 1 + c <- true + }() + s = S{} + <-c +} + +func TestRaceComp3(t *testing.T) { + c := make(chan bool, 1) + var s S + go func() { + s.s2.y = 1 + c <- true + }() + s = S{} + <-c +} + +func TestRaceCompArray(t *testing.T) { + c := make(chan bool, 1) + s := make([]S, 10) + x := 4 + go func() { + s[x].s2.y = 1 + c <- true + }() + x = 5 + <-c +} + +type Ptr struct { + s1, s2 *P +} + +func TestNoRaceCompPtr(t *testing.T) { + c := make(chan bool, 1) + p := Ptr{&P{}, &P{}} + go func() { + p.s1.x = 1 + c <- true + }() + p.s1.y = 2 + <-c +} + +func TestNoRaceCompPtr2(t *testing.T) { + c := make(chan bool, 1) + p := Ptr{&P{}, &P{}} + go func() { + p.s1.x = 1 + c <- true + }() + _ = p + <-c +} + +func TestRaceCompPtr(t *testing.T) { + c := make(chan bool, 1) + p := Ptr{&P{}, &P{}} + go func() { + p.s2.x = 1 + c <- true + }() + p.s2.x = 2 + <-c +} + +func TestRaceCompPtr2(t *testing.T) { + c := make(chan bool, 1) + p := Ptr{&P{}, &P{}} + go func() { + p.s2.x = 1 + c <- true + }() + p.s2 = &P{} + <-c +} diff --git a/src/pkg/runtime/race/testdata/finalizer_test.go b/src/pkg/runtime/race/testdata/finalizer_test.go new file mode 100644 index 000000000..2b2607689 --- /dev/null +++ b/src/pkg/runtime/race/testdata/finalizer_test.go @@ -0,0 +1,67 @@ +// Copyright 2012 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package race_test + +import ( + "runtime" + "sync" + "testing" + "time" +) + +func TestNoRaceFin(t *testing.T) { + c := make(chan bool) + go func() { + x := new(int) + runtime.SetFinalizer(x, func(x *int) { + *x = 42 + }) + *x = 66 + c <- true + }() + <-c + runtime.GC() + time.Sleep(1e8) +} + +var finVar struct { + sync.Mutex + cnt int +} + +func TestNoRaceFinGlobal(t *testing.T) { + c := make(chan bool) + go func() { + x := new(int) + runtime.SetFinalizer(x, func(x *int) { + finVar.Lock() + finVar.cnt++ + finVar.Unlock() + }) + c <- true + }() + <-c + runtime.GC() + time.Sleep(1e8) + finVar.Lock() + finVar.cnt++ + finVar.Unlock() +} + +func TestRaceFin(t *testing.T) { + c := make(chan bool) + y := 0 + go func() { + x := new(int) + runtime.SetFinalizer(x, func(x *int) { + y = 42 + }) + c <- true + }() + <-c + runtime.GC() + time.Sleep(1e8) + y = 66 +} diff --git a/src/pkg/runtime/race/testdata/io_test.go b/src/pkg/runtime/race/testdata/io_test.go new file mode 100644 index 000000000..9eb3552dc --- /dev/null +++ b/src/pkg/runtime/race/testdata/io_test.go @@ -0,0 +1,69 @@ +// Copyright 2012 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. 
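The comp tests above all turn on access granularity: the detector tracks individual addresses, so concurrent writes to different fields of one struct are fine, while replacing the whole struct (s = S{}) conflicts with any concurrent field write. A minimal sketch of the benign case:

package main

type P struct{ x, y int }

func main() {
	var p P
	done := make(chan bool)
	go func() {
		p.x = 1 // touches only &p.x
		done <- true
	}()
	p.y = 2 // touches only &p.y: distinct address, no report
	<-done
	_ = p
}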
+ +package race_test + +import ( + "fmt" + "io/ioutil" + "net/http" + "os" + "path/filepath" + "testing" + "time" +) + +func TestNoRaceIOFile(t *testing.T) { + x := 0 + path, _ := ioutil.TempDir("", "race_test") + fname := filepath.Join(path, "data") + go func() { + x = 42 + f, _ := os.Create(fname) + f.Write([]byte("done")) + f.Close() + }() + for { + f, err := os.Open(fname) + if err != nil { + time.Sleep(1e6) + continue + } + buf := make([]byte, 100) + count, err := f.Read(buf) + if count == 0 { + time.Sleep(1e6) + continue + } + break + } + _ = x +} + +func TestNoRaceIOHttp(t *testing.T) { + x := 0 + go func() { + http.HandleFunc("/", func(w http.ResponseWriter, r *http.Request) { + x = 41 + fmt.Fprintf(w, "test") + x = 42 + }) + err := http.ListenAndServe(":23651", nil) + if err != nil { + t.Fatalf("http.ListenAndServe: %v", err) + } + }() + time.Sleep(1e7) + x = 1 + _, err := http.Get("http://127.0.0.1:23651") + if err != nil { + t.Fatalf("http.Get: %v", err) + } + x = 2 + _, err = http.Get("http://127.0.0.1:23651") + if err != nil { + t.Fatalf("http.Get: %v", err) + } + x = 3 +} diff --git a/src/pkg/runtime/race/testdata/map_test.go b/src/pkg/runtime/race/testdata/map_test.go new file mode 100644 index 000000000..6f86a50b7 --- /dev/null +++ b/src/pkg/runtime/race/testdata/map_test.go @@ -0,0 +1,163 @@ +// Copyright 2012 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package race_test + +import ( + "testing" +) + +func TestRaceMapRW(t *testing.T) { + m := make(map[int]int) + ch := make(chan bool, 1) + go func() { + _ = m[1] + ch <- true + }() + m[1] = 1 + <-ch +} + +func TestRaceMapRW2(t *testing.T) { + m := make(map[int]int) + ch := make(chan bool, 1) + go func() { + _, _ = m[1] + ch <- true + }() + m[1] = 1 + <-ch +} + +func TestRaceMapRWArray(t *testing.T) { + // Check instrumentation of unaddressable arrays (issue 4578). + m := make(map[int][2]int) + ch := make(chan bool, 1) + go func() { + _ = m[1][1] + ch <- true + }() + m[2] = [2]int{1, 2} + <-ch +} + +func TestNoRaceMapRR(t *testing.T) { + m := make(map[int]int) + ch := make(chan bool, 1) + go func() { + _, _ = m[1] + ch <- true + }() + _ = m[1] + <-ch +} + +func TestRaceMapRange(t *testing.T) { + m := make(map[int]int) + ch := make(chan bool, 1) + go func() { + for _ = range m { + } + ch <- true + }() + m[1] = 1 + <-ch +} + +func TestRaceMapRange2(t *testing.T) { + m := make(map[int]int) + ch := make(chan bool, 1) + go func() { + for _ = range m { + } + ch <- true + }() + m[1] = 1 + <-ch +} + +func TestNoRaceMapRangeRange(t *testing.T) { + m := make(map[int]int) + // now the map is not empty and range triggers an event + // should work without this (as in other tests) + // so it is suspicious if this test passes and others don't + m[0] = 0 + ch := make(chan bool, 1) + go func() { + for _ = range m { + } + ch <- true + }() + for _ = range m { + } + <-ch +} + +// Map len is not instrumented. +func TestRaceFailingMapLen(t *testing.T) { + m := make(map[string]bool) + ch := make(chan bool, 1) + go func() { + _ = len(m) + ch <- true + }() + m[""] = true + <-ch +} + +func TestRaceMapDelete(t *testing.T) { + m := make(map[string]bool) + ch := make(chan bool, 1) + go func() { + delete(m, "") + ch <- true + }() + m[""] = true + <-ch +} + +// Map len is not instrumented. 
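As with the atomic cases earlier, the len limitation shows up in a standalone program. A sketch, again assuming go run -race on this tree: the first pair of goroutines goes unreported because len(m) compiles down to a plain read of the map header, while the second pair is caught because map lookups and stores funnel through instrumented runtime calls.

package main

func main() {
	m := make(map[string]bool)
	done := make(chan bool)

	go func() {
		_ = len(m) // plain read of the map header: race missed
		done <- true
	}()
	m["k"] = true
	<-done

	go func() {
		_ = m["k"] // instrumented runtime lookup: race reported
		done <- true
	}()
	m["k2"] = true
	<-done
}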
+func TestRaceFailingMapLenDelete(t *testing.T) { + m := make(map[string]bool) + ch := make(chan bool, 1) + go func() { + delete(m, "a") + ch <- true + }() + _ = len(m) + <-ch +} + +func TestRaceMapVariable(t *testing.T) { + ch := make(chan bool, 1) + m := make(map[int]int) + go func() { + m = make(map[int]int) + ch <- true + }() + m = make(map[int]int) + <-ch +} + +func TestRaceMapVariable2(t *testing.T) { + ch := make(chan bool, 1) + m := make(map[int]int) + go func() { + m[1] = 1 + ch <- true + }() + m = make(map[int]int) + <-ch +} + +func TestRaceMapVariable3(t *testing.T) { + ch := make(chan bool, 1) + m := make(map[int]int) + go func() { + _ = m[1] + ch <- true + }() + m = make(map[int]int) + <-ch +} diff --git a/src/pkg/runtime/race/testdata/mop_test.go b/src/pkg/runtime/race/testdata/mop_test.go new file mode 100644 index 000000000..f2daa3730 --- /dev/null +++ b/src/pkg/runtime/race/testdata/mop_test.go @@ -0,0 +1,1384 @@ +// Copyright 2011 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package race_test + +import ( + "errors" + "fmt" + "runtime" + "sync" + "testing" + "time" + "unsafe" +) + +type Point struct { + x, y int +} + +type NamedPoint struct { + name string + p Point +} + +type DummyWriter struct { + state int +} +type Writer interface { + Write(p []byte) (n int) +} + +func (d DummyWriter) Write(p []byte) (n int) { + return 0 +} + +var GlobalX, GlobalY int = 0, 0 +var GlobalCh chan int = make(chan int, 2) + +func GlobalFunc1() { + GlobalY = GlobalX + GlobalCh <- 1 +} + +func GlobalFunc2() { + GlobalX = 1 + GlobalCh <- 1 +} + +func TestRaceIntRWGlobalFuncs(t *testing.T) { + go GlobalFunc1() + go GlobalFunc2() + <-GlobalCh + <-GlobalCh +} + +func TestRaceIntRWClosures(t *testing.T) { + var x, y int + ch := make(chan int, 2) + + go func() { + y = x + ch <- 1 + }() + go func() { + x = 1 + ch <- 1 + }() + <-ch + <-ch +} + +func TestNoRaceIntRWClosures(t *testing.T) { + var x, y int + ch := make(chan int, 1) + + go func() { + y = x + ch <- 1 + }() + <-ch + go func() { + x = 1 + ch <- 1 + }() + <-ch + +} + +func TestRaceInt32RWClosures(t *testing.T) { + var x, y int32 + ch := make(chan bool, 2) + + go func() { + y = x + ch <- true + }() + go func() { + x = 1 + ch <- true + }() + <-ch + <-ch +} + +func TestNoRaceCase(t *testing.T) { + var y int + for x := -1; x <= 1; x++ { + switch { + case x < 0: + y = -1 + case x == 0: + y = 0 + case x > 0: + y = 1 + } + } + y++ +} + +func TestRaceCaseCondition(t *testing.T) { + var x int = 0 + ch := make(chan int, 2) + + go func() { + x = 2 + ch <- 1 + }() + go func() { + switch x < 2 { + case true: + x = 1 + //case false: + // x = 5 + } + ch <- 1 + }() + <-ch + <-ch +} + +func TestRaceCaseCondition2(t *testing.T) { + // switch body is rearranged by the compiler so the tests + // passes even if we don't instrument '<' + var x int = 0 + ch := make(chan int, 2) + + go func() { + x = 2 + ch <- 1 + }() + go func() { + switch x < 2 { + case true: + x = 1 + case false: + x = 5 + } + ch <- 1 + }() + <-ch + <-ch +} + +func TestRaceCaseBody(t *testing.T) { + var x, y int + ch := make(chan int, 2) + + go func() { + y = x + ch <- 1 + }() + go func() { + switch { + default: + x = 1 + case x == 100: + x = -x + } + ch <- 1 + }() + <-ch + <-ch +} + +func TestNoRaceCaseFallthrough(t *testing.T) { + var x, y, z int + ch := make(chan int, 2) + z = 1 + + go func() { + y = x + ch <- 1 + }() + go func() { + switch { + case z == 1: + case z == 2: + x = 2 + } + ch <- 1 
+ }() + <-ch + <-ch +} + +func TestRaceCaseFallthrough(t *testing.T) { + var x, y, z int + ch := make(chan int, 2) + z = 1 + + go func() { + y = x + ch <- 1 + }() + go func() { + switch { + case z == 1: + fallthrough + case z == 2: + x = 2 + } + ch <- 1 + }() + + <-ch + <-ch +} + +func TestNoRaceRange(t *testing.T) { + ch := make(chan int, 3) + a := [...]int{1, 2, 3} + for _, v := range a { + ch <- v + } + close(ch) +} + +func TestRaceRange(t *testing.T) { + const N = 2 + var a [N]int + var x, y int + done := make(chan bool, N) + for i, v := range a { + go func(i int) { + // we don't want a write-vs-write race + // so there is no array b here + if i == 0 { + x = v + } else { + y = v + } + done <- true + }(i) + } + for i := 0; i < N; i++ { + <-done + } +} + +func TestRacePlus(t *testing.T) { + var x, y, z int + ch := make(chan int, 2) + + go func() { + y = x + z + ch <- 1 + }() + go func() { + y = x + z + z + ch <- 1 + }() + <-ch + <-ch +} + +func TestRacePlus2(t *testing.T) { + var x, y, z int + ch := make(chan int, 2) + + go func() { + x = 1 + ch <- 1 + }() + go func() { + y = +x + z + ch <- 1 + }() + <-ch + <-ch +} + +func TestNoRacePlus(t *testing.T) { + var x, y, z, f int + ch := make(chan int, 2) + + go func() { + y = x + z + ch <- 1 + }() + go func() { + f = z + x + ch <- 1 + }() + <-ch + <-ch +} + +// May crash if the instrumentation is reckless. +func TestNoRaceEnoughRegisters(t *testing.T) { + // from erf.go + const ( + sa1 = 1 + sa2 = 2 + sa3 = 3 + sa4 = 4 + sa5 = 5 + sa6 = 6 + sa7 = 7 + sa8 = 8 + ) + var s, S float64 + s = 3.1415 + S = 1 + s*(sa1+s*(sa2+s*(sa3+s*(sa4+s*(sa5+s*(sa6+s*(sa7+s*sa8))))))) + s = S +} + +// emptyFunc should not be inlined. +func emptyFunc(x int) { + if false { + fmt.Println(x) + } +} + +func TestRaceFuncArgument(t *testing.T) { + var x int + ch := make(chan bool, 1) + go func() { + emptyFunc(x) + ch <- true + }() + x = 1 + <-ch +} + +func TestRaceFuncArgument2(t *testing.T) { + var x int + ch := make(chan bool, 2) + go func() { + x = 42 + ch <- true + }() + go func(y int) { + ch <- true + }(x) + <-ch + <-ch +} + +func TestRaceSprint(t *testing.T) { + var x int + ch := make(chan bool, 1) + go func() { + fmt.Sprint(x) + ch <- true + }() + x = 1 + <-ch +} + +// Not implemented. +func TestRaceFailingArrayCopy(t *testing.T) { + ch := make(chan bool, 1) + var a [5]int + go func() { + a[3] = 1 + ch <- true + }() + a = [5]int{1, 2, 3, 4, 5} + <-ch +} + +func TestRaceStructRW(t *testing.T) { + p := Point{0, 0} + ch := make(chan bool, 1) + go func() { + p = Point{1, 1} + ch <- true + }() + q := p + <-ch + p = q +} + +func TestRaceStructFieldRW1(t *testing.T) { + p := Point{0, 0} + ch := make(chan bool, 1) + go func() { + p.x = 1 + ch <- true + }() + _ = p.x + <-ch +} + +func TestNoRaceStructFieldRW1(t *testing.T) { + // Same struct, different variables, no + // pointers. The layout is known (at compile time?) 
-> + // no read on p + // writes on x and y + p := Point{0, 0} + ch := make(chan bool, 1) + go func() { + p.x = 1 + ch <- true + }() + p.y = 1 + <-ch + _ = p +} + +func TestNoRaceStructFieldRW2(t *testing.T) { + // Same as NoRaceStructFieldRW1 + // but p is a pointer, so there is a read on p + p := Point{0, 0} + ch := make(chan bool, 1) + go func() { + p.x = 1 + ch <- true + }() + p.y = 1 + <-ch + _ = p +} + +func TestRaceStructFieldRW2(t *testing.T) { + p := &Point{0, 0} + ch := make(chan bool, 1) + go func() { + p.x = 1 + ch <- true + }() + _ = p.x + <-ch +} + +func TestRaceStructFieldRW3(t *testing.T) { + p := NamedPoint{name: "a", p: Point{0, 0}} + ch := make(chan bool, 1) + go func() { + p.p.x = 1 + ch <- true + }() + _ = p.p.x + <-ch +} + +func TestRaceEfaceWW(t *testing.T) { + var a, b interface{} + ch := make(chan bool, 1) + go func() { + a = 1 + ch <- true + }() + a = 2 + <-ch + _, _ = a, b +} + +func TestRaceIfaceWW(t *testing.T) { + var a, b Writer + ch := make(chan bool, 1) + go func() { + a = DummyWriter{1} + ch <- true + }() + a = DummyWriter{2} + <-ch + b = a + a = b +} + +func TestRaceEfaceConv(t *testing.T) { + c := make(chan bool) + v := 0 + go func() { + go func(x interface{}) { + }(v) + c <- true + }() + v = 42 + <-c +} + +type OsFile struct{} + +func (*OsFile) Read() { +} + +type IoReader interface { + Read() +} + +func TestRaceIfaceConv(t *testing.T) { + c := make(chan bool) + f := &OsFile{} + go func() { + go func(x IoReader) { + }(f) + c <- true + }() + f = &OsFile{} + <-c +} + +func TestRaceError(t *testing.T) { + ch := make(chan bool, 1) + var err error + go func() { + err = nil + ch <- true + }() + _ = err + <-ch +} + +func TestRaceIntptrRW(t *testing.T) { + var x, y int + var p *int = &x + ch := make(chan bool, 1) + go func() { + *p = 5 + ch <- true + }() + y = *p + x = y + <-ch +} + +func TestRaceStringRW(t *testing.T) { + ch := make(chan bool, 1) + s := "" + go func() { + s = "abacaba" + ch <- true + }() + _ = s + <-ch +} + +func TestRaceStringPtrRW(t *testing.T) { + ch := make(chan bool, 1) + var x string + p := &x + go func() { + *p = "a" + ch <- true + }() + _ = *p + <-ch +} + +func TestRaceFloat64WW(t *testing.T) { + var x, y float64 + ch := make(chan bool, 1) + go func() { + x = 1.0 + ch <- true + }() + x = 2.0 + <-ch + + y = x + x = y +} + +func TestRaceComplex128WW(t *testing.T) { + var x, y complex128 + ch := make(chan bool, 1) + go func() { + x = 2 + 2i + ch <- true + }() + x = 4 + 4i + <-ch + + y = x + x = y +} + +func TestRaceUnsafePtrRW(t *testing.T) { + var x, y, z int + x, y, z = 1, 2, 3 + var p unsafe.Pointer = unsafe.Pointer(&x) + ch := make(chan bool, 1) + go func() { + p = (unsafe.Pointer)(&z) + ch <- true + }() + y = *(*int)(p) + x = y + <-ch +} + +func TestRaceFuncVariableRW(t *testing.T) { + var f func(x int) int + f = func(x int) int { + return x * x + } + ch := make(chan bool, 1) + go func() { + f = func(x int) int { + return x + } + ch <- true + }() + y := f(1) + <-ch + x := y + y = x +} + +func TestRaceFuncVariableWW(t *testing.T) { + var f func(x int) int + ch := make(chan bool, 1) + go func() { + f = func(x int) int { + return x + } + ch <- true + }() + f = func(x int) int { + return x * x + } + <-ch +} + +// This one should not belong to mop_test +func TestRacePanic(t *testing.T) { + var x int + var zero int = 0 + ch := make(chan bool, 2) + go func() { + defer func() { + err := recover() + if err == nil { + panic("should be panicking") + } + x = 1 + ch <- true + }() + var y int = 1 / zero + zero = y + }() + go func() { + defer 
func() { + err := recover() + if err == nil { + panic("should be panicking") + } + x = 2 + ch <- true + }() + var y int = 1 / zero + zero = y + }() + + <-ch + <-ch + if zero != 0 { + panic("zero has changed") + } +} + +func TestNoRaceBlank(t *testing.T) { + var a [5]int + ch := make(chan bool, 1) + go func() { + _, _ = a[0], a[1] + ch <- true + }() + _, _ = a[2], a[3] + <-ch + a[1] = a[0] +} + +func TestRaceAppendRW(t *testing.T) { + a := make([]int, 10) + ch := make(chan bool) + go func() { + _ = append(a, 1) + ch <- true + }() + a[0] = 1 + <-ch +} + +func TestRaceAppendLenRW(t *testing.T) { + a := make([]int, 0) + ch := make(chan bool) + go func() { + a = append(a, 1) + ch <- true + }() + _ = len(a) + <-ch +} + +func TestRaceAppendCapRW(t *testing.T) { + a := make([]int, 0) + ch := make(chan string) + go func() { + a = append(a, 1) + ch <- "" + }() + _ = cap(a) + <-ch +} + +func TestNoRaceFuncArgsRW(t *testing.T) { + ch := make(chan byte, 1) + var x byte + go func(y byte) { + _ = y + ch <- 0 + }(x) + x = 1 + <-ch +} + +func TestRaceFuncArgsRW(t *testing.T) { + ch := make(chan byte, 1) + var x byte + go func(y *byte) { + _ = *y + ch <- 0 + }(&x) + x = 1 + <-ch +} + +// from the mailing list, slightly modified +// unprotected concurrent access to seen[] +func TestRaceCrawl(t *testing.T) { + url := "dummyurl" + depth := 3 + seen := make(map[string]bool) + ch := make(chan int, 100) + var wg sync.WaitGroup + var crawl func(string, int) + crawl = func(u string, d int) { + nurl := 0 + defer func() { + ch <- nurl + }() + seen[u] = true + if d <= 0 { + return + } + urls := [...]string{"a", "b", "c"} + for _, uu := range urls { + if _, ok := seen[uu]; !ok { + wg.Add(1) + go crawl(uu, d-1) + nurl++ + } + } + wg.Done() + } + wg.Add(1) + go crawl(url, depth) + wg.Wait() +} + +func TestRaceIndirection(t *testing.T) { + ch := make(chan struct{}, 1) + var y int + var x *int = &y + go func() { + *x = 1 + ch <- struct{}{} + }() + *x = 2 + <-ch + _ = *x +} + +func TestRaceRune(t *testing.T) { + c := make(chan bool) + var x rune + go func() { + x = 1 + c <- true + }() + _ = x + <-c +} + +func TestRaceEmptyInterface1(t *testing.T) { + c := make(chan bool) + var x interface{} + go func() { + x = nil + c <- true + }() + _ = x + <-c +} + +func TestRaceEmptyInterface2(t *testing.T) { + c := make(chan bool) + var x interface{} + go func() { + x = &Point{} + c <- true + }() + _ = x + <-c +} + +func TestRaceTLS(t *testing.T) { + comm := make(chan *int) + done := make(chan bool, 2) + go func() { + var x int + comm <- &x + x = 1 + x = *(<-comm) + done <- true + }() + go func() { + p := <-comm + *p = 2 + comm <- p + done <- true + }() + <-done + <-done +} + +func TestNoRaceHeapReallocation(t *testing.T) { + // It is possible that a future implementation + // of memory allocation will ruin this test. + // Increasing n might help in this case, so + // this test is a bit more generic than most of the + // others. + const n = 2 + done := make(chan bool, n) + empty := func(p *int) {} + for i := 0; i < n; i++ { + ms := i + go func() { + <-time.After(time.Duration(ms) * time.Millisecond) + runtime.GC() + var x int + empty(&x) // x goes to the heap + done <- true + }() + } + for i := 0; i < n; i++ { + <-done + } +} + +func TestRaceAnd(t *testing.T) { + c := make(chan bool) + x, y := 0, 0 + go func() { + x = 1 + c <- true + }() + if x == 1 && y == 1 { + } + <-c +} + +// OANDAND is not instrumented in the compiler. 
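The asymmetry between TestRaceAnd above and the Failing variant below is short-circuit evaluation: the left operand of && is evaluated unconditionally and gets instrumented, while the conditionally-executed right operand is where this compiler stops instrumenting. A sketch of the shape that slips through:

package main

func main() {
	done := make(chan bool)
	x, y := 0, 0
	go func() {
		x = 1 // racy write
		done <- true
	}()
	if y == 0 && x == 1 { // the read of x hides in the uninstrumented RHS
	}
	<-done
}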
+func TestRaceFailingAnd2(t *testing.T) { + c := make(chan bool) + x, y := 0, 0 + go func() { + x = 1 + c <- true + }() + if y == 0 && x == 1 { + } + <-c +} + +func TestNoRaceAnd(t *testing.T) { + c := make(chan bool) + x, y := 0, 0 + go func() { + x = 1 + c <- true + }() + if y == 1 && x == 1 { + } + <-c +} + +func TestRaceOr(t *testing.T) { + c := make(chan bool) + x, y := 0, 0 + go func() { + x = 1 + c <- true + }() + if x == 1 || y == 1 { + } + <-c +} + +// OOROR is not instrumented in the compiler. +func TestRaceFailingOr2(t *testing.T) { + c := make(chan bool) + x, y := 0, 0 + go func() { + x = 1 + c <- true + }() + if y == 1 || x == 1 { + } + <-c +} + +func TestNoRaceOr(t *testing.T) { + c := make(chan bool) + x, y := 0, 0 + go func() { + x = 1 + c <- true + }() + if y == 0 || x == 1 { + } + <-c +} + +func TestNoRaceShortCalc(t *testing.T) { + c := make(chan bool) + x, y := 0, 0 + go func() { + y = 1 + c <- true + }() + if x == 0 || y == 0 { + } + <-c +} + +func TestNoRaceShortCalc2(t *testing.T) { + c := make(chan bool) + x, y := 0, 0 + go func() { + y = 1 + c <- true + }() + if x == 1 && y == 0 { + } + <-c +} + +func TestRaceFuncItself(t *testing.T) { + c := make(chan bool) + f := func() {} + go func() { + f() + c <- true + }() + f = func() {} + <-c +} + +func TestNoRaceFuncUnlock(t *testing.T) { + ch := make(chan bool, 1) + var mu sync.Mutex + x := 0 + go func() { + mu.Lock() + x = 42 + mu.Unlock() + ch <- true + }() + x = func(mu *sync.Mutex) int { + mu.Lock() + return 43 + }(&mu) + mu.Unlock() + <-ch +} + +func TestRaceStructInit(t *testing.T) { + type X struct { + x, y int + } + c := make(chan bool, 1) + y := 0 + go func() { + y = 42 + c <- true + }() + x := X{x: y} + _ = x + <-c +} + +func TestRaceArrayInit(t *testing.T) { + c := make(chan bool, 1) + y := 0 + go func() { + y = 42 + c <- true + }() + x := []int{0, y, 42} + _ = x + <-c +} + +func TestRaceMapInit(t *testing.T) { + c := make(chan bool, 1) + y := 0 + go func() { + y = 42 + c <- true + }() + x := map[int]int{0: 42, y: 42} + _ = x + <-c +} + +func TestRaceMapInit2(t *testing.T) { + c := make(chan bool, 1) + y := 0 + go func() { + y = 42 + c <- true + }() + x := map[int]int{0: 42, 42: y} + _ = x + <-c +} + +type Inter interface { + Foo(x int) +} +type InterImpl struct { + x, y int +} + +func (p InterImpl) Foo(x int) { + // prevent inlining + z := 42 + x = 85 + y := x / z + z = y * z + x = z * y + _, _, _ = x, y, z +} + +func TestRaceInterCall(t *testing.T) { + c := make(chan bool, 1) + p := InterImpl{} + var x Inter = p + go func() { + p2 := InterImpl{} + x = p2 + c <- true + }() + x.Foo(0) + <-c +} + +func TestRaceInterCall2(t *testing.T) { + c := make(chan bool, 1) + p := InterImpl{} + var x Inter = p + z := 0 + go func() { + z = 42 + c <- true + }() + x.Foo(z) + <-c +} + +func TestRaceFuncCall(t *testing.T) { + c := make(chan bool, 1) + f := func(x, y int) {} + x, y := 0, 0 + go func() { + y = 42 + c <- true + }() + f(x, y) + <-c +} + +func TestRaceMethodCall(t *testing.T) { + c := make(chan bool, 1) + i := InterImpl{} + x := 0 + go func() { + x = 42 + c <- true + }() + i.Foo(x) + <-c +} + +func TestRaceMethodCall2(t *testing.T) { + c := make(chan bool, 1) + i := &InterImpl{} + go func() { + i = &InterImpl{} + c <- true + }() + i.Foo(0) + <-c +} + +func TestRacePanicArg(t *testing.T) { + c := make(chan bool, 1) + err := errors.New("err") + go func() { + err = errors.New("err2") + c <- true + }() + defer func() { + recover() + <-c + }() + panic(err) +} + +func TestRaceDeferArg(t *testing.T) { + c := make(chan bool, 
1) + x := 0 + go func() { + x = 42 + c <- true + }() + func() { + defer func(x int) { + }(x) + }() + <-c +} + +type DeferT int + +func (d DeferT) Foo() { +} + +func TestRaceDeferArg2(t *testing.T) { + c := make(chan bool, 1) + var x DeferT + go func() { + var y DeferT + x = y + c <- true + }() + func() { + defer x.Foo() + }() + <-c +} + +func TestNoRaceAddrExpr(t *testing.T) { + c := make(chan bool, 1) + x := 0 + go func() { + x = 42 + c <- true + }() + _ = &x + <-c +} + +type AddrT struct { + _ [256]byte + x int +} + +type AddrT2 struct { + _ [512]byte + p *AddrT +} + +func TestRaceAddrExpr(t *testing.T) { + c := make(chan bool, 1) + a := AddrT2{p: &AddrT{x: 42}} + go func() { + a.p = &AddrT{x: 43} + c <- true + }() + _ = &a.p.x + <-c +} + +func TestRaceTypeAssert(t *testing.T) { + c := make(chan bool, 1) + x := 0 + var i interface{} = x + go func() { + y := 0 + i = y + c <- true + }() + _ = i.(int) + <-c +} + +func TestRaceBlockAs(t *testing.T) { + c := make(chan bool, 1) + var x, y int + go func() { + x = 42 + c <- true + }() + x, y = y, x + <-c +} + +func TestRaceSliceSlice(t *testing.T) { + c := make(chan bool, 1) + x := make([]int, 10) + go func() { + x = make([]int, 20) + c <- true + }() + _ = x[2:3] + <-c +} + +func TestRaceSliceSlice2(t *testing.T) { + c := make(chan bool, 1) + x := make([]int, 10) + i := 2 + go func() { + i = 3 + c <- true + }() + _ = x[i:4] + <-c +} + +// http://golang.org/issue/4453 +func TestRaceFailingSliceStruct(t *testing.T) { + type X struct { + x, y int + } + c := make(chan bool, 1) + x := make([]X, 10) + go func() { + y := make([]X, 10) + copy(y, x) + c <- true + }() + x[1].y = 42 + <-c +} + +func TestRaceStructInd(t *testing.T) { + c := make(chan bool, 1) + type Item struct { + x, y int + } + i := Item{} + go func(p *Item) { + *p = Item{} + c <- true + }(&i) + i.y = 42 + <-c +} + +func TestRaceAsFunc1(t *testing.T) { + var s []byte + c := make(chan bool, 1) + go func() { + var err error + s, err = func() ([]byte, error) { + t := []byte("hello world") + return t, nil + }() + c <- true + _ = err + }() + _ = string(s) + <-c +} + +func TestRaceAsFunc2(t *testing.T) { + c := make(chan bool, 1) + x := 0 + go func() { + func(x int) { + }(x) + c <- true + }() + x = 42 + <-c +} + +func TestRaceAsFunc3(t *testing.T) { + c := make(chan bool, 1) + var mu sync.Mutex + x := 0 + go func() { + func(x int) { + mu.Lock() + }(x) // Read of x must be outside of the mutex. + mu.Unlock() + c <- true + }() + mu.Lock() + x = 42 + mu.Unlock() + <-c +} + +func TestNoRaceAsFunc4(t *testing.T) { + c := make(chan bool, 1) + var mu sync.Mutex + x := 0 + go func() { + x = func() int { // Write of x must be under the mutex. 
+ mu.Lock() + return 42 + }() + mu.Unlock() + c <- true + }() + mu.Lock() + x = 42 + mu.Unlock() + <-c +} + +func TestRaceHeapParam(t *testing.T) { + x := func() (x int) { + go func() { + x = 42 + }() + return + }() + _ = x +} + +func TestNoRaceEmptyStruct(t *testing.T) { + type Empty struct{} + type X struct { + y int64 + Empty + } + type Y struct { + x X + y int64 + } + c := make(chan X) + var y Y + go func() { + x := y.x + c <- x + }() + y.y = 42 + <-c +} + +func TestRaceNestedStruct(t *testing.T) { + type X struct { + x, y int + } + type Y struct { + x X + } + c := make(chan Y) + var y Y + go func() { + c <- y + }() + y.x.y = 42 + <-c +} diff --git a/src/pkg/runtime/race/testdata/mutex_test.go b/src/pkg/runtime/race/testdata/mutex_test.go new file mode 100644 index 000000000..3cf03ae6b --- /dev/null +++ b/src/pkg/runtime/race/testdata/mutex_test.go @@ -0,0 +1,138 @@ +// Copyright 2012 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package race_test + +import ( + "sync" + "testing" + "time" +) + +func TestNoRaceMutex(t *testing.T) { + var mu sync.Mutex + var x int16 = 0 + ch := make(chan bool, 2) + go func() { + mu.Lock() + defer mu.Unlock() + x = 1 + ch <- true + }() + go func() { + mu.Lock() + x = 2 + mu.Unlock() + ch <- true + }() + <-ch + <-ch +} + +func TestRaceMutex(t *testing.T) { + var mu sync.Mutex + var x int16 = 0 + ch := make(chan bool, 2) + go func() { + x = 1 + mu.Lock() + defer mu.Unlock() + ch <- true + }() + go func() { + x = 2 + mu.Lock() + mu.Unlock() + ch <- true + }() + <-ch + <-ch +} + +func TestRaceMutex2(t *testing.T) { + var mu1 sync.Mutex + var mu2 sync.Mutex + var x int8 = 0 + ch := make(chan bool, 2) + go func() { + mu1.Lock() + defer mu1.Unlock() + x = 1 + ch <- true + }() + go func() { + mu2.Lock() + x = 2 + mu2.Unlock() + ch <- true + }() + <-ch + <-ch +} + +func TestNoRaceMutexPureHappensBefore(t *testing.T) { + var mu sync.Mutex + var x int16 = 0 + ch := make(chan bool, 2) + go func() { + x = 1 + mu.Lock() + mu.Unlock() + ch <- true + }() + go func() { + <-time.After(1e5) + mu.Lock() + mu.Unlock() + x = 1 + ch <- true + }() + <-ch + <-ch +} + +func TestNoRaceMutexSemaphore(t *testing.T) { + var mu sync.Mutex + ch := make(chan bool, 2) + x := 0 + mu.Lock() + go func() { + x = 1 + mu.Unlock() + ch <- true + }() + go func() { + mu.Lock() + x = 2 + mu.Unlock() + ch <- true + }() + <-ch + <-ch +} + +// from doc/go_mem.html +func TestNoRaceMutexExampleFromHtml(t *testing.T) { + var l sync.Mutex + a := "" + + l.Lock() + go func() { + a = "hello, world" + l.Unlock() + }() + l.Lock() + _ = a +} + +func TestRaceMutexOverwrite(t *testing.T) { + c := make(chan bool, 1) + var mu sync.Mutex + go func() { + mu = sync.Mutex{} + c <- true + }() + mu.Lock() + <-c +} diff --git a/src/pkg/runtime/race/testdata/regression_test.go b/src/pkg/runtime/race/testdata/regression_test.go new file mode 100644 index 000000000..afe8cc5ec --- /dev/null +++ b/src/pkg/runtime/race/testdata/regression_test.go @@ -0,0 +1,150 @@ +// Copyright 2012 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// Code patterns that caused problems in the past. 
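The first pattern in the file that follows, NewLog, races a goroutine's read of the named return value l against the assignment l = LogImpl{} in the function body. A reduced sketch of the same shape, with illustrative names that are not taken from the patch:

```go
package main

// NewCounter mirrors the NewLog pattern below: the goroutine captures the
// named return value v by reference, so its read is concurrent with the
// plain assignment in the function body and nothing orders the two.
func NewCounter() (v int) {
	done := make(chan bool)
	go func() {
		_ = v // concurrent read of the named return value
		done <- true
	}()
	v = 42 // racy write: no synchronization precedes the read above
	<-done
	return
}

func main() {
	_ = NewCounter()
}
```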
+ +package race_test + +import ( + "testing" +) + +type LogImpl struct { + x int +} + +func NewLog() (l LogImpl) { + c := make(chan bool) + go func() { + _ = l + c <- true + }() + l = LogImpl{} + <-c + return +} + +var _ LogImpl = NewLog() + +func MakeMap() map[int]int { + return make(map[int]int) +} + +func InstrumentMapLen() { + _ = len(MakeMap()) +} + +func InstrumentMapLen2() { + m := make(map[int]map[int]int) + _ = len(m[0]) +} + +func InstrumentMapLen3() { + m := make(map[int]*map[int]int) + _ = len(*m[0]) +} + +type Rect struct { + x, y int +} + +type Image struct { + min, max Rect +} + +func NewImage() Image { + var pleaseDoNotInlineMe stack + pleaseDoNotInlineMe.push(1) + _ = pleaseDoNotInlineMe.pop() + return Image{} +} + +func AddrOfTemp() { + _ = NewImage().min +} + +type TypeID int + +func (t *TypeID) encodeType(x int) (tt TypeID, err error) { + switch x { + case 0: + return t.encodeType(x * x) + } + return 0, nil +} + +type stack []int + +func (s *stack) push(x int) { + *s = append(*s, x) +} + +func (s *stack) pop() int { + i := len(*s) + n := (*s)[i-1] + *s = (*s)[:i-1] + return n +} + +func TestNoRaceStackPushPop(t *testing.T) { + var s stack + go func(s *stack) {}(&s) + s.push(1) + x := s.pop() + _ = x +} + +type RpcChan struct { + c chan bool +} + +var makeChanCalls int + +func makeChan() *RpcChan { + var pleaseDoNotInlineMe stack + pleaseDoNotInlineMe.push(1) + _ = pleaseDoNotInlineMe.pop() + + makeChanCalls++ + c := &RpcChan{make(chan bool, 1)} + c.c <- true + return c +} + +func call() bool { + x := <-makeChan().c + return x +} + +func TestNoRaceRpcChan(t *testing.T) { + makeChanCalls = 0 + _ = call() + if makeChanCalls != 1 { + t.Fatalf("makeChanCalls %d, expected 1\n", makeChanCalls) + } +} + +func divInSlice() { + v := make([]int64, 10) + i := 1 + _ = v[(i*4)/3] +} + +func TestNoRaceReturn(t *testing.T) { + c := make(chan int) + noRaceReturn(c) + <-c +} + +// Return used to do an implicit a = a, causing a read/write race +// with the goroutine. Compiler has an optimization to avoid that now. +// See issue 4014. +func noRaceReturn(c chan int) (a, b int) { + a = 42 + go func() { + _ = a + c <- 1 + }() + return a, 10 +} diff --git a/src/pkg/runtime/race/testdata/rwmutex_test.go b/src/pkg/runtime/race/testdata/rwmutex_test.go new file mode 100644 index 000000000..85cb5df3c --- /dev/null +++ b/src/pkg/runtime/race/testdata/rwmutex_test.go @@ -0,0 +1,134 @@ +// Copyright 2012 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. 
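The file that follows probes the happens-before edges of sync.RWMutex: a writer's Lock/Unlock pair orders against later readers and writers, while concurrent RLock holders are not ordered against each other. For contrast, the conventional race-free reader/writer pattern looks like this (a generic sketch, not taken from the tests):

```go
package main

import "sync"

// counter guards n with an RWMutex: writers take the exclusive lock and
// readers the shared one, so every access to n is ordered by the mutex.
type counter struct {
	mu sync.RWMutex
	n  int
}

func (c *counter) inc() {
	c.mu.Lock()
	c.n++
	c.mu.Unlock()
}

func (c *counter) get() int {
	c.mu.RLock()
	defer c.mu.RUnlock()
	return c.n
}

func main() {
	var c counter
	done := make(chan bool)
	go func() { c.inc(); done <- true }()
	_ = c.get()
	<-done
}
```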
+ +package race_test + +import ( + "sync" + "testing" + "time" +) + +func TestRaceMutexRWMutex(t *testing.T) { + var mu1 sync.Mutex + var mu2 sync.RWMutex + var x int16 = 0 + ch := make(chan bool, 2) + go func() { + mu1.Lock() + defer mu1.Unlock() + x = 1 + ch <- true + }() + go func() { + mu2.Lock() + x = 2 + mu2.Unlock() + ch <- true + }() + <-ch + <-ch +} + +func TestNoRaceRWMutex(t *testing.T) { + var mu sync.RWMutex + var x, y int64 = 0, 1 + ch := make(chan bool, 2) + go func() { + mu.Lock() + defer mu.Unlock() + x = 2 + ch <- true + }() + go func() { + mu.RLock() + y = x + mu.RUnlock() + ch <- true + }() + <-ch + <-ch +} + +func TestRaceRWMutexMultipleReaders(t *testing.T) { + var mu sync.RWMutex + var x, y int64 = 0, 1 + ch := make(chan bool, 3) + go func() { + mu.Lock() + defer mu.Unlock() + x = 2 + ch <- true + }() + go func() { + mu.RLock() + y = x + 1 + mu.RUnlock() + ch <- true + }() + go func() { + mu.RLock() + y = x + 2 + mu.RUnlock() + ch <- true + }() + <-ch + <-ch + <-ch + _ = y +} + +func TestNoRaceRWMutexMultipleReaders(t *testing.T) { + var mu sync.RWMutex + x := int64(0) + ch := make(chan bool, 3) + go func() { + mu.Lock() + defer mu.Unlock() + x = 2 + ch <- true + }() + go func() { + mu.RLock() + y := x + 1 + _ = y + mu.RUnlock() + ch <- true + }() + go func() { + mu.RLock() + y := x + 2 + _ = y + mu.RUnlock() + ch <- true + }() + <-ch + <-ch + <-ch +} + +func TestNoRaceRWMutexTransitive(t *testing.T) { + var mu sync.RWMutex + x := int64(0) + ch := make(chan bool, 2) + go func() { + mu.RLock() + _ = x + mu.RUnlock() + ch <- true + }() + go func() { + time.Sleep(1e7) + mu.RLock() + _ = x + mu.RUnlock() + ch <- true + }() + time.Sleep(2e7) + mu.Lock() + x = 42 + mu.Unlock() + <-ch + <-ch +} diff --git a/src/pkg/runtime/race/testdata/select_test.go b/src/pkg/runtime/race/testdata/select_test.go new file mode 100644 index 000000000..4a3a23647 --- /dev/null +++ b/src/pkg/runtime/race/testdata/select_test.go @@ -0,0 +1,286 @@ +// Copyright 2012 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package race_test + +import ( + "runtime" + "testing" +) + +func TestNoRaceSelect1(t *testing.T) { + var x int + compl := make(chan bool) + c := make(chan bool) + c1 := make(chan bool) + + go func() { + x = 1 + // At least two channels are needed because + // otherwise the compiler optimizes select out. + // See comment in runtime/chan.c:^selectgo. 
+ select { + case c <- true: + case c1 <- true: + } + compl <- true + }() + select { + case <-c: + case c1 <- true: + } + x = 2 + <-compl +} + +func TestNoRaceSelect2(t *testing.T) { + var x int + compl := make(chan bool) + c := make(chan bool) + c1 := make(chan bool) + go func() { + select { + case <-c: + case <-c1: + } + x = 1 + compl <- true + }() + x = 2 + close(c) + runtime.Gosched() + <-compl +} + +func TestNoRaceSelect3(t *testing.T) { + var x int + compl := make(chan bool) + c := make(chan bool, 10) + c1 := make(chan bool) + go func() { + x = 1 + select { + case c <- true: + case <-c1: + } + compl <- true + }() + <-c + x = 2 + <-compl +} + +func TestNoRaceSelect4(t *testing.T) { + type Task struct { + f func() + done chan bool + } + + queue := make(chan Task) + dummy := make(chan bool) + + go func() { + for { + select { + case t := <-queue: + t.f() + t.done <- true + } + } + }() + + doit := func(f func()) { + done := make(chan bool, 1) + select { + case queue <- Task{f, done}: + case <-dummy: + } + select { + case <-done: + case <-dummy: + } + } + + var x int + doit(func() { + x = 1 + }) + _ = x +} + +func TestNoRaceSelect5(t *testing.T) { + test := func(sel, needSched bool) { + var x int + ch := make(chan bool) + c1 := make(chan bool) + + done := make(chan bool, 2) + go func() { + if needSched { + runtime.Gosched() + } + // println(1) + x = 1 + if sel { + select { + case ch <- true: + case <-c1: + } + } else { + ch <- true + } + done <- true + }() + + go func() { + // println(2) + if sel { + select { + case <-ch: + case <-c1: + } + } else { + <-ch + } + x = 1 + done <- true + }() + <-done + <-done + } + + test(true, true) + test(true, false) + test(false, true) + test(false, false) +} + +func TestRaceSelect1(t *testing.T) { + var x int + compl := make(chan bool, 2) + c := make(chan bool) + c1 := make(chan bool) + + go func() { + <-c + <-c + }() + f := func() { + select { + case c <- true: + case c1 <- true: + } + x = 1 + compl <- true + } + go f() + go f() + <-compl + <-compl +} + +func TestRaceSelect2(t *testing.T) { + var x int + compl := make(chan bool) + c := make(chan bool) + c1 := make(chan bool) + go func() { + x = 1 + select { + case <-c: + case <-c1: + } + compl <- true + }() + close(c) + x = 2 + <-compl +} + +func TestRaceSelect3(t *testing.T) { + var x int + compl := make(chan bool) + c := make(chan bool) + c1 := make(chan bool) + go func() { + x = 1 + select { + case c <- true: + case c1 <- true: + } + compl <- true + }() + x = 2 + select { + case <-c: + } + <-compl +} + +func TestRaceSelect4(t *testing.T) { + done := make(chan bool, 1) + var x int + go func() { + select { + default: + x = 2 + } + done <- true + }() + _ = x + <-done +} + +// The idea behind this test: +// there are two variables, access to one +// of them is synchronized, access to the other +// is not. +// Select must (unconditionally) choose the non-synchronized variable +// thus causing exactly one race. +// Currently this test doesn't look like it accomplishes +// this goal.
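One way to read the comment above: c1 has capacity 1 and starts empty, so the send on it is always ready, while the send on the unbuffered c2 can never proceed because nothing receives from it. The select below can therefore only ever take the c1 case, so only x is written and only the unsynchronized read of x can race. A standalone demonstration of that select determinism (illustrative, separate from the test):

```go
package main

import "fmt"

func main() {
	c1 := make(chan bool, 1) // buffered: the send is immediately ready
	c2 := make(chan bool)    // unbuffered, never received from: send never ready
	for i := 0; i < 3; i++ {
		select {
		case c1 <- true:
			fmt.Println("c1 case chosen") // taken on every iteration
			<-c1                          // drain so the next send is ready again
		case c2 <- true:
			fmt.Println("c2 case chosen") // never reached
		}
	}
}
```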
+func TestRaceSelect5(t *testing.T) { + done := make(chan bool, 1) + c1 := make(chan bool, 1) + c2 := make(chan bool) + var x, y int + go func() { + select { + case c1 <- true: + x = 1 + case c2 <- true: + y = 1 + } + done <- true + }() + _ = x + _ = y + <-done +} + +// select statements may introduce +// flakiness: whether this test contains +// a race depends on the scheduling +// (some may argue that the code contains +// this race by definition) +/* +func TestFlakyDefault(t *testing.T) { + var x int + c := make(chan bool, 1) + done := make(chan bool, 1) + go func() { + select { + case <-c: + x = 2 + default: + x = 3 + } + done <- true + }() + x = 1 + c <- true + _ = x + <-done +} +*/ diff --git a/src/pkg/runtime/race/testdata/slice_test.go b/src/pkg/runtime/race/testdata/slice_test.go new file mode 100644 index 000000000..773463662 --- /dev/null +++ b/src/pkg/runtime/race/testdata/slice_test.go @@ -0,0 +1,465 @@ +// Copyright 2012 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package race_test + +import ( + "testing" +) + +func TestRaceSliceRW(t *testing.T) { + ch := make(chan bool, 1) + a := make([]int, 2) + go func() { + a[1] = 1 + ch <- true + }() + _ = a[1] + <-ch +} + +func TestNoRaceSliceRW(t *testing.T) { + ch := make(chan bool, 1) + a := make([]int, 2) + go func() { + a[0] = 1 + ch <- true + }() + _ = a[1] + <-ch +} + +func TestRaceSliceWW(t *testing.T) { + a := make([]int, 10) + ch := make(chan bool, 1) + go func() { + a[1] = 1 + ch <- true + }() + a[1] = 2 + <-ch +} + +func TestNoRaceArrayWW(t *testing.T) { + var a [5]int + ch := make(chan bool, 1) + go func() { + a[0] = 1 + ch <- true + }() + a[1] = 2 + <-ch +} + +func TestRaceArrayWW(t *testing.T) { + var a [5]int + ch := make(chan bool, 1) + go func() { + a[1] = 1 + ch <- true + }() + a[1] = 2 + <-ch +} + +func TestNoRaceSliceWriteLen(t *testing.T) { + ch := make(chan bool, 1) + a := make([]bool, 1) + go func() { + a[0] = true + ch <- true + }() + _ = len(a) + <-ch +} + +func TestNoRaceSliceWriteCap(t *testing.T) { + ch := make(chan bool, 1) + a := make([]uint64, 100) + go func() { + a[50] = 123 + ch <- true + }() + _ = cap(a) + <-ch +} + +func TestRaceSliceCopyRead(t *testing.T) { + ch := make(chan bool, 1) + a := make([]int, 10) + b := make([]int, 10) + go func() { + _ = a[5] + ch <- true + }() + copy(a, b) + <-ch +} + +func TestNoRaceSliceWriteCopy(t *testing.T) { + ch := make(chan bool, 1) + a := make([]int, 10) + b := make([]int, 10) + go func() { + a[5] = 1 + ch <- true + }() + copy(a[:5], b[:5]) + <-ch +} + +func TestRaceSliceCopyWrite2(t *testing.T) { + ch := make(chan bool, 1) + a := make([]int, 10) + b := make([]int, 10) + go func() { + b[5] = 1 + ch <- true + }() + copy(a, b) + <-ch +} + +func TestRaceSliceCopyWrite3(t *testing.T) { + ch := make(chan bool, 1) + a := make([]byte, 10) + go func() { + a[7] = 1 + ch <- true + }() + copy(a, "qwertyqwerty") + <-ch +} + +func TestNoRaceSliceCopyRead(t *testing.T) { + ch := make(chan bool, 1) + a := make([]int, 10) + b := make([]int, 10) + go func() { + _ = b[5] + ch <- true + }() + copy(a, b) + <-ch +} + +func TestNoRaceSliceWriteSlice2(t *testing.T) { + ch := make(chan bool, 1) + a := make([]float64, 10) + go func() { + a[2] = 1.0 + ch <- true + }() + _ = a[0:5] + <-ch +} + +func TestRaceSliceWriteSlice(t *testing.T) { + ch := make(chan bool, 1) + a := make([]float64, 10) + go func() { + a[2] = 1.0 + ch <- true + }() + a = a[5:10] + <-ch +} + +func 
TestNoRaceSliceWriteSlice(t *testing.T) { + ch := make(chan bool, 1) + a := make([]float64, 10) + go func() { + a[2] = 1.0 + ch <- true + }() + _ = a[5:10] + <-ch +} + +func TestNoRaceSliceLenCap(t *testing.T) { + ch := make(chan bool, 1) + a := make([]struct{}, 10) + go func() { + _ = len(a) + ch <- true + }() + _ = cap(a) + <-ch +} + +func TestNoRaceStructSlicesRangeWrite(t *testing.T) { + type Str struct { + a []int + b []int + } + ch := make(chan bool, 1) + var s Str + s.a = make([]int, 10) + s.b = make([]int, 10) + go func() { + for _ = range s.a { + } + ch <- true + }() + s.b[5] = 5 + <-ch +} + +func TestRaceSliceDifferent(t *testing.T) { + c := make(chan bool, 1) + s := make([]int, 10) + s2 := s + go func() { + s[3] = 3 + c <- true + }() + // false negative because s2 is PAUTO w/o PHEAP + // so we do not instrument it + s2[3] = 3 + <-c +} + +func TestRaceSliceRangeWrite(t *testing.T) { + c := make(chan bool, 1) + s := make([]int, 10) + go func() { + s[3] = 3 + c <- true + }() + for _, v := range s { + _ = v + } + <-c +} + +func TestNoRaceSliceRangeWrite(t *testing.T) { + c := make(chan bool, 1) + s := make([]int, 10) + go func() { + s[3] = 3 + c <- true + }() + for _ = range s { + } + <-c +} + +func TestRaceSliceRangeAppend(t *testing.T) { + c := make(chan bool, 1) + s := make([]int, 10) + go func() { + s = append(s, 3) + c <- true + }() + for _ = range s { + } + <-c +} + +func TestNoRaceSliceRangeAppend(t *testing.T) { + c := make(chan bool, 1) + s := make([]int, 10) + go func() { + _ = append(s, 3) + c <- true + }() + for _ = range s { + } + <-c +} + +func TestRaceSliceVarWrite(t *testing.T) { + c := make(chan bool, 1) + s := make([]int, 10) + go func() { + s[3] = 3 + c <- true + }() + s = make([]int, 20) + <-c +} + +func TestRaceSliceVarRead(t *testing.T) { + c := make(chan bool, 1) + s := make([]int, 10) + go func() { + _ = s[3] + c <- true + }() + s = make([]int, 20) + <-c +} + +func TestRaceSliceVarRange(t *testing.T) { + c := make(chan bool, 1) + s := make([]int, 10) + go func() { + for _ = range s { + } + c <- true + }() + s = make([]int, 20) + <-c +} + +func TestRaceSliceVarAppend(t *testing.T) { + c := make(chan bool, 1) + s := make([]int, 10) + go func() { + _ = append(s, 10) + c <- true + }() + s = make([]int, 20) + <-c +} + +func TestRaceSliceVarCopy(t *testing.T) { + c := make(chan bool, 1) + s := make([]int, 10) + go func() { + s2 := make([]int, 10) + copy(s, s2) + c <- true + }() + s = make([]int, 20) + <-c +} + +func TestRaceSliceVarCopy2(t *testing.T) { + c := make(chan bool, 1) + s := make([]int, 10) + go func() { + s2 := make([]int, 10) + copy(s2, s) + c <- true + }() + s = make([]int, 20) + <-c +} + +// Not implemented. +func TestRaceFailingSliceAppend(t *testing.T) { + c := make(chan bool, 1) + s := make([]int, 10, 20) + go func() { + _ = append(s, 1) + c <- true + }() + _ = append(s, 2) + <-c +} + +func TestRaceSliceAppendWrite(t *testing.T) { + c := make(chan bool, 1) + s := make([]int, 10) + go func() { + _ = append(s, 1) + c <- true + }() + s[0] = 42 + <-c +} + +func TestRaceSliceAppendSlice(t *testing.T) { + c := make(chan bool, 1) + s := make([]int, 10) + go func() { + s2 := make([]int, 10) + _ = append(s, s2...) + c <- true + }() + s[0] = 42 + <-c +} + +func TestRaceSliceAppendSlice2(t *testing.T) { + c := make(chan bool, 1) + s := make([]int, 10) + s2foobar := make([]int, 10) + go func() { + _ = append(s, s2foobar...) 
+ c <- true + }() + s2foobar[5] = 42 + <-c +} + +func TestRaceSliceAppendString(t *testing.T) { + c := make(chan bool, 1) + s := make([]byte, 10) + go func() { + _ = append(s, "qwerty"...) + c <- true + }() + s[0] = 42 + <-c +} + +func TestNoRaceSliceIndexAccess(t *testing.T) { + c := make(chan bool, 1) + s := make([]int, 10) + v := 0 + go func() { + _ = v + c <- true + }() + s[v] = 1 + <-c +} + +func TestNoRaceSliceIndexAccess2(t *testing.T) { + c := make(chan bool, 1) + s := make([]int, 10) + v := 0 + go func() { + _ = v + c <- true + }() + _ = s[v] + <-c +} + +func TestRaceSliceIndexAccess(t *testing.T) { + c := make(chan bool, 1) + s := make([]int, 10) + v := 0 + go func() { + v = 1 + c <- true + }() + s[v] = 1 + <-c +} + +func TestRaceSliceIndexAccess2(t *testing.T) { + c := make(chan bool, 1) + s := make([]int, 10) + v := 0 + go func() { + v = 1 + c <- true + }() + _ = s[v] + <-c +} + +func TestRaceSliceByteToString(t *testing.T) { + c := make(chan string) + s := make([]byte, 10) + go func() { + c <- string(s) + }() + s[0] = 42 + <-c +} + +func TestRaceSliceRuneToString(t *testing.T) { + c := make(chan string) + s := make([]rune, 10) + go func() { + c <- string(s) + }() + s[9] = 42 + <-c +} diff --git a/src/pkg/runtime/race/testdata/sync_test.go b/src/pkg/runtime/race/testdata/sync_test.go new file mode 100644 index 000000000..e80ba3b74 --- /dev/null +++ b/src/pkg/runtime/race/testdata/sync_test.go @@ -0,0 +1,197 @@ +// Copyright 2011 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package race_test + +import ( + "sync" + "testing" + "time" +) + +func TestNoRaceCond(t *testing.T) { // tsan's test02 + ch := make(chan bool, 1) + var x int = 0 + var mu sync.Mutex + var cond *sync.Cond = sync.NewCond(&mu) + var condition int = 0 + var waker func() + waker = func() { + x = 1 + mu.Lock() + condition = 1 + cond.Signal() + mu.Unlock() + } + + var waiter func() + waiter = func() { + go waker() + cond.L.Lock() + for condition != 1 { + cond.Wait() + } + cond.L.Unlock() + x = 2 + ch <- true + } + go waiter() + <-ch +} + +func TestRaceCond(t *testing.T) { // tsan's test50 + ch := make(chan bool, 2) + + var x int = 0 + var mu sync.Mutex + var condition int = 0 + var cond *sync.Cond = sync.NewCond(&mu) + + var waker func() = func() { + <-time.After(1e5) + x = 1 + mu.Lock() + condition = 1 + cond.Signal() + mu.Unlock() + <-time.After(1e5) + mu.Lock() + x = 3 + mu.Unlock() + ch <- true + } + + var waiter func() = func() { + mu.Lock() + for condition != 1 { + cond.Wait() + } + mu.Unlock() + x = 2 + ch <- true + } + x = 0 + go waker() + go waiter() + <-ch + <-ch +} + +// We do not currently automatically +// parse this test. 
It is intended that the creation +// stack is observed manually not to contain +// off-by-one errors +func TestRaceAnnounceThreads(t *testing.T) { + const N = 7 + allDone := make(chan bool, N) + + var x int + + var f, g, h func() + f = func() { + x = 1 + go g() + go func() { + x = 1 + allDone <- true + }() + x = 2 + allDone <- true + } + + g = func() { + for i := 0; i < 2; i++ { + go func() { + x = 1 + allDone <- true + }() + allDone <- true + } + } + + h = func() { + x = 1 + x = 2 + go f() + allDone <- true + } + + go h() + + for i := 0; i < N; i++ { + <-allDone + } +} + +func TestNoRaceAfterFunc1(t *testing.T) { + i := 2 + c := make(chan bool) + var f func() + f = func() { + i-- + if i >= 0 { + time.AfterFunc(0, f) + } else { + c <- true + } + } + + time.AfterFunc(0, f) + <-c +} + +func TestNoRaceAfterFunc2(t *testing.T) { + var x int + timer := time.AfterFunc(10, func() { + x = 1 + }) + defer timer.Stop() + _ = x +} + +func TestNoRaceAfterFunc3(t *testing.T) { + c := make(chan bool, 1) + x := 0 + time.AfterFunc(1e7, func() { + x = 1 + c <- true + }) + <-c +} + +func TestRaceAfterFunc3(t *testing.T) { + c := make(chan bool, 2) + x := 0 + time.AfterFunc(1e7, func() { + x = 1 + c <- true + }) + time.AfterFunc(2e7, func() { + x = 2 + c <- true + }) + <-c + <-c +} + +// This test's output is intended to be +// observed manually. One should check +// that the goroutine creation stack is +// comprehensible. +func TestRaceGoroutineCreationStack(t *testing.T) { + var x int + var ch = make(chan bool, 1) + + f1 := func() { + x = 1 + ch <- true + } + f2 := func() { go f1() } + f3 := func() { go f2() } + f4 := func() { go f3() } + + go f4() + x = 2 + <-ch +} diff --git a/src/pkg/runtime/race/testdata/waitgroup_test.go b/src/pkg/runtime/race/testdata/waitgroup_test.go new file mode 100644 index 000000000..7ea21fa7e --- /dev/null +++ b/src/pkg/runtime/race/testdata/waitgroup_test.go @@ -0,0 +1,232 @@ +// Copyright 2012 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package race_test + +import ( + "runtime" + "sync" + "testing" + "time" +) + +func TestNoRaceWaitGroup(t *testing.T) { + var x int + var wg sync.WaitGroup + n := 1 + for i := 0; i < n; i++ { + wg.Add(1) + j := i + go func() { + x = j + wg.Done() + }() + } + wg.Wait() +} + +func TestRaceWaitGroup(t *testing.T) { + var x int + var wg sync.WaitGroup + n := 2 + for i := 0; i < n; i++ { + wg.Add(1) + j := i + go func() { + x = j + wg.Done() + }() + } + wg.Wait() +} + +func TestNoRaceWaitGroup2(t *testing.T) { + var x int + var wg sync.WaitGroup + wg.Add(1) + go func() { + x = 1 + wg.Done() + }() + wg.Wait() + x = 2 +} + +// incrementing counter in Add and locking wg's mutex +func TestRaceWaitGroupAsMutex(t *testing.T) { + var x int + var wg sync.WaitGroup + c := make(chan bool, 2) + go func() { + wg.Wait() + time.Sleep(100 * time.Millisecond) + wg.Add(+1) + x = 1 + wg.Add(-1) + c <- true + }() + go func() { + wg.Wait() + time.Sleep(100 * time.Millisecond) + wg.Add(+1) + x = 2 + wg.Add(-1) + c <- true + }() + <-c + <-c +} + +// Incorrect usage: Add is too late. +func TestRaceWaitGroupWrongWait(t *testing.T) { + c := make(chan bool, 2) + var x int + var wg sync.WaitGroup + go func() { + wg.Add(1) + runtime.Gosched() + x = 1 + wg.Done() + c <- true + }() + go func() { + wg.Add(1) + runtime.Gosched() + x = 2 + wg.Done() + c <- true + }() + wg.Wait() + <-c + <-c +} + +// A common WaitGroup misuse that can potentially be caught by the race detector.
+// For this simple case we must emulate Add() as read on &wg and Wait() as write on &wg. +// However it will have false positives if there are several concurrent Wait() calls. +func TestRaceFailingWaitGroupWrongAdd(t *testing.T) { + c := make(chan bool, 2) + var wg sync.WaitGroup + go func() { + wg.Add(1) + wg.Done() + c <- true + }() + go func() { + wg.Add(1) + wg.Done() + c <- true + }() + wg.Wait() + <-c + <-c +} + +func TestNoRaceWaitGroupMultipleWait(t *testing.T) { + c := make(chan bool, 2) + var wg sync.WaitGroup + go func() { + wg.Wait() + c <- true + }() + go func() { + wg.Wait() + c <- true + }() + wg.Wait() + <-c + <-c +} + +func TestNoRaceWaitGroupMultipleWait2(t *testing.T) { + c := make(chan bool, 2) + var wg sync.WaitGroup + wg.Add(2) + go func() { + wg.Done() + wg.Wait() + c <- true + }() + go func() { + wg.Done() + wg.Wait() + c <- true + }() + wg.Wait() + <-c + <-c +} + +// Correct usage but still a race +func TestRaceWaitGroup2(t *testing.T) { + var x int + var wg sync.WaitGroup + wg.Add(2) + go func() { + x = 1 + wg.Done() + }() + go func() { + x = 2 + wg.Done() + }() + wg.Wait() +} + +func TestNoRaceWaitGroupPanicRecover(t *testing.T) { + var x int + var wg sync.WaitGroup + defer func() { + err := recover() + if err != "sync: negative WaitGroup counter" { + t.Fatalf("Unexpected panic: %#v", err) + } + x = 2 + }() + x = 1 + wg.Add(-1) +} + +// TODO: this is actually a panic-synchronization test, not a +// WaitGroup test. Move it to another *_test file +// Is it possible to get a race by synchronization via panic? +func TestNoRaceWaitGroupPanicRecover2(t *testing.T) { + var x int + var wg sync.WaitGroup + ch := make(chan bool, 1) + var f func() = func() { + x = 2 + ch <- true + } + go func() { + defer func() { + err := recover() + if err != "sync: negative WaitGroup counter" { + } + go f() + }() + x = 1 + wg.Add(-1) + }() + + <-ch +} + +func TestNoRaceWaitGroupTransitive(t *testing.T) { + x, y := 0, 0 + var wg sync.WaitGroup + wg.Add(2) + go func() { + x = 42 + wg.Done() + }() + go func() { + time.Sleep(1e7) + y = 42 + wg.Done() + }() + wg.Wait() + _ = x + _ = y +} diff --git a/src/pkg/runtime/race0.c b/src/pkg/runtime/race0.c new file mode 100644 index 000000000..1c5f05a7e --- /dev/null +++ b/src/pkg/runtime/race0.c @@ -0,0 +1,133 @@ +// Copyright 2011 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// Stub implementation of the race detector API. 
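race0.c below is the no-op half of a build-tag pair: it satisfies the linker in ordinary builds, while its counterpart race.c, compiled only when the race tag is set, forwards these calls to the ThreadSanitizer runtime. The same file-selection technique in Go code, sketched with a hypothetical package and symbol names:

```go
// race_off.go (hypothetical): the stub half of the pair. A sibling file
// guarded by "// +build race" would define the same names with real bodies.

// +build !race

package sample

// RaceEnabled reports which variant was compiled into the binary.
const RaceEnabled = false

// Acquire mirrors the shape of the stubs in race0.c: a no-op in ordinary
// builds, so callers never have to test a flag at run time.
func Acquire(addr uintptr) {}
```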
+// +build !race + +#include "runtime.h" + +uintptr +runtime·raceinit(void) +{ + return 0; +} + +void +runtime·racefini(void) +{ +} + + +void +runtime·racemapshadow(void *addr, uintptr size) +{ + USED(addr); + USED(size); +} + +void +runtime·racewritepc(void *addr, void *callpc, void *pc) +{ + USED(addr); + USED(callpc); + USED(pc); +} + +void +runtime·racereadpc(void *addr, void *callpc, void *pc) +{ + USED(addr); + USED(callpc); + USED(pc); +} + +void +runtime·racewriterangepc(void *addr, uintptr sz, uintptr step, void *callpc, void *pc) +{ + USED(addr); + USED(sz); + USED(step); + USED(callpc); + USED(pc); +} + +void +runtime·racereadrangepc(void *addr, uintptr sz, uintptr step, void *callpc, void *pc) +{ + USED(addr); + USED(sz); + USED(step); + USED(callpc); + USED(pc); +} + +void +runtime·raceacquire(void *addr) +{ + USED(addr); +} + +void +runtime·raceacquireg(G *gp, void *addr) +{ + USED(gp); + USED(addr); +} + +void +runtime·racerelease(void *addr) +{ + USED(addr); +} + +void +runtime·racereleaseg(G *gp, void *addr) +{ + USED(gp); + USED(addr); +} + +void +runtime·racereleasemerge(void *addr) +{ + USED(addr); +} + +void +runtime·racereleasemergeg(G *gp, void *addr) +{ + USED(gp); + USED(addr); +} + +void +runtime·racefingo(void) +{ +} + +void +runtime·racemalloc(void *p, uintptr sz, void *pc) +{ + USED(p); + USED(sz); + USED(pc); +} + +void +runtime·racefree(void *p) +{ + USED(p); +} + +uintptr +runtime·racegostart(void *pc) +{ + USED(pc); + return 0; +} + +void +runtime·racegoend(void) +{ +} diff --git a/src/pkg/runtime/race_amd64.s b/src/pkg/runtime/race_amd64.s new file mode 100644 index 000000000..83e300905 --- /dev/null +++ b/src/pkg/runtime/race_amd64.s @@ -0,0 +1,14 @@ +// Copyright 2013 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// +build race + +// func runtime·racefuncenter(pc uintptr) +TEXT runtime·racefuncenter(SB), 7, $16 + MOVQ DX, saved-8(SP) // save function entry context (for closures) + MOVQ pc+0(FP), DX + MOVQ DX, arg-16(SP) + CALL runtime·racefuncenter1(SB) + MOVQ saved-8(SP), DX + RET diff --git a/src/pkg/runtime/rt0_freebsd_arm.s b/src/pkg/runtime/rt0_freebsd_arm.s new file mode 100644 index 000000000..085fccf9d --- /dev/null +++ b/src/pkg/runtime/rt0_freebsd_arm.s @@ -0,0 +1,8 @@ +// Copyright 2012 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. 
+ +// FreeBSD and Linux use the same linkage to main + +TEXT _rt0_arm_freebsd(SB),7,$-4 + B _rt0_arm(SB) diff --git a/src/pkg/runtime/rt0_linux_arm.s b/src/pkg/runtime/rt0_linux_arm.s index e08cf907d..a648160cf 100644 --- a/src/pkg/runtime/rt0_linux_arm.s +++ b/src/pkg/runtime/rt0_linux_arm.s @@ -20,15 +20,27 @@ TEXT _rt0_arm_linux(SB),7,$-4 MOVM.DB.W [R0-R3], (R13) MOVW $4, R0 // SIGILL MOVW R13, R1 // sa - MOVW $0, R2 // old_sa + SUB $16, R13 + MOVW R13, R2 // old_sa MOVW $8, R3 // c MOVW $174, R7 // sys_sigaction BL oabi_syscall<>(SB) - ADD $16, R13 + // do an EABI syscall MOVW $20, R7 // sys_getpid - SWI $0 // this will trigger SIGILL on OABI systems + SWI $0 // this will trigger SIGILL on OABI systems + + MOVW $4, R0 // SIGILL + MOVW R13, R1 // sa + MOVW $0, R2 // old_sa + MOVW $8, R3 // c + MOVW $174, R7 // sys_sigaction + SWI $0 // restore signal handler + ADD $32, R13 + SUB $4, R13 // fake a stack frame for runtime·setup_auxv + BL runtime·setup_auxv(SB) + ADD $4, R13 B _rt0_arm(SB) TEXT bad_abi<>(SB),7,$-4 diff --git a/src/pkg/runtime/rt0_netbsd_arm.s b/src/pkg/runtime/rt0_netbsd_arm.s new file mode 100644 index 000000000..8c1588f2e --- /dev/null +++ b/src/pkg/runtime/rt0_netbsd_arm.s @@ -0,0 +1,8 @@ +// Copyright 2013 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// FreeBSD/NetBSD and Linux use the same linkage to main + +TEXT _rt0_arm_netbsd(SB),7,$-4 + B _rt0_arm(SB) diff --git a/src/pkg/runtime/rt0_plan9_386.s b/src/pkg/runtime/rt0_plan9_386.s index b56c8b325..56f3a0f6c 100644 --- a/src/pkg/runtime/rt0_plan9_386.s +++ b/src/pkg/runtime/rt0_plan9_386.s @@ -25,6 +25,7 @@ argv_fix: ADDL $4, BP LOOP argv_fix + CALL runtime·asminit(SB) JMP _rt0_386(SB) DATA runtime·isplan9(SB)/4, $1 diff --git a/src/pkg/runtime/rt0_plan9_amd64.s b/src/pkg/runtime/rt0_plan9_amd64.s new file mode 100644 index 000000000..2b1fa2ae1 --- /dev/null +++ b/src/pkg/runtime/rt0_plan9_amd64.s @@ -0,0 +1,11 @@ +// Copyright 2010 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +TEXT _rt0_amd64_plan9(SB),7, $0 + MOVQ $_rt0_amd64(SB), AX + MOVQ SP, DI + JMP AX + +DATA runtime·isplan9(SB)/4, $1 +GLOBL runtime·isplan9(SB), $4 diff --git a/src/pkg/runtime/rt0_windows_386.s b/src/pkg/runtime/rt0_windows_386.s index 3b023de2f..a06aa787e 100644 --- a/src/pkg/runtime/rt0_windows_386.s +++ b/src/pkg/runtime/rt0_windows_386.s @@ -3,11 +3,6 @@ // license that can be found in the LICENSE file. TEXT _rt0_386_windows(SB),7,$0 - // Set up SEH frame for bootstrap m - PUSHL $runtime·sigtramp(SB) - PUSHL 0(FS) - MOVL SP, 0(FS) - JMP _rt0_386(SB) DATA runtime·iswindows(SB)/4, $1 diff --git a/src/pkg/runtime/rune.c b/src/pkg/runtime/rune.c index 86ee76ddd..ed867269d 100644 --- a/src/pkg/runtime/rune.c +++ b/src/pkg/runtime/rune.c @@ -47,6 +47,9 @@ enum Runeerror = 0xFFFD, Runeself = 0x80, + SurrogateMin = 0xD800, + SurrogateMax = 0xDFFF, + Bad = Runeerror, Runemax = 0x10FFFF, /* maximum rune value */ @@ -128,6 +131,8 @@ runtime·charntorune(int32 *rune, uint8 *str, int32 length) l = ((((c << Bitx) | c1) << Bitx) | c2) & Rune3; if(l <= Rune2) goto bad; + if (SurrogateMin <= l && l <= SurrogateMax) + goto bad; *rune = l; return 3; } @@ -193,13 +198,15 @@ runtime·runetochar(byte *str, int32 rune) /* note: in original, arg2 was point } /* - * If the Rune is out of range, convert it to the error rune. 
+ * If the Rune is out of range or a surrogate half, convert it to the error rune. * Do this test here because the error rune encodes to three bytes. * Doing it earlier would duplicate work, since an out of range * Rune wouldn't have fit in one or two bytes. */ if (c > Runemax) c = Runeerror; + if (SurrogateMin <= c && c <= SurrogateMax) + c = Runeerror; /* * three character sequence diff --git a/src/pkg/runtime/runtime-gdb.py b/src/pkg/runtime/runtime-gdb.py index 629c39e98..eff9a4003 100644 --- a/src/pkg/runtime/runtime-gdb.py +++ b/src/pkg/runtime/runtime-gdb.py @@ -149,8 +149,8 @@ goobjfile.pretty_printers.extend([makematcher(k) for k in vars().values() if has # # For reference, this is what we're trying to do: -# eface: p *(*(struct 'runtime.commonType'*)'main.e'->type_->data)->string -# iface: p *(*(struct 'runtime.commonType'*)'main.s'->tab->Type->data)->string +# eface: p *(*(struct 'runtime.rtype'*)'main.e'->type_->data)->string +# iface: p *(*(struct 'runtime.rtype'*)'main.s'->tab->Type->data)->string # # interface types can't be recognized by their name, instead we check # if they have the expected fields. Unfortunately the mapping of @@ -186,8 +186,7 @@ def lookup_type(name): except: pass -_rctp_type = gdb.lookup_type("struct runtime.commonType").pointer() -_rtp_type = gdb.lookup_type("struct runtime._type").pointer() +_rctp_type = gdb.lookup_type("struct runtime.rtype").pointer() def iface_commontype(obj): if is_iface(obj): @@ -196,18 +195,13 @@ def iface_commontype(obj): go_type_ptr = obj['_type'] else: return - - # sanity check: reflection type description ends in a loop. - tt = go_type_ptr['_type'].cast(_rtp_type).dereference()['_type'] - if tt != tt.cast(_rtp_type).dereference()['_type']: - return - return go_type_ptr['ptr'].cast(_rctp_type).dereference() + return go_type_ptr.cast(_rctp_type).dereference() def iface_dtype(obj): "Decode type of the data field of an eface or iface struct." - # known issue: dtype_name decoded from runtime.commonType is "nested.Foo" + # known issue: dtype_name decoded from runtime.rtype is "nested.Foo" # but the dwarf table lists it as "full/path/to/nested.Foo" dynamic_go_type = iface_commontype(obj) @@ -381,6 +375,7 @@ class GoroutineCmd(gdb.Command): def invoke(self, arg, from_tty): goid, cmd = arg.split(None, 1) + goid = gdb.parse_and_eval(goid) pc, sp = find_goroutine(int(goid)) if not pc: print "No such goroutine: ", goid diff --git a/src/pkg/runtime/runtime.c b/src/pkg/runtime/runtime.c index ebb5544fb..4d57cbafd 100644 --- a/src/pkg/runtime/runtime.c +++ b/src/pkg/runtime/runtime.c @@ -3,15 +3,12 @@ // license that can be found in the LICENSE file. #include "runtime.h" -#include "stack.h" +#include "arch_GOARCH.h" enum { maxround = sizeof(uintptr), }; -uint32 runtime·panicking; -void (*runtime·destroylock)(Lock*); - /* * We assume that all architectures turn faults and the like * into apparent calls to runtime.sigpanic. 
If we see a "call" @@ -31,103 +28,6 @@ runtime·gotraceback(void) return runtime·atoi(p); } -static Lock paniclk; - -void -runtime·startpanic(void) -{ - if(m->dying) { - runtime·printf("panic during panic\n"); - runtime·exit(3); - } - m->dying = 1; - runtime·xadd(&runtime·panicking, 1); - runtime·lock(&paniclk); -} - -void -runtime·dopanic(int32 unused) -{ - static bool didothers; - - if(g->sig != 0) - runtime·printf("[signal %x code=%p addr=%p pc=%p]\n", - g->sig, g->sigcode0, g->sigcode1, g->sigpc); - - if(runtime·gotraceback()){ - if(g != m->g0) { - runtime·printf("\n"); - runtime·goroutineheader(g); - runtime·traceback(runtime·getcallerpc(&unused), runtime·getcallersp(&unused), 0, g); - } - if(!didothers) { - didothers = true; - runtime·tracebackothers(g); - } - } - runtime·unlock(&paniclk); - if(runtime·xadd(&runtime·panicking, -1) != 0) { - // Some other m is panicking too. - // Let it print what it needs to print. - // Wait forever without chewing up cpu. - // It will exit when it's done. - static Lock deadlock; - runtime·lock(&deadlock); - runtime·lock(&deadlock); - } - - runtime·exit(2); -} - -void -runtime·panicindex(void) -{ - runtime·panicstring("index out of range"); -} - -void -runtime·panicslice(void) -{ - runtime·panicstring("slice bounds out of range"); -} - -void -runtime·throwreturn(void) -{ - // can only happen if compiler is broken - runtime·throw("no return at end of a typed function - compiler is broken"); -} - -void -runtime·throwinit(void) -{ - // can only happen with linker skew - runtime·throw("recursive call during initialization - linker skew"); -} - -void -runtime·throw(int8 *s) -{ - runtime·startpanic(); - runtime·printf("throw: %s\n", s); - runtime·dopanic(0); - *(int32*)0 = 0; // not reached - runtime·exit(1); // even more not reached -} - -void -runtime·panicstring(int8 *s) -{ - Eface err; - - if(m->gcing) { - runtime·printf("panic: %s\n", s); - runtime·throw("panic during gc"); - } - runtime·newErrorString(runtime·gostringnocopy((byte*)s), &err); - runtime·panic(err); -} - int32 runtime·mcmp(byte *s1, byte *s2, uint32 n) { @@ -155,30 +55,21 @@ runtime·mchr(byte *p, byte c, byte *ep) return nil; } -uint32 -runtime·rnd(uint32 n, uint32 m) -{ - uint32 r; - - if(m > maxround) - m = maxround; - r = n % m; - if(r) - n += m-r; - return n; -} - static int32 argc; static uint8** argv; Slice os·Args; Slice syscall·envs; +void (*runtime·sysargs)(int32, uint8**); + void runtime·args(int32 c, uint8 **v) { argc = c; argv = v; + if(runtime·sysargs != nil) + runtime·sysargs(c, v); } int32 runtime·isplan9; @@ -219,33 +110,6 @@ runtime·goenvs_unix(void) syscall·envs.cap = n; } -byte* -runtime·getenv(int8 *s) -{ - int32 i, j, len; - byte *v, *bs; - String* envv; - int32 envc; - - bs = (byte*)s; - len = runtime·findnull(bs); - envv = (String*)syscall·envs.array; - envc = syscall·envs.len; - for(i=0; i<envc; i++){ - if(envv[i].len <= len) - continue; - v = envv[i].str; - for(j=0; j<len; j++) - if(bs[j] != v[j]) - goto nomatch; - if(v[len] != '=') - goto nomatch; - return v+len+1; - nomatch:; - } - return nil; -} - void runtime·getgoroot(String out) { @@ -267,6 +131,33 @@ runtime·atoi(byte *p) return n; } +static void +TestAtomic64(void) +{ + uint64 z64, x64; + + z64 = 42; + x64 = 0; + PREFETCH(&z64); + if(runtime·cas64(&z64, &x64, 1)) + runtime·throw("cas64 failed"); + if(x64 != 42) + runtime·throw("cas64 failed"); + if(!runtime·cas64(&z64, &x64, 1)) + runtime·throw("cas64 failed"); + if(x64 != 42 || z64 != 1) + runtime·throw("cas64 failed"); + if(runtime·atomicload64(&z64) != 
1) + runtime·throw("load64 failed"); + runtime·atomicstore64(&z64, (1ull<<40)+1); + if(runtime·atomicload64(&z64) != (1ull<<40)+1) + runtime·throw("store64 failed"); + if(runtime·xadd64(&z64, (1ull<<40)+1) != (2ull<<40)+2) + runtime·throw("xadd64 failed"); + if(runtime·atomicload64(&z64) != (2ull<<40)+2) + runtime·throw("xadd64 failed"); +} + void runtime·check(void) { @@ -342,10 +233,12 @@ runtime·check(void) runtime·throw("float32nan2"); if(!(i != i1)) runtime·throw("float32nan3"); + + TestAtomic64(); } void -runtime·Caller(int32 skip, uintptr retpc, String retfile, int32 retline, bool retbool) +runtime·Caller(intgo skip, uintptr retpc, String retfile, intgo retline, bool retbool) { Func *f, *g; uintptr pc; @@ -382,7 +275,7 @@ runtime·Caller(int32 skip, uintptr retpc, String retfile, int32 retline, bool r } void -runtime·Callers(int32 skip, Slice pc, int32 retn) +runtime·Callers(intgo skip, Slice pc, intgo retn) { // runtime.callers uses pc.array==nil as a signal // to print a stack trace. Pick off 0-length pc here @@ -413,3 +306,40 @@ runtime·fastrand1(void) m->fastrand = x; return x; } + +static Lock ticksLock; +static int64 ticks; + +int64 +runtime·tickspersecond(void) +{ + int64 res, t0, t1, c0, c1; + + res = (int64)runtime·atomicload64((uint64*)&ticks); + if(res != 0) + return ticks; + runtime·lock(&ticksLock); + res = ticks; + if(res == 0) { + t0 = runtime·nanotime(); + c0 = runtime·cputicks(); + runtime·usleep(100*1000); + t1 = runtime·nanotime(); + c1 = runtime·cputicks(); + if(t1 == t0) + t1++; + res = (c1-c0)*1000*1000*1000/(t1-t0); + if(res == 0) + res++; + runtime·atomicstore64((uint64*)&ticks, res); + } + runtime·unlock(&ticksLock); + return res; +} + +void +runtime∕pprof·runtime_cyclesPerSecond(int64 res) +{ + res = runtime·tickspersecond(); + FLUSH(&res); +} diff --git a/src/pkg/runtime/runtime.h b/src/pkg/runtime/runtime.h index 6f5aea11d..08f43a69b 100644 --- a/src/pkg/runtime/runtime.h +++ b/src/pkg/runtime/runtime.h @@ -19,9 +19,13 @@ typedef double float64; #ifdef _64BIT typedef uint64 uintptr; typedef int64 intptr; +typedef int64 intgo; // Go's int +typedef uint64 uintgo; // Go's uint #else typedef uint32 uintptr; -typedef int32 intptr; +typedef int32 intptr; +typedef int32 intgo; // Go's int +typedef uint32 uintgo; // Go's uint #endif /* @@ -48,44 +52,53 @@ typedef struct G G; typedef struct Gobuf Gobuf; typedef union Lock Lock; typedef struct M M; +typedef struct P P; typedef struct Mem Mem; typedef union Note Note; typedef struct Slice Slice; typedef struct Stktop Stktop; typedef struct String String; +typedef struct FuncVal FuncVal; typedef struct SigTab SigTab; typedef struct MCache MCache; typedef struct FixAlloc FixAlloc; typedef struct Iface Iface; typedef struct Itab Itab; +typedef struct InterfaceType InterfaceType; typedef struct Eface Eface; typedef struct Type Type; typedef struct ChanType ChanType; typedef struct MapType MapType; typedef struct Defer Defer; +typedef struct DeferChunk DeferChunk; typedef struct Panic Panic; typedef struct Hmap Hmap; typedef struct Hchan Hchan; typedef struct Complex64 Complex64; typedef struct Complex128 Complex128; typedef struct WinCall WinCall; +typedef struct SEH SEH; typedef struct Timers Timers; typedef struct Timer Timer; +typedef struct GCStats GCStats; +typedef struct LFNode LFNode; +typedef struct ParFor ParFor; +typedef struct ParForThread ParForThread; +typedef struct CgoMal CgoMal; /* - * per-cpu declaration. - * "extern register" is a special storage class implemented by 6c, 8c, etc. 
- * on machines with lots of registers, it allocates a register that will not be - * used in generated code. on the x86, it allocates a slot indexed by a - * segment register. + * Per-CPU declaration. * - * amd64: allocated downwards from R15 - * x86: allocated upwards from 0(GS) - * arm: allocated downwards from R10 + * "extern register" is a special storage class implemented by 6c, 8c, etc. + * On the ARM, it is an actual register; elsewhere it is a slot in thread- + * local storage indexed by a segment register. See zasmhdr in + * src/cmd/dist/buildruntime.c for details, and be aware that the linker may + * make further OS-specific changes to the compiler's output. For example, + * 6l/linux rewrites 0(GS) as -16(FS). * - * every C file linked into a Go program must include runtime.h - * so that the C compiler knows to avoid other uses of these registers. - * the Go compilers know to avoid them. + * Every C file linked into a Go program must include runtime.h so that the + * C compiler (6c, 8c, etc.) knows to avoid other uses of these dedicated + * registers. The Go compiler (6g, 8g, etc.) knows to avoid them. */ extern register G* g; extern register M* m; @@ -105,14 +118,34 @@ enum Grunning, Gsyscall, Gwaiting, - Gmoribund, + Gmoribund_unused, // currently unused, but hardcoded in gdb scripts Gdead, }; enum { + // P status + Pidle, + Prunning, + Psyscall, + Pgcstop, + Pdead, +}; +enum +{ true = 1, false = 0, }; +enum +{ + PtrSize = sizeof(void*), +}; +enum +{ + // Per-M stack segment cache size. + StackCacheSize = 32, + // Global <-> per-M stack segment cache transfer batch size. + StackCacheBatch = 16, +}; /* * structures @@ -130,7 +163,12 @@ union Note struct String { byte* str; - int32 len; + intgo len; +}; +struct FuncVal +{ + void (*fn)(void); + // variable-size, fn-specific data here }; struct Iface { @@ -156,47 +194,61 @@ struct Complex128 struct Slice { // must not move anything byte* array; // actual data - uint32 len; // number of elements - uint32 cap; // allocated number of elements + uintgo len; // number of elements + uintgo cap; // allocated number of elements }; struct Gobuf { // The offsets of these fields are known to (hard-coded in) libmach. - byte* sp; + uintptr sp; byte* pc; G* g; }; +struct GCStats +{ + // the struct must consist of only uint64's, + // because it is casted to uint64[]. 
+ uint64 nhandoff; + uint64 nhandoffcnt; + uint64 nprocyield; + uint64 nosyield; + uint64 nsleep; +}; struct G { - byte* stackguard; // cannot move - also known to linker, libmach, runtime/cgo - byte* stackbase; // cannot move - also known to libmach, runtime/cgo + uintptr stackguard; // cannot move - also known to linker, libmach, runtime/cgo + uintptr stackbase; // cannot move - also known to libmach, runtime/cgo Defer* defer; Panic* panic; Gobuf sched; - byte* gcstack; // if status==Gsyscall, gcstack = stackbase to use during gc - byte* gcsp; // if status==Gsyscall, gcsp = sched.sp to use during gc - byte* gcguard; // if status==Gsyscall, gcguard = stackguard to use during gc - byte* stack0; - byte* entry; // initial function + uintptr gcstack; // if status==Gsyscall, gcstack = stackbase to use during gc + uintptr gcsp; // if status==Gsyscall, gcsp = sched.sp to use during gc + byte* gcpc; // if status==Gsyscall, gcpc = sched.pc to use during gc + uintptr gcguard; // if status==Gsyscall, gcguard = stackguard to use during gc + uintptr stack0; + FuncVal* fnstart; // initial function G* alllink; // on allg void* param; // passed parameter on wakeup int16 status; - int32 goid; + int64 goid; uint32 selgen; // valid sudog pointer int8* waitreason; // if status==Gwaiting G* schedlink; - bool readyonstop; bool ispanic; + bool issystem; + int8 raceignore; // ignore race detection events M* m; // for debuggers, but offset not hard-coded M* lockedm; - M* idlem; int32 sig; int32 writenbuf; byte* writebuf; + DeferChunk *dchunk; + DeferChunk *dchunknext; uintptr sigcode0; uintptr sigcode1; uintptr sigpc; uintptr gopc; // pc of go statement that created this goroutine + uintptr racectx; uintptr end[]; }; struct M @@ -213,43 +265,98 @@ struct M uintptr cret; // return value from C uint64 procid; // for debuggers, but offset not hard-coded G* gsignal; // signal-handling G - uint32 tls[8]; // thread-local storage (for 386 extern register) + uintptr tls[4]; // thread-local storage (for x86 extern register) + void (*mstartfn)(void); G* curg; // current running goroutine + P* p; // attached P for executing Go code (nil if not executing Go code) + P* nextp; int32 id; int32 mallocing; + int32 throwing; int32 gcing; int32 locks; int32 nomemprof; - int32 waitnextg; int32 dying; int32 profilehz; int32 helpgc; + bool blockingsyscall; + bool spinning; uint32 fastrand; - uint64 ncgocall; - Note havenextg; - G* nextg; + uint64 ncgocall; // number of cgo calls in total + int32 ncgo; // number of cgo calls currently in progress + CgoMal* cgomal; + Note park; M* alllink; // on allm M* schedlink; uint32 machport; // Return address for Mach IPC (OS X) MCache *mcache; - FixAlloc *stackalloc; + int32 stackinuse; + uint32 stackcachepos; + uint32 stackcachecnt; + void* stackcache[StackCacheSize]; G* lockedg; - G* idleg; uintptr createstack[32]; // Stack that created this thread. 
uint32 freglo[16]; // D[i] lsb and F[i] uint32 freghi[16]; // D[i] msb and F[i+16] uint32 fflag; // floating point compare flags + uint32 locked; // tracking for LockOSThread M* nextwaitm; // next M waiting for lock uintptr waitsema; // semaphore for parking on locks uint32 waitsemacount; uint32 waitsemalock; + GCStats gcstats; + bool racecall; + bool needextram; + void* racepc; + void (*waitunlockf)(Lock*); + Lock* waitlock; + uint32 moreframesize_minalloc; + + uintptr settype_buf[1024]; + uintptr settype_bufsize; #ifdef GOOS_windows void* thread; // thread handle #endif +#ifdef GOOS_plan9 + int8* notesig; +#endif + SEH* seh; uintptr end[]; }; +struct P +{ + Lock; + + uint32 status; // one of Pidle/Prunning/... + P* link; + uint32 tick; // incremented on every scheduler or system call + M* m; // back-link to associated M (nil if idle) + MCache* mcache; + + // Queue of runnable goroutines. + G** runq; + int32 runqhead; + int32 runqtail; + int32 runqsize; + + // Available G's (status == Gdead) + G* gfree; + int32 gfreecnt; + + byte pad[64]; +}; + +// The m->locked word holds a single bit saying whether +// external calls to LockOSThread are in effect, and then a counter +// of the internal nesting depth of lockOSThread / unlockOSThread. +enum +{ + LockExternal = 1, + LockInternal = 2, +}; + struct Stktop { // The offsets of these fields are known to (hard-coded in) libmach. @@ -288,8 +395,19 @@ struct Func uintptr pc0; // starting pc, ln for table int32 ln0; int32 frame; // stack frame size - int32 args; // number of 32-bit in/out args - int32 locals; // number of 32-bit locals + int32 args; // in/out args size + int32 locals; // locals size +}; + +// layout of Itab known to compilers +struct Itab +{ + InterfaceType* inter; + Type* type; + Itab* link; + int32 bad; + int32 unused; + void (*fun[])(void); }; struct WinCall @@ -301,6 +419,11 @@ struct WinCall uintptr r2; uintptr err; // error number }; +struct SEH +{ + void* prev; + void* handler; +}; #ifdef GOOS_windows enum { @@ -335,10 +458,46 @@ struct Timer // a well-behaved function and not block. int64 when; int64 period; - void (*f)(int64, Eface); + FuncVal *fv; Eface arg; }; +// Lock-free stack node. +struct LFNode +{ + LFNode *next; + uintptr pushcnt; +}; + +// Parallel for descriptor. +struct ParFor +{ + void (*body)(ParFor*, uint32); // executed for each element + uint32 done; // number of idle threads + uint32 nthr; // total number of threads + uint32 nthrmax; // maximum number of threads + uint32 thrseq; // thread id sequencer + uint32 cnt; // iteration space [0, cnt) + void *ctx; // arbitrary user context + bool wait; // if true, wait while all threads finish processing, + // otherwise parfor may return while other threads are still working + ParForThread *thr; // array of thread descriptors + // stats + uint64 nsteal; + uint64 nstealcnt; + uint64 nprocyield; + uint64 nosyield; + uint64 nsleep; +}; + +// Track memory allocated by code not written in Go during a cgo call, +// so that the garbage collector can see them. 
+struct CgoMal +{ + CgoMal *next; + byte *alloc; +}; + /* * defined macros * you need super-gopher-guru privilege @@ -347,6 +506,7 @@ struct Timer #define nelem(x) (sizeof(x)/sizeof((x)[0])) #define nil ((void*)0) #define offsetof(s,m) (uint32)(&(((s*)0)->m)) +#define ROUND(x, n) (((x)+(n)-1)&~((n)-1)) /* all-caps to mark as macro: it evaluates n twice */ /* * known to compiler @@ -430,12 +590,19 @@ void runtime·nilintercopy(uintptr, void*, void*); struct Defer { int32 siz; - bool nofree; + bool special; // not part of defer frame + bool free; // if special, free when done byte* argp; // where args were copied from byte* pc; - byte* fn; + FuncVal* fn; Defer* link; - byte args[8]; // padded to actual size + void* args[1]; // padded to actual size +}; + +struct DeferChunk +{ + DeferChunk *prev; + uintptr off; }; /* @@ -453,16 +620,21 @@ struct Panic * external data */ extern String runtime·emptystring; -G* runtime·allg; -G* runtime·lastg; -M* runtime·allm; +extern uintptr runtime·zerobase; +extern G* runtime·allg; +extern G* runtime·lastg; +extern M* runtime·allm; +extern P** runtime·allp; extern int32 runtime·gomaxprocs; extern bool runtime·singleproc; extern uint32 runtime·panicking; -extern int32 runtime·gcwaiting; // gc is waiting to run -int8* runtime·goos; -int32 runtime·ncpu; +extern uint32 runtime·gcwaiting; // gc is waiting to run +extern int8* runtime·goos; +extern int32 runtime·ncpu; extern bool runtime·iscgo; +extern void (*runtime·sysargs)(int32, uint8**); +extern uint32 runtime·maxstring; +extern uint32 runtime·Hchansize; /* * common functions and data @@ -481,7 +653,8 @@ int32 runtime·charntorune(int32*, uint8*, int32); #define FLUSH(x) USED(x) void runtime·gogo(Gobuf*, uintptr); -void runtime·gogocall(Gobuf*, void(*)(void)); +void runtime·gogocall(Gobuf*, void(*)(void), uintptr); +void runtime·gogocallfn(Gobuf*, FuncVal*); void runtime·gosave(Gobuf*); void runtime·lessstack(void); void runtime·goargs(void); @@ -490,7 +663,6 @@ void runtime·goenvs_unix(void); void* runtime·getu(void); void runtime·throw(int8*); void runtime·panicstring(int8*); -uint32 runtime·rnd(uint32, uint32); void runtime·prints(int8*); void runtime·printf(int8*, ...); byte* runtime·mchr(byte*, byte, byte*); @@ -499,8 +671,8 @@ void runtime·memmove(void*, void*, uint32); void* runtime·mal(uintptr); String runtime·catstring(String, String); String runtime·gostring(byte*); -String runtime·gostringn(byte*, int32); -Slice runtime·gobytes(byte*, int32); +String runtime·gostringn(byte*, intgo); +Slice runtime·gobytes(byte*, intgo); String runtime·gostringnocopy(byte*); String runtime·gostringw(uint16*); void runtime·initsig(void); @@ -512,38 +684,47 @@ void runtime·tracebackothers(G*); int32 runtime·write(int32, void*, int32); int32 runtime·mincore(void*, uintptr, byte*); bool runtime·cas(uint32*, uint32, uint32); +bool runtime·cas64(uint64*, uint64*, uint64); bool runtime·casp(void**, void*, void*); // Don't confuse with XADD x86 instruction, // this one is actually 'addx', that is, add-and-fetch. 
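// A sketch of the contract, assuming the whole update happens as a single
// atomic step; note the return value is the NEW sum, where x86 XADD would
// hand back the old one:
//
//	uint32
//	xadd_spec(uint32 *p, int32 delta)	// illustrative, not the real code
//	{
//		*p += delta;
//		return *p;
//	}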
uint32 runtime·xadd(uint32 volatile*, int32); +uint64 runtime·xadd64(uint64 volatile*, int64); uint32 runtime·xchg(uint32 volatile*, uint32); uint32 runtime·atomicload(uint32 volatile*); void runtime·atomicstore(uint32 volatile*, uint32); +void runtime·atomicstore64(uint64 volatile*, uint64); +uint64 runtime·atomicload64(uint64 volatile*); void* runtime·atomicloadp(void* volatile*); void runtime·atomicstorep(void* volatile*, void*); -void runtime·jmpdefer(byte*, void*); +void runtime·jmpdefer(FuncVal*, void*); void runtime·exit1(int32); void runtime·ready(G*); byte* runtime·getenv(int8*); int32 runtime·atoi(byte*); -void runtime·newosproc(M *m, G *g, void *stk, void (*fn)(void)); -void runtime·signalstack(byte*, int32); +void runtime·newosproc(M *mp, void *stk); +void runtime·mstart(void); G* runtime·malg(int32); void runtime·asminit(void); +void runtime·mpreinit(M*); void runtime·minit(void); +void runtime·unminit(void); +void runtime·signalstack(byte*, int32); Func* runtime·findfunc(uintptr); int32 runtime·funcline(Func*, uintptr); void* runtime·stackalloc(uint32); void runtime·stackfree(void*, uintptr); MCache* runtime·allocmcache(void); +void runtime·freemcache(MCache*); void runtime·mallocinit(void); +void runtime·mprofinit(void); bool runtime·ifaceeq_c(Iface, Iface); bool runtime·efaceeq_c(Eface, Eface); -uintptr runtime·ifacehash(Iface); -uintptr runtime·efacehash(Eface); +uintptr runtime·ifacehash(Iface, uintptr); +uintptr runtime·efacehash(Eface, uintptr); void* runtime·malloc(uintptr size); void runtime·free(void *v); -bool runtime·addfinalizer(void*, void(*fn)(void*), int32); +bool runtime·addfinalizer(void*, FuncVal *fn, uintptr); void runtime·runpanic(Panic*); void* runtime·getcallersp(void*); int32 runtime·mcount(void); @@ -551,27 +732,35 @@ int32 runtime·gcount(void); void runtime·mcall(void(*)(G*)); uint32 runtime·fastrand1(void); +void runtime·setmg(M*, G*); +void runtime·newextram(void); void runtime·exit(int32); void runtime·breakpoint(void); void runtime·gosched(void); -void runtime·tsleep(int64); +void runtime·park(void(*)(Lock*), Lock*, int8*); +void runtime·tsleep(int64, int8*); M* runtime·newm(void); void runtime·goexit(void); void runtime·asmcgocall(void (*fn)(void*), void*); void runtime·entersyscall(void); +void runtime·entersyscallblock(void); void runtime·exitsyscall(void); -G* runtime·newproc1(byte*, byte*, int32, int32, void*); +G* runtime·newproc1(FuncVal*, byte*, int32, int32, void*); bool runtime·sigsend(int32 sig); int32 runtime·callers(int32, uintptr*, int32); int32 runtime·gentraceback(byte*, byte*, byte*, G*, int32, uintptr*, int32); int64 runtime·nanotime(void); void runtime·dopanic(int32); void runtime·startpanic(void); +void runtime·unwindstack(G*, byte*); void runtime·sigprof(uint8 *pc, uint8 *sp, uint8 *lr, G *gp); void runtime·resetcpuprofiler(int32); void runtime·setcpuprofilerate(void(*)(uintptr*, int32), int32); void runtime·usleep(uint32); int64 runtime·cputicks(void); +int64 runtime·tickspersecond(void); +void runtime·blockevent(int64, int32); +extern int64 runtime·blockprofilerate; #pragma varargck argpos runtime·printf 1 #pragma varargck type "d" int32 @@ -589,7 +778,7 @@ int64 runtime·cputicks(void); #pragma varargck type "S" String void runtime·stoptheworld(void); -void runtime·starttheworld(bool); +void runtime·starttheworld(void); extern uint32 runtime·worldsema; /* @@ -636,6 +825,27 @@ void runtime·futexsleep(uint32*, uint32, int64); void runtime·futexwakeup(uint32*, uint32); /* + * Lock-free stack. 
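 * (A design note, inferred from LFNode above: the head word packs the
 * top-of-stack pointer together with the node's pushcnt and is updated
 * with cas64, so a node that is popped, reused, and pushed again is not
 * mistaken for its earlier self, avoiding the classic ABA hazard.)
 * Usage sketch:
 *	runtime·lfstackpush(&head, &node);
 *	node = runtime·lfstackpop(&head);	// nil when empty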
+ * Initialize uint64 head to 0, compare with 0 to test for emptiness. + * The stack does not keep pointers to nodes, + * so they can be garbage collected if there are no other pointers to nodes. + */ +void runtime·lfstackpush(uint64 *head, LFNode *node); +LFNode* runtime·lfstackpop(uint64 *head); + +/* + * Parallel for over [0, n). + * body() is executed for each iteration. + * nthr - total number of worker threads. + * ctx - arbitrary user context. + * if wait=true, threads return from parfor() when all work is done; + * otherwise, threads can return while other threads are still finishing processing. + */ +ParFor* runtime·parforalloc(uint32 nthrmax); +void runtime·parforsetup(ParFor *desc, uint32 nthr, uint32 n, void *ctx, bool wait, void (*body)(ParFor*, uint32)); +void runtime·parfordo(ParFor *desc); + +/* * This is consistent across Linux and BSD. * If a new OS is added that is different, move this to * $GOOS/$GOARCH/defs.h. @@ -645,6 +855,9 @@ void runtime·futexwakeup(uint32*, uint32); /* * low level C-called */ +// for mmap, we only pass the lower 32 bits of file offset to the +// assembly routine; the higher bits (if required), should be provided +// by the assembly routine as 0. uint8* runtime·mmap(byte*, uintptr, int32, int32, int32, uint32); void runtime·munmap(byte*, uintptr); void runtime·madvise(byte*, uintptr, int32); @@ -656,6 +869,7 @@ void* runtime·getcallerpc(void*); * runtime go-called */ void runtime·printbool(bool); +void runtime·printbyte(int8); void runtime·printfloat(float64); void runtime·printint(int64); void runtime·printiface(Iface); @@ -667,7 +881,7 @@ void runtime·printuint(uint64); void runtime·printhex(uint64); void runtime·printslice(Slice); void runtime·printcomplex(Complex128); -void reflect·call(byte*, byte*, uint32); +void reflect·call(FuncVal*, byte*, uint32); void runtime·panic(Eface); void runtime·panicindex(void); void runtime·panicslice(void); @@ -708,8 +922,8 @@ void runtime·semrelease(uint32*); int32 runtime·gomaxprocsfunc(int32 n); void runtime·procyield(uint32); void runtime·osyield(void); -void runtime·LockOSThread(void); -void runtime·UnlockOSThread(void); +void runtime·lockOSThread(void); +void runtime·unlockOSThread(void); void runtime·mapassign(MapType*, Hmap*, byte*, byte*); void runtime·mapaccess(MapType*, Hmap*, byte*, byte*, bool*); @@ -719,13 +933,11 @@ void runtime·mapiterkeyvalue(struct hash_iter*, void*, void*); Hmap* runtime·makemap_c(MapType*, int64); Hchan* runtime·makechan_c(ChanType*, int64); -void runtime·chansend(ChanType*, Hchan*, byte*, bool*); +void runtime·chansend(ChanType*, Hchan*, byte*, bool*, void*); void runtime·chanrecv(ChanType*, Hchan*, byte*, bool*, bool*); -int32 runtime·chanlen(Hchan*); -int32 runtime·chancap(Hchan*); -bool runtime·showframe(Func*); +bool runtime·showframe(Func*, bool); -void runtime·ifaceE2I(struct InterfaceType*, Eface, Iface*); +void runtime·ifaceE2I(InterfaceType*, Eface, Iface*); uintptr runtime·memlimit(void); @@ -738,3 +950,17 @@ uintptr runtime·memlimit(void); // is forced to deliver the signal to a thread that's actually running. // This is a no-op on other systems. 
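/*
 * Usage sketch for the parallel-for declared above, assuming each of the
 * nthr worker threads calls parfordo on the same descriptor:
 *
 *	static void
 *	body(ParFor *desc, uint32 i)
 *	{
 *		((uint32*)desc->ctx)[i] = i;	// illustrative per-index work
 *	}
 *	...
 *	desc = runtime·parforalloc(nthrmax);
 *	runtime·parforsetup(desc, nthr, n, out, true, body);
 *	runtime·parfordo(desc);	// run in each worker; returns when done
 */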
void runtime·setprof(bool); + +// float.c +extern float64 runtime·nan; +extern float64 runtime·posinf; +extern float64 runtime·neginf; +extern uint64 ·nan; +extern uint64 ·posinf; +extern uint64 ·neginf; +#define ISNAN(f) ((f) != (f)) + +enum +{ + UseSpanType = 1, +}; diff --git a/src/pkg/runtime/runtime1.goc b/src/pkg/runtime/runtime1.goc index 667131c1e..d2c38dfef 100644 --- a/src/pkg/runtime/runtime1.goc +++ b/src/pkg/runtime/runtime1.goc @@ -5,10 +5,10 @@ package runtime #include "runtime.h" -func GOMAXPROCS(n int32) (ret int32) { +func GOMAXPROCS(n int) (ret int) { ret = runtime·gomaxprocsfunc(n); } -func NumCPU() (ret int32) { +func NumCPU() (ret int) { ret = runtime·ncpu; } diff --git a/src/pkg/runtime/runtime_test.go b/src/pkg/runtime/runtime_test.go index d68b363e9..e45879349 100644 --- a/src/pkg/runtime/runtime_test.go +++ b/src/pkg/runtime/runtime_test.go @@ -38,3 +38,44 @@ func BenchmarkIfaceCmpNil100(b *testing.B) { } } } + +func BenchmarkDefer(b *testing.B) { + for i := 0; i < b.N; i++ { + defer1() + } +} + +func defer1() { + defer func(x, y, z int) { + if recover() != nil || x != 1 || y != 2 || z != 3 { + panic("bad recover") + } + }(1, 2, 3) + return +} + +func BenchmarkDefer10(b *testing.B) { + for i := 0; i < b.N/10; i++ { + defer2() + } +} + +func defer2() { + for i := 0; i < 10; i++ { + defer func(x, y, z int) { + if recover() != nil || x != 1 || y != 2 || z != 3 { + panic("bad recover") + } + }(1, 2, 3) + } +} + +func BenchmarkDeferMany(b *testing.B) { + for i := 0; i < b.N; i++ { + defer func(x, y, z int) { + if recover() != nil || x != 1 || y != 2 || z != 3 { + panic("bad recover") + } + }(1, 2, 3) + } +} diff --git a/src/pkg/runtime/sema.goc b/src/pkg/runtime/sema.goc index 2300c56aa..c4b5247b3 100644 --- a/src/pkg/runtime/sema.goc +++ b/src/pkg/runtime/sema.goc @@ -24,30 +24,33 @@ package sync typedef struct Sema Sema; struct Sema { - uint32 volatile *addr; - G *g; - Sema *prev; - Sema *next; + uint32 volatile* addr; + G* g; + int64 releasetime; + Sema* prev; + Sema* next; }; typedef struct SemaRoot SemaRoot; struct SemaRoot { - Lock; - Sema *head; - Sema *tail; + Lock; + Sema* head; + Sema* tail; // Number of waiters. Read w/o the lock. - uint32 volatile nwait; + uint32 volatile nwait; }; // Prime to not correlate with any user patterns. #define SEMTABLESZ 251 -static union +union semtable { SemaRoot; uint8 pad[CacheLineSize]; -} semtable[SEMTABLESZ]; +}; +#pragma dataflag 16 /* mark semtable as 'no pointers', hiding from garbage collector */ +static union semtable semtable[SEMTABLESZ]; static SemaRoot* semroot(uint32 *addr) @@ -95,12 +98,13 @@ cansemacquire(uint32 *addr) return 0; } -void -runtime·semacquire(uint32 volatile *addr) +static void +semacquireimpl(uint32 volatile *addr, int32 profile) { - Sema s; + Sema s; // Needs to be allocated on stack, otherwise garbage collector could deallocate it SemaRoot *root; - + int64 t0; + // Easy case. if(cansemacquire(addr)) return; @@ -112,6 +116,12 @@ runtime·semacquire(uint32 volatile *addr) // sleep // (waiter descriptor is dequeued by signaler) root = semroot(addr); + t0 = 0; + s.releasetime = 0; + if(profile && runtime·blockprofilerate > 0) { + t0 = runtime·cputicks(); + s.releasetime = -1; + } for(;;) { runtime·lock(root); // Add ourselves to nwait to disable "easy case" in semrelease. @@ -125,16 +135,22 @@ runtime·semacquire(uint32 volatile *addr) // Any semrelease after the cansemacquire knows we're waiting // (we set nwait above), so go to sleep. 
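/* The fast paths the two sides race on, a sketch assuming the atomic
 * primitives declared in runtime.h; nwait was bumped before the re-check
 * above, so any release that sees nwait != 0 must take the slow path and
 * ready a queued waiter:
 *
 *	// acquire (cansemacquire):
 *	v = runtime·atomicload(addr);
 *	if(v > 0 && runtime·cas(addr, v, v-1))
 *		return 1;	// got it without sleeping
 *	// release:
 *	runtime·xadd(addr, 1);
 *	if(runtime·atomicload(&root->nwait) == 0)
 *		return;		// no one to wake
 */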
semqueue(root, addr, &s); - g->status = Gwaiting; - g->waitreason = "semacquire"; - runtime·unlock(root); - runtime·gosched(); - if(cansemacquire(addr)) + runtime·park(runtime·unlock, root, "semacquire"); + if(cansemacquire(addr)) { + if(t0) + runtime·blockevent(s.releasetime - t0, 3); return; + } } } void +runtime·semacquire(uint32 volatile *addr) +{ + semacquireimpl(addr, 0); +} + +void runtime·semrelease(uint32 volatile *addr) { Sema *s; @@ -165,12 +181,15 @@ runtime·semrelease(uint32 volatile *addr) } } runtime·unlock(root); - if(s) + if(s) { + if(s->releasetime) + s->releasetime = runtime·cputicks(); runtime·ready(s->g); + } } func runtime_Semacquire(addr *uint32) { - runtime·semacquire(addr); + semacquireimpl(addr, 1); } func runtime_Semrelease(addr *uint32) { diff --git a/src/pkg/runtime/signal_darwin_386.c b/src/pkg/runtime/signal_darwin_386.c index 9e986352b..132ca931b 100644 --- a/src/pkg/runtime/signal_darwin_386.c +++ b/src/pkg/runtime/signal_darwin_386.c @@ -47,7 +47,7 @@ runtime·sighandler(int32 sig, Siginfo *info, void *context, G *gp) t = &runtime·sigtab[sig]; if(info->si_code != SI_USER && (t->flags & SigPanic)) { - if(gp == nil) + if(gp == nil || gp == m->g0) goto Throw; // Work around Leopard bug that doesn't set FPE_INTDIV. // Look at instruction to see if it is a divide. @@ -101,7 +101,11 @@ Throw: runtime·printf("%s\n", runtime·sigtab[sig].name); } - runtime·printf("pc: %x\n", r->eip); + runtime·printf("PC=%x\n", r->eip); + if(m->lockedg != nil && m->ncgo > 0 && gp == m->g0) { + runtime·printf("signal arrived during cgo execution\n"); + gp = m->lockedg; + } runtime·printf("\n"); if(runtime·gotraceback()){ @@ -121,6 +125,8 @@ runtime·signalstack(byte *p, int32 n) st.ss_sp = p; st.ss_size = n; st.ss_flags = 0; + if(p == nil) + st.ss_flags = SS_DISABLE; runtime·sigaltstack(&st, nil); } @@ -129,6 +135,15 @@ runtime·setsig(int32 i, void (*fn)(int32, Siginfo*, void*, G*), bool restart) { Sigaction sa; + // If SIGHUP handler is SIG_IGN, assume running + // under nohup and do not set explicit handler. + if(i == SIGHUP) { + runtime·memclr((byte*)&sa, sizeof sa); + runtime·sigaction(i, nil, &sa); + if(*(void**)sa.__sigaction_u == SIG_IGN) + return; + } + runtime·memclr((byte*)&sa, sizeof sa); sa.sa_flags = SA_SIGINFO|SA_ONSTACK; if(restart) diff --git a/src/pkg/runtime/signal_darwin_amd64.c b/src/pkg/runtime/signal_darwin_amd64.c index d9c5f48e7..4b7256bf4 100644 --- a/src/pkg/runtime/signal_darwin_amd64.c +++ b/src/pkg/runtime/signal_darwin_amd64.c @@ -55,7 +55,7 @@ runtime·sighandler(int32 sig, Siginfo *info, void *context, G *gp) t = &runtime·sigtab[sig]; if(info->si_code != SI_USER && (t->flags & SigPanic)) { - if(gp == nil) + if(gp == nil || gp == m->g0) goto Throw; // Work around Leopard bug that doesn't set FPE_INTDIV. // Look at instruction to see if it is a divide. 
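/*
 * The SIGHUP probe added to every setsig above and below follows one
 * pattern: query the inherited disposition first, and if it is SIG_IGN
 * (as under nohup), leave it alone instead of installing the Go handler.
 * In outline:
 *
 *	if(i == SIGHUP) {
 *		runtime·memclr((byte*)&sa, sizeof sa);
 *		runtime·sigaction(i, nil, &sa);	// read, do not modify
 *		if(current handler == SIG_IGN)	// field name is per-OS
 *			return;
 *	}
 */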
@@ -111,7 +111,11 @@ Throw: runtime·printf("%s\n", runtime·sigtab[sig].name); } - runtime·printf("pc: %X\n", r->rip); + runtime·printf("PC=%X\n", r->rip); + if(m->lockedg != nil && m->ncgo > 0 && gp == m->g0) { + runtime·printf("signal arrived during cgo execution\n"); + gp = m->lockedg; + } runtime·printf("\n"); if(runtime·gotraceback()){ @@ -131,6 +135,8 @@ runtime·signalstack(byte *p, int32 n) st.ss_sp = p; st.ss_size = n; st.ss_flags = 0; + if(p == nil) + st.ss_flags = SS_DISABLE; runtime·sigaltstack(&st, nil); } @@ -139,6 +145,15 @@ runtime·setsig(int32 i, void (*fn)(int32, Siginfo*, void*, G*), bool restart) { Sigaction sa; + // If SIGHUP handler is SIG_IGN, assume running + // under nohup and do not set explicit handler. + if(i == SIGHUP) { + runtime·memclr((byte*)&sa, sizeof sa); + runtime·sigaction(i, nil, &sa); + if(*(void**)sa.__sigaction_u == SIG_IGN) + return; + } + runtime·memclr((byte*)&sa, sizeof sa); sa.sa_flags = SA_SIGINFO|SA_ONSTACK; if(restart) diff --git a/src/pkg/runtime/signal_freebsd_386.c b/src/pkg/runtime/signal_freebsd_386.c index 80da95d98..254e5e277 100644 --- a/src/pkg/runtime/signal_freebsd_386.c +++ b/src/pkg/runtime/signal_freebsd_386.c @@ -15,7 +15,7 @@ typedef struct sigaction { void (*__sa_sigaction)(int32, Siginfo*, void *); } __sigaction_u; /* signal handler */ int32 sa_flags; /* see signal options below */ - int64 sa_mask; /* signal mask to apply */ + Sigset sa_mask; /* signal mask to apply */ } Sigaction; void @@ -54,7 +54,7 @@ runtime·sighandler(int32 sig, Siginfo *info, void *context, G *gp) t = &runtime·sigtab[sig]; if(info->si_code != SI_USER && (t->flags & SigPanic)) { - if(gp == nil) + if(gp == nil || gp == m->g0) goto Throw; // Make it look like a call to the signal func. // Have to pass arguments out of band since @@ -97,6 +97,10 @@ Throw: runtime·printf("%s\n", runtime·sigtab[sig].name); runtime·printf("PC=%X\n", r->mc_eip); + if(m->lockedg != nil && m->ncgo > 0 && gp == m->g0) { + runtime·printf("signal arrived during cgo execution\n"); + gp = m->lockedg; + } runtime·printf("\n"); if(runtime·gotraceback()){ @@ -116,6 +120,8 @@ runtime·signalstack(byte *p, int32 n) st.ss_sp = (int8*)p; st.ss_size = n; st.ss_flags = 0; + if(p == nil) + st.ss_flags = SS_DISABLE; runtime·sigaltstack(&st, nil); } @@ -124,11 +130,23 @@ runtime·setsig(int32 i, void (*fn)(int32, Siginfo*, void*, G*), bool restart) { Sigaction sa; + // If SIGHUP handler is SIG_IGN, assume running + // under nohup and do not set explicit handler. 
+ if(i == SIGHUP) { + runtime·memclr((byte*)&sa, sizeof sa); + runtime·sigaction(i, nil, &sa); + if(sa.__sigaction_u.__sa_sigaction == SIG_IGN) + return; + } + runtime·memclr((byte*)&sa, sizeof sa); sa.sa_flags = SA_SIGINFO|SA_ONSTACK; if(restart) sa.sa_flags |= SA_RESTART; - sa.sa_mask = ~0ULL; + sa.sa_mask.__bits[0] = ~(uint32)0; + sa.sa_mask.__bits[1] = ~(uint32)0; + sa.sa_mask.__bits[2] = ~(uint32)0; + sa.sa_mask.__bits[3] = ~(uint32)0; if (fn == runtime·sighandler) fn = (void*)runtime·sigtramp; sa.__sigaction_u.__sa_sigaction = (void*)fn; diff --git a/src/pkg/runtime/signal_freebsd_amd64.c b/src/pkg/runtime/signal_freebsd_amd64.c index e4307682f..7dbf36075 100644 --- a/src/pkg/runtime/signal_freebsd_amd64.c +++ b/src/pkg/runtime/signal_freebsd_amd64.c @@ -15,7 +15,7 @@ typedef struct sigaction { void (*__sa_sigaction)(int32, Siginfo*, void *); } __sigaction_u; /* signal handler */ int32 sa_flags; /* see signal options below */ - int64 sa_mask; /* signal mask to apply */ + Sigset sa_mask; /* signal mask to apply */ } Sigaction; void @@ -62,7 +62,7 @@ runtime·sighandler(int32 sig, Siginfo *info, void *context, G *gp) t = &runtime·sigtab[sig]; if(info->si_code != SI_USER && (t->flags & SigPanic)) { - if(gp == nil) + if(gp == nil || gp == m->g0) goto Throw; // Make it look like a call to the signal func. // Have to pass arguments out of band since @@ -105,6 +105,10 @@ Throw: runtime·printf("%s\n", runtime·sigtab[sig].name); runtime·printf("PC=%X\n", r->mc_rip); + if(m->lockedg != nil && m->ncgo > 0 && gp == m->g0) { + runtime·printf("signal arrived during cgo execution\n"); + gp = m->lockedg; + } runtime·printf("\n"); if(runtime·gotraceback()){ @@ -124,6 +128,8 @@ runtime·signalstack(byte *p, int32 n) st.ss_sp = (int8*)p; st.ss_size = n; st.ss_flags = 0; + if(p == nil) + st.ss_flags = SS_DISABLE; runtime·sigaltstack(&st, nil); } @@ -132,11 +138,23 @@ runtime·setsig(int32 i, void (*fn)(int32, Siginfo*, void*, G*), bool restart) { Sigaction sa; + // If SIGHUP handler is SIG_IGN, assume running + // under nohup and do not set explicit handler. + if(i == SIGHUP) { + runtime·memclr((byte*)&sa, sizeof sa); + runtime·sigaction(i, nil, &sa); + if(sa.__sigaction_u.__sa_sigaction == SIG_IGN) + return; + } + runtime·memclr((byte*)&sa, sizeof sa); sa.sa_flags = SA_SIGINFO|SA_ONSTACK; if(restart) sa.sa_flags |= SA_RESTART; - sa.sa_mask = ~0ULL; + sa.sa_mask.__bits[0] = ~(uint32)0; + sa.sa_mask.__bits[1] = ~(uint32)0; + sa.sa_mask.__bits[2] = ~(uint32)0; + sa.sa_mask.__bits[3] = ~(uint32)0; if (fn == runtime·sighandler) fn = (void*)runtime·sigtramp; sa.__sigaction_u.__sa_sigaction = (void*)fn; diff --git a/src/pkg/runtime/signal_freebsd_arm.c b/src/pkg/runtime/signal_freebsd_arm.c new file mode 100644 index 000000000..50c3221bb --- /dev/null +++ b/src/pkg/runtime/signal_freebsd_arm.c @@ -0,0 +1,193 @@ +// Copyright 2012 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. 
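/*
 * This new file gives FreeBSD/ARM the same treatment as the other
 * signal_*.c files; the r0..r15/cpsr macros below map ARM registers onto
 * mcontext __gregs slots, so e.g. r->r13 reads the stack pointer and
 * r->r15 the program counter.
 */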
+ +#include "runtime.h" +#include "defs_GOOS_GOARCH.h" +#include "signals_GOOS.h" +#include "os_GOOS.h" + +#define r0 __gregs[0] +#define r1 __gregs[1] +#define r2 __gregs[2] +#define r3 __gregs[3] +#define r4 __gregs[4] +#define r5 __gregs[5] +#define r6 __gregs[6] +#define r7 __gregs[7] +#define r8 __gregs[8] +#define r9 __gregs[9] +#define r10 __gregs[10] +#define r11 __gregs[11] +#define r12 __gregs[12] +#define r13 __gregs[13] +#define r14 __gregs[14] +#define r15 __gregs[15] +#define cpsr __gregs[16] + +void +runtime·dumpregs(Mcontext *r) +{ + runtime·printf("r0 %x\n", r->r0); + runtime·printf("r1 %x\n", r->r1); + runtime·printf("r2 %x\n", r->r2); + runtime·printf("r3 %x\n", r->r3); + runtime·printf("r4 %x\n", r->r4); + runtime·printf("r5 %x\n", r->r5); + runtime·printf("r6 %x\n", r->r6); + runtime·printf("r7 %x\n", r->r7); + runtime·printf("r8 %x\n", r->r8); + runtime·printf("r9 %x\n", r->r9); + runtime·printf("r10 %x\n", r->r10); + runtime·printf("fp %x\n", r->r11); + runtime·printf("ip %x\n", r->r12); + runtime·printf("sp %x\n", r->r13); + runtime·printf("lr %x\n", r->r14); + runtime·printf("pc %x\n", r->r15); + runtime·printf("cpsr %x\n", r->cpsr); +} + +extern void runtime·sigtramp(void); + +typedef struct sigaction { + union { + void (*__sa_handler)(int32); + void (*__sa_sigaction)(int32, Siginfo*, void *); + } __sigaction_u; /* signal handler */ + int32 sa_flags; /* see signal options below */ + Sigset sa_mask; /* signal mask to apply */ +} Sigaction; + +void +runtime·sighandler(int32 sig, Siginfo *info, void *context, G *gp) +{ + Ucontext *uc; + Mcontext *r; + SigTab *t; + + uc = context; + r = &uc->uc_mcontext; + + if(sig == SIGPROF) { + runtime·sigprof((uint8*)r->r15, (uint8*)r->r13, (uint8*)r->r14, gp); + return; + } + + t = &runtime·sigtab[sig]; + if(info->si_code != SI_USER && (t->flags & SigPanic)) { + if(gp == nil || gp == m->g0) + goto Throw; + // Make it look like a call to the signal func. + // Have to pass arguments out of band since + // augmenting the stack frame would break + // the unwinding code. + gp->sig = sig; + gp->sigcode0 = info->si_code; + gp->sigcode1 = (uintptr)info->si_addr; + gp->sigpc = r->r15; + + // Only push runtime·sigpanic if r->mc_rip != 0. + // If r->mc_rip == 0, probably panicked because of a + // call to a nil func. Not pushing that onto sp will + // make the trace look like a call to runtime·sigpanic instead. + // (Otherwise the trace will end at runtime·sigpanic and we + // won't get to see who faulted.) 
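/* ("r->mc_rip" in the comment above is an amd64-ism carried along with
 * the comment; on ARM the test below is against r->r15, the PC.) The
 * effect of the next lines is to fabricate a call frame:
 *
 *	lr = old pc;            // return address names the faulting func
 *	pc = runtime·sigpanic;  // resume as if sigpanic had been called
 */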
+ if(r->r15 != 0) + r->r14 = r->r15; + // In case we are panicking from external C code + r->r10 = (uintptr)gp; + r->r9 = (uintptr)m; + r->r15 = (uintptr)runtime·sigpanic; + return; + } + + if(info->si_code == SI_USER || (t->flags & SigNotify)) + if(runtime·sigsend(sig)) + return; + if(t->flags & SigKill) + runtime·exit(2); + if(!(t->flags & SigThrow)) + return; + +Throw: + runtime·startpanic(); + + if(sig < 0 || sig >= NSIG) + runtime·printf("Signal %d\n", sig); + else + runtime·printf("%s\n", runtime·sigtab[sig].name); + + runtime·printf("PC=%x\n", r->r15); + if(m->lockedg != nil && m->ncgo > 0 && gp == m->g0) { + runtime·printf("signal arrived during cgo execution\n"); + gp = m->lockedg; + } + runtime·printf("\n"); + + if(runtime·gotraceback()){ + runtime·traceback((void*)r->r15, (void*)r->r13, (void*)r->r14, gp); + runtime·tracebackothers(gp); + runtime·printf("\n"); + runtime·dumpregs(r); + } + +// breakpoint(); + runtime·exit(2); +} + +void +runtime·signalstack(byte *p, int32 n) +{ + Sigaltstack st; + + st.ss_sp = (uint8*)p; + st.ss_size = n; + st.ss_flags = 0; + if(p == nil) + st.ss_flags = SS_DISABLE; + runtime·sigaltstack(&st, nil); +} + +void +runtime·setsig(int32 i, void (*fn)(int32, Siginfo*, void*, G*), bool restart) +{ + Sigaction sa; + + // If SIGHUP handler is SIG_IGN, assume running + // under nohup and do not set explicit handler. + if(i == SIGHUP) { + runtime·memclr((byte*)&sa, sizeof sa); + runtime·sigaction(i, nil, &sa); + if(sa.__sigaction_u.__sa_sigaction == SIG_IGN) + return; + } + + runtime·memclr((byte*)&sa, sizeof sa); + sa.sa_flags = SA_SIGINFO|SA_ONSTACK; + if(restart) + sa.sa_flags |= SA_RESTART; + sa.sa_mask.__bits[0] = ~(uint32)0; + sa.sa_mask.__bits[1] = ~(uint32)0; + sa.sa_mask.__bits[2] = ~(uint32)0; + sa.sa_mask.__bits[3] = ~(uint32)0; + if (fn == runtime·sighandler) + fn = (void*)runtime·sigtramp; + sa.__sigaction_u.__sa_sigaction = (void*)fn; + runtime·sigaction(i, &sa, nil); +} + +void +runtime·checkgoarm(void) +{ + // TODO(minux) +} + +#pragma textflag 7 +int64 +runtime·cputicks(void) +{ + // Currently cputicks() is used in blocking profiler and to seed runtime·fastrand1(). + // runtime·nanotime() is a poor approximation of CPU ticks that is enough for the profiler. + // TODO: need more entropy to better seed fastrand1. + return runtime·nanotime(); +} diff --git a/src/pkg/runtime/signal_linux_386.c b/src/pkg/runtime/signal_linux_386.c index b154ad887..9b45ec3bd 100644 --- a/src/pkg/runtime/signal_linux_386.c +++ b/src/pkg/runtime/signal_linux_386.c @@ -50,7 +50,7 @@ runtime·sighandler(int32 sig, Siginfo *info, void *context, G *gp) t = &runtime·sigtab[sig]; if(info->si_code != SI_USER && (t->flags & SigPanic)) { - if(gp == nil) + if(gp == nil || gp == m->g0) goto Throw; // Make it look like a call to the signal func. // Have to pass arguments out of band since @@ -93,6 +93,10 @@ Throw: runtime·printf("%s\n", runtime·sigtab[sig].name); runtime·printf("PC=%X\n", r->eip); + if(m->lockedg != nil && m->ncgo > 0 && gp == m->g0) { + runtime·printf("signal arrived during cgo execution\n"); + gp = m->lockedg; + } runtime·printf("\n"); if(runtime·gotraceback()){ @@ -112,6 +116,8 @@ runtime·signalstack(byte *p, int32 n) st.ss_sp = p; st.ss_size = n; st.ss_flags = 0; + if(p == nil) + st.ss_flags = SS_DISABLE; runtime·sigaltstack(&st, nil); } @@ -120,6 +126,16 @@ runtime·setsig(int32 i, void (*fn)(int32, Siginfo*, void*, G*), bool restart) { Sigaction sa; + // If SIGHUP handler is SIG_IGN, assume running + // under nohup and do not set explicit handler. 
+ if(i == SIGHUP) { + runtime·memclr((byte*)&sa, sizeof sa); + if(runtime·rt_sigaction(i, nil, &sa, sizeof(sa.sa_mask)) != 0) + runtime·throw("rt_sigaction read failure"); + if(sa.k_sa_handler == SIG_IGN) + return; + } + runtime·memclr((byte*)&sa, sizeof sa); sa.sa_flags = SA_ONSTACK | SA_SIGINFO | SA_RESTORER; if(restart) @@ -129,7 +145,8 @@ runtime·setsig(int32 i, void (*fn)(int32, Siginfo*, void*, G*), bool restart) if(fn == runtime·sighandler) fn = (void*)runtime·sigtramp; sa.k_sa_handler = fn; - runtime·rt_sigaction(i, &sa, nil, 8); + if(runtime·rt_sigaction(i, &sa, nil, sizeof(sa.sa_mask)) != 0) + runtime·throw("rt_sigaction failure"); } #define AT_NULL 0 diff --git a/src/pkg/runtime/signal_linux_amd64.c b/src/pkg/runtime/signal_linux_amd64.c index 14095ba61..c4e39a6ab 100644 --- a/src/pkg/runtime/signal_linux_amd64.c +++ b/src/pkg/runtime/signal_linux_amd64.c @@ -60,7 +60,7 @@ runtime·sighandler(int32 sig, Siginfo *info, void *context, G *gp) t = &runtime·sigtab[sig]; if(info->si_code != SI_USER && (t->flags & SigPanic)) { - if(gp == nil) + if(gp == nil || gp == m->g0) goto Throw; // Make it look like a call to the signal func. // Have to pass arguments out of band since @@ -103,6 +103,10 @@ Throw: runtime·printf("%s\n", runtime·sigtab[sig].name); runtime·printf("PC=%X\n", r->rip); + if(m->lockedg != nil && m->ncgo > 0 && gp == m->g0) { + runtime·printf("signal arrived during cgo execution\n"); + gp = m->lockedg; + } runtime·printf("\n"); if(runtime·gotraceback()){ @@ -122,6 +126,8 @@ runtime·signalstack(byte *p, int32 n) st.ss_sp = p; st.ss_size = n; st.ss_flags = 0; + if(p == nil) + st.ss_flags = SS_DISABLE; runtime·sigaltstack(&st, nil); } @@ -130,14 +136,27 @@ runtime·setsig(int32 i, void (*fn)(int32, Siginfo*, void*, G*), bool restart) { Sigaction sa; + // If SIGHUP handler is SIG_IGN, assume running + // under nohup and do not set explicit handler. + if(i == SIGHUP) { + runtime·memclr((byte*)&sa, sizeof sa); + if(runtime·rt_sigaction(i, nil, &sa, sizeof(sa.sa_mask)) != 0) + runtime·throw("rt_sigaction read failure"); + if(sa.sa_handler == SIG_IGN) + return; + } + runtime·memclr((byte*)&sa, sizeof sa); sa.sa_flags = SA_ONSTACK | SA_SIGINFO | SA_RESTORER; if(restart) sa.sa_flags |= SA_RESTART; sa.sa_mask = ~0ULL; + // TODO(adonovan): Linux manpage says "sa_restorer element is + // obsolete and should not be used". Avoid it here, and test. sa.sa_restorer = (void*)runtime·sigreturn; if(fn == runtime·sighandler) fn = (void*)runtime·sigtramp; sa.sa_handler = fn; - runtime·rt_sigaction(i, &sa, nil, 8); + if(runtime·rt_sigaction(i, &sa, nil, sizeof(sa.sa_mask)) != 0) + runtime·throw("rt_sigaction failure"); } diff --git a/src/pkg/runtime/signal_linux_arm.c b/src/pkg/runtime/signal_linux_arm.c index 176a4ce56..c26caa7cd 100644 --- a/src/pkg/runtime/signal_linux_arm.c +++ b/src/pkg/runtime/signal_linux_arm.c @@ -57,7 +57,7 @@ runtime·sighandler(int32 sig, Siginfo *info, void *context, G *gp) t = &runtime·sigtab[sig]; if(info->si_code != SI_USER && (t->flags & SigPanic)) { - if(gp == nil) + if(gp == nil || gp == m->g0) goto Throw; // Make it look like a call to the signal func. // Have to pass arguments out of band since @@ -68,13 +68,22 @@ runtime·sighandler(int32 sig, Siginfo *info, void *context, G *gp) gp->sigcode1 = r->fault_address; gp->sigpc = r->arm_pc; - // If this is a leaf function, we do smash LR, - // but we're not going back there anyway. 
- // Don't bother smashing if r->arm_pc is 0, - // which is probably a call to a nil func: the - // old link register is more useful in the stack trace. + // We arrange lr, and pc to pretend the panicking + // function calls sigpanic directly. + // Always save LR to stack so that panics in leaf + // functions are correctly handled. This smashes + // the stack frame but we're not going back there + // anyway. + r->arm_sp -= 4; + *(uint32 *)r->arm_sp = r->arm_lr; + // Don't bother saving PC if it's zero, which is + // probably a call to a nil func: the old link register + // is more useful in the stack trace. if(r->arm_pc != 0) r->arm_lr = r->arm_pc; + // In case we are panicking from external C code + r->arm_r10 = (uintptr)gp; + r->arm_r9 = (uintptr)m; r->arm_pc = (uintptr)runtime·sigpanic; return; } @@ -98,6 +107,10 @@ Throw: runtime·printf("%s\n", runtime·sigtab[sig].name); runtime·printf("PC=%x\n", r->arm_pc); + if(m->lockedg != nil && m->ncgo > 0 && gp == m->g0) { + runtime·printf("signal arrived during cgo execution\n"); + gp = m->lockedg; + } runtime·printf("\n"); if(runtime·gotraceback()){ @@ -119,6 +132,8 @@ runtime·signalstack(byte *p, int32 n) st.ss_sp = p; st.ss_size = n; st.ss_flags = 0; + if(p == nil) + st.ss_flags = SS_DISABLE; runtime·sigaltstack(&st, nil); } @@ -127,6 +142,16 @@ runtime·setsig(int32 i, void (*fn)(int32, Siginfo*, void*, G*), bool restart) { Sigaction sa; + // If SIGHUP handler is SIG_IGN, assume running + // under nohup and do not set explicit handler. + if(i == SIGHUP) { + runtime·memclr((byte*)&sa, sizeof sa); + if(runtime·rt_sigaction(i, nil, &sa, sizeof(sa.sa_mask)) != 0) + runtime·throw("rt_sigaction read failure"); + if(sa.sa_handler == SIG_IGN) + return; + } + runtime·memclr((byte*)&sa, sizeof sa); sa.sa_flags = SA_ONSTACK | SA_SIGINFO | SA_RESTORER; if(restart) @@ -136,5 +161,81 @@ runtime·setsig(int32 i, void (*fn)(int32, Siginfo*, void*, G*), bool restart) if(fn == runtime·sighandler) fn = (void*)runtime·sigtramp; sa.sa_handler = fn; - runtime·rt_sigaction(i, &sa, nil, 8); + if(runtime·rt_sigaction(i, &sa, nil, sizeof(sa.sa_mask)) != 0) + runtime·throw("rt_sigaction failure"); +} + +#define AT_NULL 0 +#define AT_PLATFORM 15 // introduced in at least 2.6.11 +#define AT_HWCAP 16 // introduced in at least 2.6.11 +#define AT_RANDOM 25 // introduced in 2.6.29 +#define HWCAP_VFP (1 << 6) // introduced in at least 2.6.11 +#define HWCAP_VFPv3 (1 << 13) // introduced in 2.6.30 +static uint32 runtime·randomNumber; +uint8 runtime·armArch = 6; // we default to ARMv6 +uint32 runtime·hwcap; // set by setup_auxv +uint8 runtime·goarm; // set by 5l + +void +runtime·checkgoarm(void) +{ + if(runtime·goarm > 5 && !(runtime·hwcap & HWCAP_VFP)) { + runtime·printf("runtime: this CPU has no floating point hardware, so it cannot run\n"); + runtime·printf("this GOARM=%d binary. Recompile using GOARM=5.\n", runtime·goarm); + runtime·exit(1); + } + if(runtime·goarm > 6 && !(runtime·hwcap & HWCAP_VFPv3)) { + runtime·printf("runtime: this CPU has no VFPv3 floating point hardware, so it cannot run\n"); + runtime·printf("this GOARM=%d binary. Recompile using GOARM=6.\n", runtime·goarm); + runtime·exit(1); + } +} + +#pragma textflag 7 +void +runtime·setup_auxv(int32 argc, void *argv_list) +{ + byte **argv; + byte **envp; + byte *rnd; + uint32 *auxv; + uint32 t; + + argv = &argv_list; + + // skip envp to get to ELF auxiliary vector. 
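/* Initial stack as the kernel lays it out, which the loops below walk:
 *
 *	argv[0] .. argv[argc-1], nil,
 *	envp[0] .. envp[m-1], nil,
 *	auxv: (uint32 type, uint32 value) pairs, ending with AT_NULL
 */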
+ for(envp = &argv[argc+1]; *envp != nil; envp++) + ; + envp++; + + for(auxv=(uint32*)envp; auxv[0] != AT_NULL; auxv += 2) { + switch(auxv[0]) { + case AT_RANDOM: // kernel provided 16-byte worth of random data + if(auxv[1]) { + rnd = (byte*)auxv[1]; + runtime·randomNumber = rnd[4] | rnd[5]<<8 | rnd[6]<<16 | rnd[7]<<24; + } + break; + case AT_PLATFORM: // v5l, v6l, v7l + if(auxv[1]) { + t = *(uint8*)(auxv[1]+1); + if(t >= '5' && t <= '7') + runtime·armArch = t - '0'; + } + break; + case AT_HWCAP: // CPU capability bit flags + runtime·hwcap = auxv[1]; + break; + } + } +} + +#pragma textflag 7 +int64 +runtime·cputicks(void) +{ + // Currently cputicks() is used in blocking profiler and to seed runtime·fastrand1(). + // runtime·nanotime() is a poor approximation of CPU ticks that is enough for the profiler. + // runtime·randomNumber provides better seeding of fastrand1. + return runtime·nanotime() + runtime·randomNumber; } diff --git a/src/pkg/runtime/signal_netbsd_386.c b/src/pkg/runtime/signal_netbsd_386.c index 39d829484..08744c425 100644 --- a/src/pkg/runtime/signal_netbsd_386.c +++ b/src/pkg/runtime/signal_netbsd_386.c @@ -7,76 +7,79 @@ #include "signals_GOOS.h" #include "os_GOOS.h" +extern void runtime·lwp_tramp(void); extern void runtime·sigtramp(void); typedef struct sigaction { union { - void (*__sa_handler)(int32); - void (*__sa_sigaction)(int32, Siginfo*, void *); - } __sigaction_u; /* signal handler */ - uint32 sa_mask; /* signal mask to apply */ + void (*_sa_handler)(int32); + void (*_sa_sigaction)(int32, Siginfo*, void *); + } _sa_u; /* signal handler */ + uint32 sa_mask[4]; /* signal mask to apply */ int32 sa_flags; /* see signal options below */ } Sigaction; void -runtime·dumpregs(Sigcontext *r) +runtime·dumpregs(McontextT *mc) { - runtime·printf("eax %x\n", r->sc_eax); - runtime·printf("ebx %x\n", r->sc_ebx); - runtime·printf("ecx %x\n", r->sc_ecx); - runtime·printf("edx %x\n", r->sc_edx); - runtime·printf("edi %x\n", r->sc_edi); - runtime·printf("esi %x\n", r->sc_esi); - runtime·printf("ebp %x\n", r->sc_ebp); - runtime·printf("esp %x\n", r->sc_esp); - runtime·printf("eip %x\n", r->sc_eip); - runtime·printf("eflags %x\n", r->sc_eflags); - runtime·printf("cs %x\n", r->sc_cs); - runtime·printf("fs %x\n", r->sc_fs); - runtime·printf("gs %x\n", r->sc_gs); + runtime·printf("eax %x\n", mc->__gregs[REG_EAX]); + runtime·printf("ebx %x\n", mc->__gregs[REG_EBX]); + runtime·printf("ecx %x\n", mc->__gregs[REG_ECX]); + runtime·printf("edx %x\n", mc->__gregs[REG_EDX]); + runtime·printf("edi %x\n", mc->__gregs[REG_EDI]); + runtime·printf("esi %x\n", mc->__gregs[REG_ESI]); + runtime·printf("ebp %x\n", mc->__gregs[REG_EBP]); + runtime·printf("esp %x\n", mc->__gregs[REG_UESP]); + runtime·printf("eip %x\n", mc->__gregs[REG_EIP]); + runtime·printf("eflags %x\n", mc->__gregs[REG_EFL]); + runtime·printf("cs %x\n", mc->__gregs[REG_CS]); + runtime·printf("fs %x\n", mc->__gregs[REG_FS]); + runtime·printf("gs %x\n", mc->__gregs[REG_GS]); } void runtime·sighandler(int32 sig, Siginfo *info, void *context, G *gp) { - Sigcontext *r = context; + UcontextT *uc = context; + McontextT *mc = &uc->uc_mcontext; uintptr *sp; SigTab *t; if(sig == SIGPROF) { - runtime·sigprof((uint8*)r->sc_eip, (uint8*)r->sc_esp, nil, gp); + runtime·sigprof((uint8*)mc->__gregs[REG_EIP], + (uint8*)mc->__gregs[REG_UESP], nil, gp); return; } t = &runtime·sigtab[sig]; - if(info->si_code != SI_USER && (t->flags & SigPanic)) { - if(gp == nil) + if(info->_code != SI_USER && (t->flags & SigPanic)) { + if(gp == nil || gp == m->g0) goto 
Throw; // Make it look like a call to the signal func. - // Have to pass arguments out of band since + // We need to pass arguments out of band since // augmenting the stack frame would break // the unwinding code. gp->sig = sig; - gp->sigcode0 = info->si_code; - gp->sigcode1 = *(uintptr*)((byte*)info + 12); /* si_addr */ - gp->sigpc = r->sc_eip; - - // Only push runtime·sigpanic if r->sc_eip != 0. - // If r->sc_eip == 0, probably panicked because of a - // call to a nil func. Not pushing that onto sp will - // make the trace look like a call to runtime·sigpanic instead. - // (Otherwise the trace will end at runtime·sigpanic and we - // won't get to see who faulted.) - if(r->sc_eip != 0) { - sp = (uintptr*)r->sc_esp; - *--sp = r->sc_eip; - r->sc_esp = (uintptr)sp; + gp->sigcode0 = info->_code; + gp->sigcode1 = *(uintptr*)&info->_reason[0]; /* _addr */ + gp->sigpc = mc->__gregs[REG_EIP]; + + // Only push runtime·sigpanic if __gregs[REG_EIP] != 0. + // If __gregs[REG_EIP] == 0, probably panicked because of a + // call to a nil func. Not pushing that onto sp will make the + // trace look like a call to runtime·sigpanic instead. + // (Otherwise the trace will end at runtime·sigpanic + // and we won't get to see who faulted.) + if(mc->__gregs[REG_EIP] != 0) { + sp = (uintptr*)mc->__gregs[REG_UESP]; + *--sp = mc->__gregs[REG_EIP]; + mc->__gregs[REG_UESP] = (uintptr)sp; } - r->sc_eip = (uintptr)runtime·sigpanic; + mc->__gregs[REG_EIP] = (uintptr)runtime·sigpanic; return; } - if(info->si_code == SI_USER || (t->flags & SigNotify)) + if(info->_code == SI_USER || (t->flags & SigNotify)) if(runtime·sigsend(sig)) return; if(t->flags & SigKill) @@ -92,13 +95,18 @@ Throw: else runtime·printf("%s\n", runtime·sigtab[sig].name); - runtime·printf("PC=%X\n", r->sc_eip); + runtime·printf("PC=%X\n", mc->__gregs[REG_EIP]); + if(m->lockedg != nil && m->ncgo > 0 && gp == m->g0) { + runtime·printf("signal arrived during cgo execution\n"); + gp = m->lockedg; + } runtime·printf("\n"); if(runtime·gotraceback()){ - runtime·traceback((void*)r->sc_eip, (void*)r->sc_esp, 0, gp); + runtime·traceback((void*)mc->__gregs[REG_EIP], + (void*)mc->__gregs[REG_UESP], 0, gp); runtime·tracebackothers(gp); - runtime·dumpregs(r); + runtime·dumpregs(mc); } runtime·exit(2); @@ -109,9 +117,11 @@ runtime·signalstack(byte *p, int32 n) { Sigaltstack st; - st.ss_sp = (int8*)p; + st.ss_sp = p; st.ss_size = n; st.ss_flags = 0; + if(p == nil) + st.ss_flags = SS_DISABLE; runtime·sigaltstack(&st, nil); } @@ -120,13 +130,35 @@ runtime·setsig(int32 i, void (*fn)(int32, Siginfo*, void*, G*), bool restart) { Sigaction sa; + // If SIGHUP handler is SIG_IGN, assume running + // under nohup and do not set explicit handler. 
+ if(i == SIGHUP) { + runtime·memclr((byte*)&sa, sizeof sa); + runtime·sigaction(i, nil, &sa); + if(sa._sa_u._sa_sigaction == SIG_IGN) + return; + } + runtime·memclr((byte*)&sa, sizeof sa); sa.sa_flags = SA_SIGINFO|SA_ONSTACK; if(restart) sa.sa_flags |= SA_RESTART; - sa.sa_mask = ~0ULL; + sa.sa_mask[0] = ~0U; + sa.sa_mask[1] = ~0U; + sa.sa_mask[2] = ~0U; + sa.sa_mask[3] = ~0U; if (fn == runtime·sighandler) fn = (void*)runtime·sigtramp; - sa.__sigaction_u.__sa_sigaction = (void*)fn; + sa._sa_u._sa_sigaction = (void*)fn; runtime·sigaction(i, &sa, nil); } + +void +runtime·lwp_mcontext_init(McontextT *mc, void *stack, M *mp, G *gp, void (*fn)(void)) +{ + mc->__gregs[REG_EIP] = (uint32)runtime·lwp_tramp; + mc->__gregs[REG_UESP] = (uint32)stack; + mc->__gregs[REG_EBX] = (uint32)mp; + mc->__gregs[REG_EDX] = (uint32)gp; + mc->__gregs[REG_ESI] = (uint32)fn; +} diff --git a/src/pkg/runtime/signal_netbsd_amd64.c b/src/pkg/runtime/signal_netbsd_amd64.c index 8b4f624e7..46afb682b 100644 --- a/src/pkg/runtime/signal_netbsd_amd64.c +++ b/src/pkg/runtime/signal_netbsd_amd64.c @@ -7,85 +7,86 @@ #include "signals_GOOS.h" #include "os_GOOS.h" +extern void runtime·lwp_tramp(void); extern void runtime·sigtramp(void); typedef struct sigaction { union { - void (*__sa_handler)(int32); - void (*__sa_sigaction)(int32, Siginfo*, void *); - } __sigaction_u; /* signal handler */ - uint32 sa_mask; /* signal mask to apply */ + void (*_sa_handler)(int32); + void (*_sa_sigaction)(int32, Siginfo*, void *); + } _sa_u; /* signal handler */ + uint32 sa_mask[4]; /* signal mask to apply */ int32 sa_flags; /* see signal options below */ } Sigaction; void -runtime·dumpregs(Sigcontext *r) +runtime·dumpregs(McontextT *mc) { - runtime·printf("rax %X\n", r->sc_rax); - runtime·printf("rbx %X\n", r->sc_rbx); - runtime·printf("rcx %X\n", r->sc_rcx); - runtime·printf("rdx %X\n", r->sc_rdx); - runtime·printf("rdi %X\n", r->sc_rdi); - runtime·printf("rsi %X\n", r->sc_rsi); - runtime·printf("rbp %X\n", r->sc_rbp); - runtime·printf("rsp %X\n", r->sc_rsp); - runtime·printf("r8 %X\n", r->sc_r8); - runtime·printf("r9 %X\n", r->sc_r9); - runtime·printf("r10 %X\n", r->sc_r10); - runtime·printf("r11 %X\n", r->sc_r11); - runtime·printf("r12 %X\n", r->sc_r12); - runtime·printf("r13 %X\n", r->sc_r13); - runtime·printf("r14 %X\n", r->sc_r14); - runtime·printf("r15 %X\n", r->sc_r15); - runtime·printf("rip %X\n", r->sc_rip); - runtime·printf("rflags %X\n", r->sc_rflags); - runtime·printf("cs %X\n", r->sc_cs); - runtime·printf("fs %X\n", r->sc_fs); - runtime·printf("gs %X\n", r->sc_gs); + runtime·printf("rax %X\n", mc->__gregs[REG_RAX]); + runtime·printf("rbx %X\n", mc->__gregs[REG_RBX]); + runtime·printf("rcx %X\n", mc->__gregs[REG_RCX]); + runtime·printf("rdx %X\n", mc->__gregs[REG_RDX]); + runtime·printf("rdi %X\n", mc->__gregs[REG_RDI]); + runtime·printf("rsi %X\n", mc->__gregs[REG_RSI]); + runtime·printf("rbp %X\n", mc->__gregs[REG_RBP]); + runtime·printf("rsp %X\n", mc->__gregs[REG_RSP]); + runtime·printf("r8 %X\n", mc->__gregs[REG_R8]); + runtime·printf("r9 %X\n", mc->__gregs[REG_R9]); + runtime·printf("r10 %X\n", mc->__gregs[REG_R10]); + runtime·printf("r11 %X\n", mc->__gregs[REG_R11]); + runtime·printf("r12 %X\n", mc->__gregs[REG_R12]); + runtime·printf("r13 %X\n", mc->__gregs[REG_R13]); + runtime·printf("r14 %X\n", mc->__gregs[REG_R14]); + runtime·printf("r15 %X\n", mc->__gregs[REG_R15]); + runtime·printf("rip %X\n", mc->__gregs[REG_RIP]); + runtime·printf("rflags %X\n", mc->__gregs[REG_RFLAGS]); + runtime·printf("cs %X\n", 
mc->__gregs[REG_CS]); + runtime·printf("fs %X\n", mc->__gregs[REG_FS]); + runtime·printf("gs %X\n", mc->__gregs[REG_GS]); } void runtime·sighandler(int32 sig, Siginfo *info, void *context, G *gp) { - Sigcontext *r = context; + UcontextT *uc = context; + McontextT *mc = &uc->uc_mcontext; uintptr *sp; SigTab *t; if(sig == SIGPROF) { - runtime·sigprof((uint8*)r->sc_rip, - (uint8*)r->sc_rsp, nil, gp); + runtime·sigprof((uint8*)mc->__gregs[REG_RIP], + (uint8*)mc->__gregs[REG_RSP], nil, gp); return; } t = &runtime·sigtab[sig]; - if(info->si_code != SI_USER && (t->flags & SigPanic)) { - if(gp == nil) + if(info->_code != SI_USER && (t->flags & SigPanic)) { + if(gp == nil || gp == m->g0) goto Throw; // Make it look like a call to the signal func. - // Have to pass arguments out of band since - // augmenting the stack frame would break - // the unwinding code. + // We need to pass arguments out of band since augmenting the + // stack frame would break the unwinding code. gp->sig = sig; - gp->sigcode0 = info->si_code; - gp->sigcode1 = *(uintptr*)((byte*)info + 16); /* si_addr */ - gp->sigpc = r->sc_rip; - - // Only push runtime·sigpanic if r->mc_rip != 0. - // If r->mc_rip == 0, probably panicked because of a - // call to a nil func. Not pushing that onto sp will - // make the trace look like a call to runtime·sigpanic instead. - // (Otherwise the trace will end at runtime·sigpanic and we - // won't get to see who faulted.) - if(r->sc_rip != 0) { - sp = (uintptr*)r->sc_rsp; - *--sp = r->sc_rip; - r->sc_rsp = (uintptr)sp; + gp->sigcode0 = info->_code; + gp->sigcode1 = *(uintptr*)&info->_reason[0]; /* _addr */ + gp->sigpc = mc->__gregs[REG_RIP]; + + // Only push runtime·sigpanic if __gregs[REG_RIP] != 0. + // If __gregs[REG_RIP] == 0, probably panicked because of a + // call to a nil func. Not pushing that onto sp will make the + // trace look like a call to runtime·sigpanic instead. + // (Otherwise the trace will end at runtime·sigpanic + // and we won't get to see who faulted.) + if(mc->__gregs[REG_RIP] != 0) { + sp = (uintptr*)mc->__gregs[REG_RSP]; + *--sp = mc->__gregs[REG_RIP]; + mc->__gregs[REG_RSP] = (uintptr)sp; } - r->sc_rip = (uintptr)runtime·sigpanic; + mc->__gregs[REG_RIP] = (uintptr)runtime·sigpanic; return; } - if(info->si_code == SI_USER || (t->flags & SigNotify)) + if(info->_code == SI_USER || (t->flags & SigNotify)) if(runtime·sigsend(sig)) return; if(t->flags & SigKill) @@ -101,13 +102,18 @@ Throw: else runtime·printf("%s\n", runtime·sigtab[sig].name); - runtime·printf("PC=%X\n", r->sc_rip); + runtime·printf("PC=%X\n", mc->__gregs[REG_RIP]); + if(m->lockedg != nil && m->ncgo > 0 && gp == m->g0) { + runtime·printf("signal arrived during cgo execution\n"); + gp = m->lockedg; + } runtime·printf("\n"); if(runtime·gotraceback()){ - runtime·traceback((void*)r->sc_rip, (void*)r->sc_rsp, 0, gp); + runtime·traceback((void*)mc->__gregs[REG_RIP], + (void*)mc->__gregs[REG_RSP], 0, gp); runtime·tracebackothers(gp); - runtime·dumpregs(r); + runtime·dumpregs(mc); } runtime·exit(2); @@ -118,9 +124,11 @@ runtime·signalstack(byte *p, int32 n) { Sigaltstack st; - st.ss_sp = (int8*)p; + st.ss_sp = p; st.ss_size = n; st.ss_flags = 0; + if(p == nil) + st.ss_flags = SS_DISABLE; runtime·sigaltstack(&st, nil); } @@ -129,13 +137,36 @@ runtime·setsig(int32 i, void (*fn)(int32, Siginfo*, void*, G*), bool restart) { Sigaction sa; + // If SIGHUP handler is SIG_IGN, assume running + // under nohup and do not set explicit handler. 
+ if(i == SIGHUP) { + runtime·memclr((byte*)&sa, sizeof sa); + runtime·sigaction(i, nil, &sa); + if(sa._sa_u._sa_sigaction == SIG_IGN) + return; + } + runtime·memclr((byte*)&sa, sizeof sa); sa.sa_flags = SA_SIGINFO|SA_ONSTACK; if(restart) sa.sa_flags |= SA_RESTART; - sa.sa_mask = ~0ULL; + sa.sa_mask[0] = ~0U; + sa.sa_mask[1] = ~0U; + sa.sa_mask[2] = ~0U; + sa.sa_mask[3] = ~0U; if (fn == runtime·sighandler) fn = (void*)runtime·sigtramp; - sa.__sigaction_u.__sa_sigaction = (void*)fn; + sa._sa_u._sa_sigaction = (void*)fn; runtime·sigaction(i, &sa, nil); } + +void +runtime·lwp_mcontext_init(McontextT *mc, void *stack, M *mp, G *gp, void (*fn)(void)) +{ + // Machine dependent mcontext initialisation for LWP. + mc->__gregs[REG_RIP] = (uint64)runtime·lwp_tramp; + mc->__gregs[REG_RSP] = (uint64)stack; + mc->__gregs[REG_R8] = (uint64)mp; + mc->__gregs[REG_R9] = (uint64)gp; + mc->__gregs[REG_R12] = (uint64)fn; +} diff --git a/src/pkg/runtime/signal_netbsd_arm.c b/src/pkg/runtime/signal_netbsd_arm.c new file mode 100644 index 000000000..97f62687b --- /dev/null +++ b/src/pkg/runtime/signal_netbsd_arm.c @@ -0,0 +1,208 @@ +// Copyright 2013 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#include "runtime.h" +#include "defs_GOOS_GOARCH.h" +#include "signals_GOOS.h" +#include "os_GOOS.h" + +#define r0 __gregs[0] +#define r1 __gregs[1] +#define r2 __gregs[2] +#define r3 __gregs[3] +#define r4 __gregs[4] +#define r5 __gregs[5] +#define r6 __gregs[6] +#define r7 __gregs[7] +#define r8 __gregs[8] +#define r9 __gregs[9] +#define r10 __gregs[10] +#define r11 __gregs[11] +#define r12 __gregs[12] +#define r13 __gregs[13] +#define r14 __gregs[14] +#define r15 __gregs[15] +#define cpsr __gregs[16] + +void +runtime·dumpregs(McontextT *r) +{ + runtime·printf("r0 %x\n", r->r0); + runtime·printf("r1 %x\n", r->r1); + runtime·printf("r2 %x\n", r->r2); + runtime·printf("r3 %x\n", r->r3); + runtime·printf("r4 %x\n", r->r4); + runtime·printf("r5 %x\n", r->r5); + runtime·printf("r6 %x\n", r->r6); + runtime·printf("r7 %x\n", r->r7); + runtime·printf("r8 %x\n", r->r8); + runtime·printf("r9 %x\n", r->r9); + runtime·printf("r10 %x\n", r->r10); + runtime·printf("fp %x\n", r->r11); + runtime·printf("ip %x\n", r->r12); + runtime·printf("sp %x\n", r->r13); + runtime·printf("lr %x\n", r->r14); + runtime·printf("pc %x\n", r->r15); + runtime·printf("cpsr %x\n", r->cpsr); +} + +extern void runtime·lwp_tramp(void); +extern void runtime·sigtramp(void); + +typedef struct sigaction { + union { + void (*_sa_handler)(int32); + void (*_sa_sigaction)(int32, Siginfo*, void *); + } _sa_u; /* signal handler */ + uint32 sa_mask[4]; /* signal mask to apply */ + int32 sa_flags; /* see signal options below */ +} Sigaction; + +void +runtime·sighandler(int32 sig, Siginfo *info, void *context, G *gp) +{ + UcontextT *uc; + McontextT *r; + SigTab *t; + + uc = context; + r = &uc->uc_mcontext; + + if(sig == SIGPROF) { + runtime·sigprof((uint8*)r->r15, (uint8*)r->r13, (uint8*)r->r14, gp); + return; + } + + t = &runtime·sigtab[sig]; + if(info->_code != SI_USER && (t->flags & SigPanic)) { + if(gp == nil || gp == m->g0) + goto Throw; + // Make it look like a call to the signal func. + // We have to pass arguments out of band since + // augmenting the stack frame would break + // the unwinding code. 
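/* "Out of band" means the values travel in the G itself rather than as
 * pushed arguments; runtime·sigpanic later reads gp->sig, gp->sigcode0,
 * and gp->sigcode1 to decide which panic to raise. */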
+ gp->sig = sig; + gp->sigcode0 = info->_code; + gp->sigcode1 = *(uintptr*)&info->_reason[0]; /* _addr */ + gp->sigpc = r->r15; + + // We arrange lr, and pc to pretend the panicking + // function calls sigpanic directly. + // Always save LR to stack so that panics in leaf + // functions are correctly handled. This smashes + // the stack frame but we're not going back there + // anyway. + r->r13 -= 4; + *(uint32 *)r->r13 = r->r14; + // Don't bother saving PC if it's zero, which is + // probably a call to a nil func: the old link register + // is more useful in the stack trace. + if(r->r15 != 0) + r->r14 = r->r15; + // In case we are panicking from external C code + r->r10 = (uintptr)gp; + r->r9 = (uintptr)m; + r->r15 = (uintptr)runtime·sigpanic; + return; + } + + if(info->_code == SI_USER || (t->flags & SigNotify)) + if(runtime·sigsend(sig)) + return; + if(t->flags & SigKill) + runtime·exit(2); + if(!(t->flags & SigThrow)) + return; + +Throw: + runtime·startpanic(); + + if(sig < 0 || sig >= NSIG) + runtime·printf("Signal %d\n", sig); + else + runtime·printf("%s\n", runtime·sigtab[sig].name); + + runtime·printf("PC=%x\n", r->r15); + if(m->lockedg != nil && m->ncgo > 0 && gp == m->g0) { + runtime·printf("signal arrived during cgo execution\n"); + gp = m->lockedg; + } + runtime·printf("\n"); + + if(runtime·gotraceback()){ + runtime·traceback((void*)r->r15, (void*)r->r13, (void*)r->r14, gp); + runtime·tracebackothers(gp); + runtime·printf("\n"); + runtime·dumpregs(r); + } + +// breakpoint(); + runtime·exit(2); +} + +void +runtime·signalstack(byte *p, int32 n) +{ + Sigaltstack st; + + st.ss_sp = (uint8*)p; + st.ss_size = n; + st.ss_flags = 0; + if(p == nil) + st.ss_flags = SS_DISABLE; + runtime·sigaltstack(&st, nil); +} + +void +runtime·setsig(int32 i, void (*fn)(int32, Siginfo*, void*, G*), bool restart) +{ + Sigaction sa; + + // If SIGHUP handler is SIG_IGN, assume running + // under nohup and do not set explicit handler. + if(i == SIGHUP) { + runtime·memclr((byte*)&sa, sizeof sa); + runtime·sigaction(i, nil, &sa); + if(sa._sa_u._sa_sigaction == SIG_IGN) + return; + } + + runtime·memclr((byte*)&sa, sizeof sa); + sa.sa_flags = SA_SIGINFO|SA_ONSTACK; + if(restart) + sa.sa_flags |= SA_RESTART; + sa.sa_mask[0] = ~0U; + sa.sa_mask[1] = ~0U; + sa.sa_mask[2] = ~0U; + sa.sa_mask[3] = ~0U; + if (fn == runtime·sighandler) + fn = (void*)runtime·sigtramp; + sa._sa_u._sa_sigaction = (void*)fn; + runtime·sigaction(i, &sa, nil); +} + +void +runtime·lwp_mcontext_init(McontextT *mc, void *stack, M *mp, G *gp, void (*fn)(void)) +{ + mc->r15 = (uint32)runtime·lwp_tramp; + mc->r13 = (uint32)stack; + mc->r0 = (uint32)mp; + mc->r1 = (uint32)gp; + mc->r2 = (uint32)fn; +} + +void +runtime·checkgoarm(void) +{ + // TODO(minux) +} + +#pragma textflag 7 +int64 +runtime·cputicks() { + // Currently cputicks() is used in blocking profiler and to seed runtime·fastrand1(). + // runtime·nanotime() is a poor approximation of CPU ticks that is enough for the profiler. + // TODO: need more entropy to better seed fastrand1. + return runtime·nanotime(); +} diff --git a/src/pkg/runtime/signal_openbsd_386.c b/src/pkg/runtime/signal_openbsd_386.c index 39d829484..516797c8d 100644 --- a/src/pkg/runtime/signal_openbsd_386.c +++ b/src/pkg/runtime/signal_openbsd_386.c @@ -50,7 +50,7 @@ runtime·sighandler(int32 sig, Siginfo *info, void *context, G *gp) t = &runtime·sigtab[sig]; if(info->si_code != SI_USER && (t->flags & SigPanic)) { - if(gp == nil) + if(gp == nil || gp == m->g0) goto Throw; // Make it look like a call to the signal func. 
// Have to pass arguments out of band since @@ -93,6 +93,10 @@ Throw: runtime·printf("%s\n", runtime·sigtab[sig].name); runtime·printf("PC=%X\n", r->sc_eip); + if(m->lockedg != nil && m->ncgo > 0 && gp == m->g0) { + runtime·printf("signal arrived during cgo execution\n"); + gp = m->lockedg; + } runtime·printf("\n"); if(runtime·gotraceback()){ @@ -109,9 +113,11 @@ runtime·signalstack(byte *p, int32 n) { Sigaltstack st; - st.ss_sp = (int8*)p; + st.ss_sp = p; st.ss_size = n; st.ss_flags = 0; + if(p == nil) + st.ss_flags = SS_DISABLE; runtime·sigaltstack(&st, nil); } @@ -120,6 +126,15 @@ runtime·setsig(int32 i, void (*fn)(int32, Siginfo*, void*, G*), bool restart) { Sigaction sa; + // If SIGHUP handler is SIG_IGN, assume running + // under nohup and do not set explicit handler. + if(i == SIGHUP) { + runtime·memclr((byte*)&sa, sizeof sa); + runtime·sigaction(i, nil, &sa); + if(sa.__sigaction_u.__sa_sigaction == SIG_IGN) + return; + } + runtime·memclr((byte*)&sa, sizeof sa); sa.sa_flags = SA_SIGINFO|SA_ONSTACK; if(restart) diff --git a/src/pkg/runtime/signal_openbsd_amd64.c b/src/pkg/runtime/signal_openbsd_amd64.c index 8b4f624e7..0d0db770b 100644 --- a/src/pkg/runtime/signal_openbsd_amd64.c +++ b/src/pkg/runtime/signal_openbsd_amd64.c @@ -59,7 +59,7 @@ runtime·sighandler(int32 sig, Siginfo *info, void *context, G *gp) t = &runtime·sigtab[sig]; if(info->si_code != SI_USER && (t->flags & SigPanic)) { - if(gp == nil) + if(gp == nil || gp == m->g0) goto Throw; // Make it look like a call to the signal func. // Have to pass arguments out of band since @@ -70,8 +70,8 @@ runtime·sighandler(int32 sig, Siginfo *info, void *context, G *gp) gp->sigcode1 = *(uintptr*)((byte*)info + 16); /* si_addr */ gp->sigpc = r->sc_rip; - // Only push runtime·sigpanic if r->mc_rip != 0. - // If r->mc_rip == 0, probably panicked because of a + // Only push runtime·sigpanic if r->sc_rip != 0. + // If r->sc_rip == 0, probably panicked because of a // call to a nil func. Not pushing that onto sp will // make the trace look like a call to runtime·sigpanic instead. // (Otherwise the trace will end at runtime·sigpanic and we @@ -102,6 +102,10 @@ Throw: runtime·printf("%s\n", runtime·sigtab[sig].name); runtime·printf("PC=%X\n", r->sc_rip); + if(m->lockedg != nil && m->ncgo > 0 && gp == m->g0) { + runtime·printf("signal arrived during cgo execution\n"); + gp = m->lockedg; + } runtime·printf("\n"); if(runtime·gotraceback()){ @@ -118,9 +122,11 @@ runtime·signalstack(byte *p, int32 n) { Sigaltstack st; - st.ss_sp = (int8*)p; + st.ss_sp = p; st.ss_size = n; st.ss_flags = 0; + if(p == nil) + st.ss_flags = SS_DISABLE; runtime·sigaltstack(&st, nil); } @@ -129,12 +135,21 @@ runtime·setsig(int32 i, void (*fn)(int32, Siginfo*, void*, G*), bool restart) { Sigaction sa; + // If SIGHUP handler is SIG_IGN, assume running + // under nohup and do not set explicit handler. 
+ if(i == SIGHUP) { + runtime·memclr((byte*)&sa, sizeof sa); + runtime·sigaction(i, nil, &sa); + if(sa.__sigaction_u.__sa_sigaction == SIG_IGN) + return; + } + runtime·memclr((byte*)&sa, sizeof sa); sa.sa_flags = SA_SIGINFO|SA_ONSTACK; if(restart) sa.sa_flags |= SA_RESTART; - sa.sa_mask = ~0ULL; - if (fn == runtime·sighandler) + sa.sa_mask = ~0U; + if(fn == runtime·sighandler) fn = (void*)runtime·sigtramp; sa.__sigaction_u.__sa_sigaction = (void*)fn; runtime·sigaction(i, &sa, nil); diff --git a/src/pkg/runtime/signal_plan9_386.c b/src/pkg/runtime/signal_plan9_386.c index d26688516..17bc11749 100644 --- a/src/pkg/runtime/signal_plan9_386.c +++ b/src/pkg/runtime/signal_plan9_386.c @@ -3,6 +3,107 @@ // license that can be found in the LICENSE file. #include "runtime.h" +#include "defs_GOOS_GOARCH.h" +#include "os_GOOS.h" +#include "signals_GOOS.h" + +void +runtime·dumpregs(Ureg *u) +{ + runtime·printf("ax %X\n", u->ax); + runtime·printf("bx %X\n", u->bx); + runtime·printf("cx %X\n", u->cx); + runtime·printf("dx %X\n", u->dx); + runtime·printf("di %X\n", u->di); + runtime·printf("si %X\n", u->si); + runtime·printf("bp %X\n", u->bp); + runtime·printf("sp %X\n", u->sp); + runtime·printf("pc %X\n", u->pc); + runtime·printf("flags %X\n", u->flags); + runtime·printf("cs %X\n", u->cs); + runtime·printf("fs %X\n", u->fs); + runtime·printf("gs %X\n", u->gs); +} + +int32 +runtime·sighandler(void *v, int8 *s, G *gp) +{ + Ureg *ureg; + uintptr *sp; + SigTab *sig, *nsig; + int32 len, i; + + if(!s) + return NCONT; + + len = runtime·findnull((byte*)s); + if(len <= 4 || runtime·mcmp((byte*)s, (byte*)"sys:", 4) != 0) + return NDFLT; + + nsig = nil; + sig = runtime·sigtab; + for(i=0; i < NSIG; i++) { + if(runtime·strstr((byte*)s, (byte*)sig->name)) { + nsig = sig; + break; + } + sig++; + } + + if(nsig == nil) + return NDFLT; + + ureg = v; + if(nsig->flags & SigPanic) { + if(gp == nil || m->notesig == 0) + goto Throw; + + // Save error string from sigtramp's stack, + // into gsignal->sigcode0, so we can reliably + // access it from the panic routines. + if(len > ERRMAX) + len = ERRMAX; + runtime·memmove((void*)m->notesig, (void*)s, len); + + gp->sig = i; + gp->sigpc = ureg->pc; + + // Only push runtime·sigpanic if ureg->pc != 0. + // If ureg->pc == 0, probably panicked because of a + // call to a nil func. Not pushing that onto sp will + // make the trace look like a call to runtime·sigpanic instead. + // (Otherwise the trace will end at runtime·sigpanic and we + // won't get to see who faulted.) + if(ureg->pc != 0) { + sp = (uintptr*)ureg->sp; + *--sp = ureg->pc; + ureg->sp = (uint32)sp; + } + ureg->pc = (uintptr)runtime·sigpanic; + return NCONT; + } + + if(!(nsig->flags & SigThrow)) + return NDFLT; + +Throw: + runtime·startpanic(); + + runtime·printf("%s\n", s); + runtime·printf("PC=%X\n", ureg->pc); + runtime·printf("\n"); + + if(runtime·gotraceback()) { + runtime·traceback((void*)ureg->pc, (void*)ureg->sp, 0, gp); + runtime·tracebackothers(gp); + runtime·dumpregs(ureg); + } + runtime·goexitsall(""); + runtime·exits(s); + + return 0; +} + void runtime·sigenable(uint32 sig) diff --git a/src/pkg/runtime/signal_plan9_amd64.c b/src/pkg/runtime/signal_plan9_amd64.c new file mode 100644 index 000000000..e4f946abc --- /dev/null +++ b/src/pkg/runtime/signal_plan9_amd64.c @@ -0,0 +1,127 @@ +// Copyright 2010 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. 
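Returning to the setsig hunks above for a moment: they probe the inherited SIGHUP disposition first and install no handler when it is SIG_IGN, so a process started under nohup keeps ignoring hangups. The same check is visible from user code in much later Go releases via os/signal's Ignored; the sketch below is purely illustrative and anachronistic relative to this patch:

package main

import (
	"fmt"
	"os/signal"
	"syscall"
)

func main() {
	// True when SIGHUP was inherited as SIG_IGN, e.g. under nohup;
	// per the hunks above, the runtime then leaves the disposition alone.
	fmt.Println("SIGHUP ignored:", signal.Ignored(syscall.SIGHUP))
}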
+ +#include "runtime.h" +#include "defs_GOOS_GOARCH.h" +#include "os_GOOS.h" +#include "signals_GOOS.h" + +void +runtime·dumpregs(Ureg *u) +{ + runtime·printf("ax %X\n", u->ax); + runtime·printf("bx %X\n", u->bx); + runtime·printf("cx %X\n", u->cx); + runtime·printf("dx %X\n", u->dx); + runtime·printf("di %X\n", u->di); + runtime·printf("si %X\n", u->si); + runtime·printf("bp %X\n", u->bp); + runtime·printf("sp %X\n", u->sp); + runtime·printf("r8 %X\n", u->r8); + runtime·printf("r9 %X\n", u->r9); + runtime·printf("r10 %X\n", u->r10); + runtime·printf("r11 %X\n", u->r11); + runtime·printf("r12 %X\n", u->r12); + runtime·printf("r13 %X\n", u->r13); + runtime·printf("r14 %X\n", u->r14); + runtime·printf("r15 %X\n", u->r15); + runtime·printf("ip %X\n", u->ip); + runtime·printf("flags %X\n", u->flags); + runtime·printf("cs %X\n", (uint64)u->cs); + runtime·printf("fs %X\n", (uint64)u->fs); + runtime·printf("gs %X\n", (uint64)u->gs); +} + +int32 +runtime·sighandler(void *v, int8 *s, G *gp) +{ + Ureg *ureg; + uintptr *sp; + SigTab *sig, *nsig; + int32 len, i; + + if(!s) + return NCONT; + + len = runtime·findnull((byte*)s); + if(len <= 4 || runtime·mcmp((byte*)s, (byte*)"sys:", 4) != 0) + return NDFLT; + + nsig = nil; + sig = runtime·sigtab; + for(i=0; i < NSIG; i++) { + if(runtime·strstr((byte*)s, (byte*)sig->name)) { + nsig = sig; + break; + } + sig++; + } + + if(nsig == nil) + return NDFLT; + + ureg = v; + if(nsig->flags & SigPanic) { + if(gp == nil || m->notesig == 0) + goto Throw; + + // Save error string from sigtramp's stack, + // into gsignal->sigcode0, so we can reliably + // access it from the panic routines. + if(len > ERRMAX) + len = ERRMAX; + runtime·memmove((void*)m->notesig, (void*)s, len); + + gp->sig = i; + gp->sigpc = ureg->ip; + + // Only push runtime·sigpanic if ureg->ip != 0. + // If ureg->ip == 0, probably panicked because of a + // call to a nil func. Not pushing that onto sp will + // make the trace look like a call to runtime·sigpanic instead. + // (Otherwise the trace will end at runtime·sigpanic and we + // won't get to see who faulted.) + if(ureg->ip != 0) { + sp = (uintptr*)ureg->sp; + *--sp = ureg->ip; + ureg->sp = (uint64)sp; + } + ureg->ip = (uintptr)runtime·sigpanic; + return NCONT; + } + + if(!(nsig->flags & SigThrow)) + return NDFLT; + +Throw: + runtime·startpanic(); + + runtime·printf("%s\n", s); + runtime·printf("PC=%X\n", ureg->ip); + runtime·printf("\n"); + + if(runtime·gotraceback()) { + runtime·traceback((void*)ureg->ip, (void*)ureg->sp, 0, gp); + runtime·tracebackothers(gp); + runtime·dumpregs(ureg); + } + runtime·goexitsall(""); + runtime·exits(s); + + return 0; +} + +void +runtime·sigenable(uint32 sig) +{ + USED(sig); +} + +void +runtime·resetcpuprofiler(int32 hz) +{ + // TODO: Enable profiling interrupts. 
+ + m->profilehz = hz; +} diff --git a/src/pkg/runtime/signal_windows_386.c b/src/pkg/runtime/signal_windows_386.c index a248374db..d76d5bf4b 100644 --- a/src/pkg/runtime/signal_windows_386.c +++ b/src/pkg/runtime/signal_windows_386.c @@ -68,11 +68,15 @@ runtime·sighandler(ExceptionRecord *info, Context *r, G *gp) info->ExceptionInformation[0], info->ExceptionInformation[1]); runtime·printf("PC=%x\n", r->Eip); + if(m->lockedg != nil && m->ncgo > 0 && gp == m->g0) { + runtime·printf("signal arrived during cgo execution\n"); + gp = m->lockedg; + } runtime·printf("\n"); if(runtime·gotraceback()){ - runtime·traceback((void*)r->Eip, (void*)r->Esp, 0, m->curg); - runtime·tracebackothers(m->curg); + runtime·traceback((void*)r->Eip, (void*)r->Esp, 0, gp); + runtime·tracebackothers(gp); runtime·dumpregs(r); } diff --git a/src/pkg/runtime/signal_windows_amd64.c b/src/pkg/runtime/signal_windows_amd64.c index 1cdf1cac4..3729aa57b 100644 --- a/src/pkg/runtime/signal_windows_amd64.c +++ b/src/pkg/runtime/signal_windows_amd64.c @@ -75,6 +75,10 @@ runtime·sighandler(ExceptionRecord *info, Context *r, G *gp) info->ExceptionInformation[0], info->ExceptionInformation[1]); runtime·printf("PC=%X\n", r->Rip); + if(m->lockedg != nil && m->ncgo > 0 && gp == m->g0) { + runtime·printf("signal arrived during cgo execution\n"); + gp = m->lockedg; + } runtime·printf("\n"); if(runtime·gotraceback()){ diff --git a/src/pkg/runtime/signals_linux.h b/src/pkg/runtime/signals_linux.h index 345a6c5d1..9c3567007 100644 --- a/src/pkg/runtime/signals_linux.h +++ b/src/pkg/runtime/signals_linux.h @@ -42,7 +42,7 @@ SigTab runtime·sigtab[] = { /* 30 */ N, "SIGPWR: power failure restart", /* 31 */ N, "SIGSYS: bad system call", /* 32 */ N, "signal 32", - /* 33 */ N, "signal 33", + /* 33 */ 0, "signal 33", /* SIGSETXID; see issue 3871 */ /* 34 */ N, "signal 34", /* 35 */ N, "signal 35", /* 36 */ N, "signal 36", diff --git a/src/pkg/runtime/signals_netbsd.h b/src/pkg/runtime/signals_netbsd.h index 4d27e050d..7140de86f 100644 --- a/src/pkg/runtime/signals_netbsd.h +++ b/src/pkg/runtime/signals_netbsd.h @@ -9,16 +9,16 @@ #define D SigDefault SigTab runtime·sigtab[] = { - /* 0 */ 0, "SIGNONE: no trap", - /* 1 */ N+K, "SIGHUP: terminal line hangup", - /* 2 */ N+K, "SIGINT: interrupt", - /* 3 */ N+T, "SIGQUIT: quit", - /* 4 */ T, "SIGILL: illegal instruction", - /* 5 */ T, "SIGTRAP: trace trap", - /* 6 */ N+T, "SIGABRT: abort", - /* 7 */ T, "SIGEMT: emulate instruction executed", - /* 8 */ P, "SIGFPE: floating-point exception", - /* 9 */ 0, "SIGKILL: kill", + /* 0 */ 0, "SIGNONE: no trap", + /* 1 */ N+K, "SIGHUP: terminal line hangup", + /* 2 */ N+K, "SIGINT: interrupt", + /* 3 */ N+T, "SIGQUIT: quit", + /* 4 */ T, "SIGILL: illegal instruction", + /* 5 */ T, "SIGTRAP: trace trap", + /* 6 */ N+T, "SIGABRT: abort", + /* 7 */ T, "SIGEMT: emulate instruction executed", + /* 8 */ P, "SIGFPE: floating-point exception", + /* 9 */ 0, "SIGKILL: kill", /* 10 */ P, "SIGBUS: bus error", /* 11 */ P, "SIGSEGV: segmentation violation", /* 12 */ T, "SIGSYS: bad system call", diff --git a/src/pkg/runtime/signals_openbsd.h b/src/pkg/runtime/signals_openbsd.h index 4d27e050d..7140de86f 100644 --- a/src/pkg/runtime/signals_openbsd.h +++ b/src/pkg/runtime/signals_openbsd.h @@ -9,16 +9,16 @@ #define D SigDefault SigTab runtime·sigtab[] = { - /* 0 */ 0, "SIGNONE: no trap", - /* 1 */ N+K, "SIGHUP: terminal line hangup", - /* 2 */ N+K, "SIGINT: interrupt", - /* 3 */ N+T, "SIGQUIT: quit", - /* 4 */ T, "SIGILL: illegal instruction", - /* 5 */ T, "SIGTRAP: trace 
trap", - /* 6 */ N+T, "SIGABRT: abort", - /* 7 */ T, "SIGEMT: emulate instruction executed", - /* 8 */ P, "SIGFPE: floating-point exception", - /* 9 */ 0, "SIGKILL: kill", + /* 0 */ 0, "SIGNONE: no trap", + /* 1 */ N+K, "SIGHUP: terminal line hangup", + /* 2 */ N+K, "SIGINT: interrupt", + /* 3 */ N+T, "SIGQUIT: quit", + /* 4 */ T, "SIGILL: illegal instruction", + /* 5 */ T, "SIGTRAP: trace trap", + /* 6 */ N+T, "SIGABRT: abort", + /* 7 */ T, "SIGEMT: emulate instruction executed", + /* 8 */ P, "SIGFPE: floating-point exception", + /* 9 */ 0, "SIGKILL: kill", /* 10 */ P, "SIGBUS: bus error", /* 11 */ P, "SIGSEGV: segmentation violation", /* 12 */ T, "SIGSYS: bad system call", diff --git a/src/pkg/runtime/signals_plan9.h b/src/pkg/runtime/signals_plan9.h index 5df757613..0f1165e2a 100644 --- a/src/pkg/runtime/signals_plan9.h +++ b/src/pkg/runtime/signals_plan9.h @@ -1 +1,24 @@ -// nothing to see here +#define N SigNotify +#define T SigThrow +#define P SigPanic + +SigTab runtime·sigtab[] = { + P, "sys: fp:", + + // Go libraries expect to be able + // to recover from memory + // read/write errors, so we flag + // those as panics. All other traps + // are generally more serious and + // should immediately throw an + // exception. + P, "sys: trap: fault read addr", + P, "sys: trap: fault write addr", + T, "sys: trap:", + + N, "sys: bad sys call", +}; + +#undef N +#undef T +#undef P diff --git a/src/pkg/runtime/sigqueue.goc b/src/pkg/runtime/sigqueue.goc index b49fdba86..ab5f312e4 100644 --- a/src/pkg/runtime/sigqueue.goc +++ b/src/pkg/runtime/sigqueue.goc @@ -5,36 +5,24 @@ // This file implements runtime support for signal handling. // // Most synchronization primitives are not available from -// the signal handler (it cannot block and cannot use locks) +// the signal handler (it cannot block, allocate memory, or use locks) // so the handler communicates with a processing goroutine // via struct sig, below. // -// Ownership for sig.Note passes back and forth between -// the signal handler and the signal goroutine in rounds. -// The initial state is that sig.note is cleared (setup by signal_enable). -// At the beginning of each round, mask == 0. -// The round goes through three stages: -// -// (In parallel) -// 1a) One or more signals arrive and are handled -// by sigsend using cas to set bits in sig.mask. -// The handler that changes sig.mask from zero to non-zero -// calls notewakeup(&sig). -// 1b) Sigrecv calls notesleep(&sig) to wait for the wakeup. -// -// 2) Having received the wakeup, sigrecv knows that sigsend -// will not send another wakeup, so it can noteclear(&sig) -// to prepare for the next round. (Sigsend may still be adding -// signals to sig.mask at this point, which is fine.) -// -// 3) Sigrecv uses cas to grab the current sig.mask and zero it, -// triggering the next round. -// -// The signal handler takes ownership of the note by atomically -// changing mask from a zero to non-zero value. It gives up -// ownership by calling notewakeup. The signal goroutine takes -// ownership by returning from notesleep (caused by the notewakeup) -// and gives up ownership by clearing mask. +// sigsend() is called by the signal handler to queue a new signal. +// signal_recv() is called by the Go program to receive a newly queued signal. +// Synchronization between sigsend() and signal_recv() is based on the sig.state +// variable. It can be in 3 states: 0, HASWAITER and HASSIGNAL. +// HASWAITER means that signal_recv() is blocked on sig.Note and there are no +// new pending signals. 
+// HASSIGNAL means that sig.mask *may* contain new pending signals, +// signal_recv() can't be blocked in this state. +// 0 means that there are no new pending signals and signal_recv() is not blocked. +// Transitions between states are done atomically with CAS. +// When signal_recv() is unblocked, it resets sig.Note and rechecks sig.mask. +// If several sigsend()'s and signal_recv() execute concurrently, it can lead to +// unnecessary rechecks of sig.mask, but must not lead to missed signals +// nor deadlocks. package runtime #include "runtime.h" @@ -45,15 +33,20 @@ static struct { Note; uint32 mask[(NSIG+31)/32]; uint32 wanted[(NSIG+31)/32]; - uint32 kick; + uint32 state; bool inuse; } sig; +enum { + HASWAITER = 1, + HASSIGNAL = 2, +}; + // Called from sighandler to send a signal back out of the signal handling thread. bool runtime·sigsend(int32 s) { - uint32 bit, mask; + uint32 bit, mask, old, new; if(!sig.inuse || s < 0 || s >= 32*nelem(sig.wanted) || !(sig.wanted[s/32]&(1U<<(s&31)))) return false; @@ -65,8 +58,20 @@ runtime·sigsend(int32 s) if(runtime·cas(&sig.mask[s/32], mask, mask|bit)) { // Added to queue. // Only send a wakeup if the receiver needs a kick. - if(runtime·cas(&sig.kick, 1, 0)) - runtime·notewakeup(&sig); + for(;;) { + old = runtime·atomicload(&sig.state); + if(old == HASSIGNAL) + break; + if(old == HASWAITER) + new = 0; + else // if(old == 0) + new = HASSIGNAL; + if(runtime·cas(&sig.state, old, new)) { + if (old == HASWAITER) + runtime·notewakeup(&sig); + break; + } + } break; } } @@ -77,7 +82,7 @@ runtime·sigsend(int32 s) // Must only be called from a single goroutine at a time. func signal_recv() (m uint32) { static uint32 recv[nelem(sig.mask)]; - int32 i, more; + uint32 i, old, new; for(;;) { // Serve from local copy if there are bits left. @@ -89,15 +94,27 @@ func signal_recv() (m uint32) { } } - // Get a new local copy. - // Ask for a kick if more signals come in - // during or after our check (before the sleep). - if(sig.kick == 0) { - runtime·noteclear(&sig); - runtime·cas(&sig.kick, 0, 1); + // Check and update sig.state. + for(;;) { + old = runtime·atomicload(&sig.state); + if(old == HASWAITER) + runtime·throw("inconsistent state in signal_recv"); + if(old == HASSIGNAL) + new = 0; + else // if(old == 0) + new = HASWAITER; + if(runtime·cas(&sig.state, old, new)) { + if (new == HASWAITER) { + runtime·entersyscallblock(); + runtime·notesleep(&sig); + runtime·exitsyscall(); + runtime·noteclear(&sig); + } + break; + } } - more = 0; + // Get a new local copy. for(i=0; i<nelem(sig.mask); i++) { for(;;) { m = sig.mask[i]; @@ -105,16 +122,7 @@ func signal_recv() (m uint32) { break; } recv[i] = m; - if(m != 0) - more = 1; } - if(more) - continue; - - // Sleep waiting for more. 
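To make the three-state protocol above concrete, here is a rough user-space model of it (my sketch, not runtime code: a buffered channel stands in for the runtime Note, and signal_recv's HASWAITER consistency throw is omitted):

package main

import (
	"fmt"
	"sync/atomic"
)

const (
	idle      uint32 = 0
	hasWaiter uint32 = 1
	hasSignal uint32 = 2
)

var (
	state  uint32
	mask   uint32                   // stands in for sig.mask
	wakeup = make(chan struct{}, 1) // stands in for the Note
)

// send models sigsend: publish a bit, then kick the receiver only if
// it is parked; if the state already says HASSIGNAL nothing more is needed.
func send(bit uint32) {
	for {
		old := atomic.LoadUint32(&mask)
		if atomic.CompareAndSwapUint32(&mask, old, old|bit) {
			break
		}
	}
	for {
		switch old := atomic.LoadUint32(&state); old {
		case hasSignal:
			return
		case hasWaiter:
			if atomic.CompareAndSwapUint32(&state, old, idle) {
				wakeup <- struct{}{} // notewakeup
				return
			}
		default: // idle
			if atomic.CompareAndSwapUint32(&state, old, hasSignal) {
				return
			}
		}
	}
}

// recv models signal_recv: drain the mask, or park as HASWAITER and block.
// A concurrent send either clears our waiter state and wakes us, or has
// already set HASSIGNAL, which makes us recheck the mask immediately.
func recv() uint32 {
	for {
		if m := atomic.SwapUint32(&mask, 0); m != 0 {
			return m
		}
		for {
			old := atomic.LoadUint32(&state)
			new := hasWaiter
			if old == hasSignal {
				new = idle
			}
			if atomic.CompareAndSwapUint32(&state, old, new) {
				if new == hasWaiter {
					<-wakeup // notesleep
				}
				break
			}
		}
	}
}

func main() {
	go send(1 << 3)
	fmt.Printf("received mask %#x\n", recv())
}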
- runtime·entersyscall(); - runtime·notesleep(&sig); - runtime·exitsyscall(); } done:; diff --git a/src/pkg/runtime/slice.c b/src/pkg/runtime/slice.c index e5726c93b..354c54c86 100644 --- a/src/pkg/runtime/slice.c +++ b/src/pkg/runtime/slice.c @@ -5,22 +5,30 @@ #include "runtime.h" #include "arch_GOARCH.h" #include "type.h" +#include "typekind.h" #include "malloc.h" +#include "race.h" -static int32 debug = 0; +static bool debug = 0; -static void makeslice1(SliceType*, int32, int32, Slice*); -static void growslice1(SliceType*, Slice, int32, Slice *); - void runtime·copy(Slice to, Slice fm, uintptr width, int32 ret); +static void makeslice1(SliceType*, intgo, intgo, Slice*); +static void growslice1(SliceType*, Slice, intgo, Slice *); + void runtime·copy(Slice to, Slice fm, uintptr width, intgo ret); // see also unsafe·NewArray // makeslice(typ *Type, len, cap int64) (ary []any); void runtime·makeslice(SliceType *t, int64 len, int64 cap, Slice ret) { - if(len < 0 || (int32)len != len) + // NOTE: The len > MaxMem/elemsize check here is not strictly necessary, + // but it produces a 'len out of range' error instead of a 'cap out of range' error + // when someone does make([]T, bignumber). 'cap out of range' is true too, + // but since the cap is only being supplied implicitly, saying len is clearer. + // See issue 4085. + if(len < 0 || (intgo)len != len || t->elem->size > 0 && len > MaxMem / t->elem->size) runtime·panicstring("makeslice: len out of range"); - if(cap < len || (int32)cap != cap || t->elem->size > 0 && cap > ((uintptr)-1) / t->elem->size) + + if(cap < len || (intgo)cap != cap || t->elem->size > 0 && cap > MaxMem / t->elem->size) runtime·panicstring("makeslice: cap out of range"); makeslice1(t, len, cap, &ret); @@ -35,10 +43,10 @@ runtime·makeslice(SliceType *t, int64 len, int64 cap, Slice ret) // Dummy word to use as base pointer for make([]T, 0). // Since you cannot take the address of such a slice, // you can't tell that they all have the same base pointer. -static uintptr zerobase; +uintptr runtime·zerobase; static void -makeslice1(SliceType *t, int32 len, int32 cap, Slice *ret) +makeslice1(SliceType *t, intgo len, intgo cap, Slice *ret) { uintptr size; @@ -47,22 +55,34 @@ makeslice1(SliceType *t, int32 len, int32 cap, Slice *ret) ret->len = len; ret->cap = cap; - if(cap == 0) - ret->array = (byte*)&zerobase; + if(size == 0) + ret->array = (byte*)&runtime·zerobase; else if((t->elem->kind&KindNoPointers)) ret->array = runtime·mallocgc(size, FlagNoPointers, 1, 1); - else - ret->array = runtime·mal(size); + else { + ret->array = runtime·mallocgc(size, 0, 1, 1); + + if(UseSpanType) { + if(false) { + runtime·printf("new slice [%D]%S: %p\n", (int64)cap, *t->elem->string, ret->array); + } + runtime·settype(ret->array, (uintptr)t->elem | TypeInfo_Array); + } + } } // appendslice(type *Type, x, y, []T) []T +#pragma textflag 7 void runtime·appendslice(SliceType *t, Slice x, Slice y, Slice ret) { - int32 m; + intgo m; uintptr w; + void *pc; + uint8 *p, *q; m = x.len+y.len; + w = t->elem->size; if(m < x.len) runtime·throw("append: slice overflow"); @@ -72,30 +92,83 @@ runtime·appendslice(SliceType *t, Slice x, Slice y, Slice ret) else ret = x; - w = t->elem->size; - runtime·memmove(ret.array + ret.len*w, y.array, y.len*w); + if(raceenabled) { + // Don't mark read/writes on the newly allocated slice. 
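The effect of the reworked makeslice checks is visible from ordinary Go code, and per the comment above an implicit cap is reported as a length error (issue 4085). A small demonstration, assuming a 64-bit build so the count overflows MaxMem rather than int:

package main

import "fmt"

func main() {
	defer func() { fmt.Println(recover()) }()
	n := int64(1) << 62
	_ = make([]byte, n) // only len is supplied, so this panics "makeslice: len out of range"
}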
+ pc = runtime·getcallerpc(&t); + // read x[:len] + if(m > x.cap) + runtime·racereadrangepc(x.array, x.len*w, w, pc, runtime·appendslice); + // read y + runtime·racereadrangepc(y.array, y.len*w, w, pc, runtime·appendslice); + // write x[len(x):len(x)+len(y)] + if(m <= x.cap) + runtime·racewriterangepc(ret.array+ret.len*w, y.len*w, w, pc, runtime·appendslice); + } + + // A very common case is appending bytes. Small appends can avoid the overhead of memmove. + // We can generalize a bit here, and just pick small-sized appends. + p = ret.array+ret.len*w; + q = y.array; + w *= y.len; + if(w <= appendCrossover) { + if(p <= q || w <= p-q) // No overlap. + while(w-- > 0) + *p++ = *q++; + else { + p += w; + q += w; + while(w-- > 0) + *--p = *--q; + } + } else { + runtime·memmove(p, q, w); + } ret.len += y.len; FLUSH(&ret); } // appendstr([]byte, string) []byte +#pragma textflag 7 void runtime·appendstr(SliceType *t, Slice x, String y, Slice ret) { - int32 m; + intgo m; + void *pc; + uintptr w; + uint8 *p, *q; m = x.len+y.len; if(m < x.len) - runtime·throw("append: slice overflow"); + runtime·throw("append: string overflow"); if(m > x.cap) growslice1(t, x, m, &ret); else ret = x; - runtime·memmove(ret.array + ret.len, y.str, y.len); + if(raceenabled) { + // Don't mark read/writes on the newly allocated slice. + pc = runtime·getcallerpc(&t); + // read x[:len] + if(m > x.cap) + runtime·racereadrangepc(x.array, x.len, 1, pc, runtime·appendstr); + // write x[len(x):len(x)+len(y)] + if(m <= x.cap) + runtime·racewriterangepc(ret.array+ret.len, y.len, 1, pc, runtime·appendstr); + } + + // Small appends can avoid the overhead of memmove. + w = y.len; + p = ret.array+ret.len; + q = y.str; + if(w <= appendCrossover) { + while(w-- > 0) + *p++ = *q++; + } else { + runtime·memmove(p, q, w); + } ret.len += y.len; FLUSH(&ret); } @@ -106,15 +179,21 @@ void runtime·growslice(SliceType *t, Slice old, int64 n, Slice ret) { int64 cap; + void *pc; if(n < 1) runtime·panicstring("growslice: invalid n"); cap = old.cap + n; - if((int32)cap != cap || cap > ((uintptr)-1) / t->elem->size) + if((intgo)cap != cap || cap < old.cap || (t->elem->size > 0 && cap > MaxMem/t->elem->size)) runtime·panicstring("growslice: cap out of range"); + if(raceenabled) { + pc = runtime·getcallerpc(&t); + runtime·racereadrangepc(old.array, old.len*t->elem->size, t->elem->size, pc, runtime·growslice); + } + growslice1(t, old, cap, &ret); FLUSH(&ret); @@ -128,12 +207,17 @@ runtime·growslice(SliceType *t, Slice old, int64 n, Slice ret) } static void -growslice1(SliceType *t, Slice x, int32 newcap, Slice *ret) +growslice1(SliceType *t, Slice x, intgo newcap, Slice *ret) { - int32 m; + intgo m; m = x.cap; - if(m == 0) + + // Using newcap directly for m+m < newcap handles + // both the case where m == 0 and also the case where + // m+m/4 wraps around, in which case the loop + // below might never terminate. 
+ if(m+m < newcap) m = newcap; else { do { @@ -147,153 +231,13 @@ growslice1(SliceType *t, Slice x, int32 newcap, Slice *ret) runtime·memmove(ret->array, x.array, ret->len * t->elem->size); } -// sliceslice(old []any, lb uint64, hb uint64, width uint64) (ary []any); -void -runtime·sliceslice(Slice old, uint64 lb, uint64 hb, uint64 width, Slice ret) -{ - if(hb > old.cap || lb > hb) { - if(debug) { - runtime·prints("runtime.sliceslice: old="); - runtime·printslice(old); - runtime·prints("; lb="); - runtime·printint(lb); - runtime·prints("; hb="); - runtime·printint(hb); - runtime·prints("; width="); - runtime·printint(width); - runtime·prints("\n"); - - runtime·prints("oldarray: nel="); - runtime·printint(old.len); - runtime·prints("; cap="); - runtime·printint(old.cap); - runtime·prints("\n"); - } - runtime·panicslice(); - } - - // new array is inside old array - ret.len = hb - lb; - ret.cap = old.cap - lb; - ret.array = old.array + lb*width; - - FLUSH(&ret); - - if(debug) { - runtime·prints("runtime.sliceslice: old="); - runtime·printslice(old); - runtime·prints("; lb="); - runtime·printint(lb); - runtime·prints("; hb="); - runtime·printint(hb); - runtime·prints("; width="); - runtime·printint(width); - runtime·prints("; ret="); - runtime·printslice(ret); - runtime·prints("\n"); - } -} - -// sliceslice1(old []any, lb uint64, width uint64) (ary []any); -void -runtime·sliceslice1(Slice old, uint64 lb, uint64 width, Slice ret) -{ - if(lb > old.len) { - if(debug) { - runtime·prints("runtime.sliceslice: old="); - runtime·printslice(old); - runtime·prints("; lb="); - runtime·printint(lb); - runtime·prints("; width="); - runtime·printint(width); - runtime·prints("\n"); - - runtime·prints("oldarray: nel="); - runtime·printint(old.len); - runtime·prints("; cap="); - runtime·printint(old.cap); - runtime·prints("\n"); - } - runtime·panicslice(); - } - - // new array is inside old array - ret.len = old.len - lb; - ret.cap = old.cap - lb; - ret.array = old.array + lb*width; - - FLUSH(&ret); - - if(debug) { - runtime·prints("runtime.sliceslice: old="); - runtime·printslice(old); - runtime·prints("; lb="); - runtime·printint(lb); - runtime·prints("; width="); - runtime·printint(width); - runtime·prints("; ret="); - runtime·printslice(ret); - runtime·prints("\n"); - } -} - -// slicearray(old *any, nel uint64, lb uint64, hb uint64, width uint64) (ary []any); +// copy(to any, fr any, wid uintptr) int +#pragma textflag 7 void -runtime·slicearray(byte* old, uint64 nel, uint64 lb, uint64 hb, uint64 width, Slice ret) +runtime·copy(Slice to, Slice fm, uintptr width, intgo ret) { - if(nel > 0 && old == nil) { - // crash if old == nil. - // could give a better message - // but this is consistent with all the in-line checks - // that the compiler inserts for other uses. 
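Restated in Go for clarity (my paraphrase of growslice1, not runtime code): jump straight to the requested capacity whenever one doubling cannot reach it, which in a single test also covers the m == 0 start and the m+m/4 wraparound the comment mentions; otherwise double small slices and grow large ones by a quarter.

package main

import "fmt"

// nextcap mirrors growslice1's capacity rule.
func nextcap(oldlen, oldcap, newcap int) int {
	m := oldcap
	if m+m < newcap {
		return newcap // one test covers m == 0 and m+m/4 wraparound
	}
	for m < newcap {
		if oldlen < 1024 {
			m += m // double small slices
		} else {
			m += m / 4 // grow large slices by 25%
		}
	}
	return m
}

func main() {
	fmt.Println(nextcap(0, 0, 5))          // 5: straight to the request
	fmt.Println(nextcap(512, 512, 513))    // 1024: doubled
	fmt.Println(nextcap(2048, 2048, 2049)) // 2560: grown by a quarter
}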
- *old = 0; - } - - if(hb > nel || lb > hb) { - if(debug) { - runtime·prints("runtime.slicearray: old="); - runtime·printpointer(old); - runtime·prints("; nel="); - runtime·printint(nel); - runtime·prints("; lb="); - runtime·printint(lb); - runtime·prints("; hb="); - runtime·printint(hb); - runtime·prints("; width="); - runtime·printint(width); - runtime·prints("\n"); - } - runtime·panicslice(); - } - - // new array is inside old array - ret.len = hb-lb; - ret.cap = nel-lb; - ret.array = old + lb*width; - - FLUSH(&ret); - - if(debug) { - runtime·prints("runtime.slicearray: old="); - runtime·printpointer(old); - runtime·prints("; nel="); - runtime·printint(nel); - runtime·prints("; lb="); - runtime·printint(lb); - runtime·prints("; hb="); - runtime·printint(hb); - runtime·prints("; width="); - runtime·printint(width); - runtime·prints("; ret="); - runtime·printslice(ret); - runtime·prints("\n"); - } -} + void *pc; -// copy(to any, fr any, wid uint32) int -void -runtime·copy(Slice to, Slice fm, uintptr width, int32 ret) -{ if(fm.len == 0 || to.len == 0 || width == 0) { ret = 0; goto out; @@ -303,6 +247,12 @@ runtime·copy(Slice to, Slice fm, uintptr width, int32 ret) if(to.len < ret) ret = to.len; + if(raceenabled) { + pc = runtime·getcallerpc(&to); + runtime·racewriterangepc(to.array, ret*width, width, pc, runtime·copy); + runtime·racereadrangepc(fm.array, ret*width, width, pc, runtime·copy); + } + if(ret == 1 && width == 1) { // common case worth about 2x to do here *to.array = *fm.array; // known to be a byte pointer } else { @@ -325,9 +275,12 @@ out: } } +#pragma textflag 7 void -runtime·slicestringcopy(Slice to, String fm, int32 ret) +runtime·slicestringcopy(Slice to, String fm, intgo ret) { + void *pc; + if(fm.len == 0 || to.len == 0) { ret = 0; goto out; @@ -337,6 +290,11 @@ runtime·slicestringcopy(Slice to, String fm, int32 ret) if(to.len < ret) ret = to.len; + if(raceenabled) { + pc = runtime·getcallerpc(&to); + runtime·racewriterangepc(to.array, ret, 1, pc, runtime·slicestringcopy); + } + runtime·memmove(to.array, fm.str, ret); out: diff --git a/src/pkg/runtime/softfloat_arm.c b/src/pkg/runtime/softfloat_arm.c index bd73cb15b..9a5440630 100644 --- a/src/pkg/runtime/softfloat_arm.c +++ b/src/pkg/runtime/softfloat_arm.c @@ -420,6 +420,23 @@ stage3: // regd, regm are 4bit variables regd, regm, m->freghi[regd], m->freglo[regd]); break; + case 0xeeb00bc0: // D[regd] = abs D[regm] + m->freglo[regd] = m->freglo[regm]; + m->freghi[regd] = m->freghi[regm] & ((1<<31)-1); + + if(trace) + runtime·printf("*** D[%d] = abs D[%d] %x-%x\n", + regd, regm, m->freghi[regd], m->freglo[regd]); + break; + + case 0xeeb00ac0: // F[regd] = abs F[regm] + m->freglo[regd] = m->freglo[regm] & ((1<<31)-1); + + if(trace) + runtime·printf("*** F[%d] = abs F[%d] %x\n", + regd, regm, m->freglo[regd]); + break; + case 0xeeb40bc0: // D[regd] :: D[regm] (CMPD) runtime·fcmp64c(getd(regd), getd(regm), &cmp, &nan); m->fflag = fstatus(nan, cmp); diff --git a/src/pkg/runtime/stack.c b/src/pkg/runtime/stack.c new file mode 100644 index 000000000..e9a35672f --- /dev/null +++ b/src/pkg/runtime/stack.c @@ -0,0 +1,282 @@ +// Copyright 2013 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. 
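A step back to the slice.c hunks above: runtime·copy clamps the count to the shorter of the two slices, and width-1 single-element copies take the byte fast path. In user terms:

package main

import "fmt"

func main() {
	dst := make([]byte, 3)
	src := []byte("hello")
	n := copy(dst, src)         // clamped to the shorter slice, exactly as above
	fmt.Println(n, string(dst)) // 3 hel
}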
+ +#include "runtime.h" +#include "arch_GOARCH.h" +#include "malloc.h" +#include "stack.h" + +typedef struct StackCacheNode StackCacheNode; +struct StackCacheNode +{ + StackCacheNode *next; + void* batch[StackCacheBatch-1]; +}; + +static StackCacheNode *stackcache; +static Lock stackcachemu; + +// stackcacherefill/stackcacherelease implement a global cache of stack segments. +// The cache is required to prevent unlimited growth of per-thread caches. +static void +stackcacherefill(void) +{ + StackCacheNode *n; + int32 i, pos; + + runtime·lock(&stackcachemu); + n = stackcache; + if(n) + stackcache = n->next; + runtime·unlock(&stackcachemu); + if(n == nil) { + n = (StackCacheNode*)runtime·SysAlloc(FixedStack*StackCacheBatch); + if(n == nil) + runtime·throw("out of memory (stackcacherefill)"); + runtime·xadd64(&mstats.stacks_sys, FixedStack*StackCacheBatch); + for(i = 0; i < StackCacheBatch-1; i++) + n->batch[i] = (byte*)n + (i+1)*FixedStack; + } + pos = m->stackcachepos; + for(i = 0; i < StackCacheBatch-1; i++) { + m->stackcache[pos] = n->batch[i]; + pos = (pos + 1) % StackCacheSize; + } + m->stackcache[pos] = n; + pos = (pos + 1) % StackCacheSize; + m->stackcachepos = pos; + m->stackcachecnt += StackCacheBatch; +} + +static void +stackcacherelease(void) +{ + StackCacheNode *n; + uint32 i, pos; + + pos = (m->stackcachepos - m->stackcachecnt) % StackCacheSize; + n = (StackCacheNode*)m->stackcache[pos]; + pos = (pos + 1) % StackCacheSize; + for(i = 0; i < StackCacheBatch-1; i++) { + n->batch[i] = m->stackcache[pos]; + pos = (pos + 1) % StackCacheSize; + } + m->stackcachecnt -= StackCacheBatch; + runtime·lock(&stackcachemu); + n->next = stackcache; + stackcache = n; + runtime·unlock(&stackcachemu); +} + +void* +runtime·stackalloc(uint32 n) +{ + uint32 pos; + void *v; + + // Stackalloc must be called on scheduler stack, so that we + // never try to grow the stack during the code that stackalloc runs. + // Doing so would cause a deadlock (issue 1547). + if(g != m->g0) + runtime·throw("stackalloc not on scheduler stack"); + + // Stack allocator uses malloc/free most of the time, + // but if we're in the middle of malloc and need stack, + // we have to do something else to avoid deadlock. + // In that case, we fall back on a fixed-size free-list + // allocator, assuming that inside malloc all the stack + // frames are small, so that all the stack allocations + // will be a single size, the minimum (right now, 5k). + if(n == FixedStack || m->mallocing || m->gcing) { + if(n != FixedStack) { + runtime·printf("stackalloc: in malloc, size=%d want %d\n", FixedStack, n); + runtime·throw("stackalloc"); + } + if(m->stackcachecnt == 0) + stackcacherefill(); + pos = m->stackcachepos; + pos = (pos - 1) % StackCacheSize; + v = m->stackcache[pos]; + m->stackcachepos = pos; + m->stackcachecnt--; + m->stackinuse++; + return v; + } + return runtime·mallocgc(n, FlagNoProfiling|FlagNoGC, 0, 0); +} + +void +runtime·stackfree(void *v, uintptr n) +{ + uint32 pos; + + if(n == FixedStack || m->mallocing || m->gcing) { + if(m->stackcachecnt == StackCacheSize) + stackcacherelease(); + pos = m->stackcachepos; + m->stackcache[pos] = v; + m->stackcachepos = (pos + 1) % StackCacheSize; + m->stackcachecnt++; + m->stackinuse--; + return; + } + runtime·free(v); +} + +// Called from runtime·lessstack when returning from a function which +// allocated a new stack segment. The function's return value is in +// m->cret. 
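stackcacherefill and stackcacherelease above move FixedStack-sized segments between a per-m ring buffer and a global list, one StackCacheBatch at a time, so an idle thread cannot hoard segments and the lock is taken once per batch rather than once per allocation. A simplified user-space model of that batching (my sketch; the sizes are made up and a mutex stands in for the runtime lock):

package main

import (
	"fmt"
	"sync"
)

const (
	segSize   = 4096 // stands in for FixedStack
	batch     = 16   // stands in for StackCacheBatch
	cacheSize = 64   // stands in for StackCacheSize
)

var (
	mu     sync.Mutex
	global [][]byte // global pool, touched only a batch at a time
)

// cache is the per-thread ring: pos is the next free slot, cnt the
// number of cached segments.
type cache struct {
	buf [cacheSize][]byte
	pos int
	cnt int
}

func (c *cache) refill() {
	mu.Lock()
	for i := 0; i < batch; i++ {
		var s []byte
		if n := len(global); n > 0 {
			s, global = global[n-1], global[:n-1]
		} else {
			s = make([]byte, segSize) // grow the pool on demand
		}
		c.buf[c.pos] = s
		c.pos = (c.pos + 1) % cacheSize
	}
	mu.Unlock()
	c.cnt += batch
}

func (c *cache) release() {
	mu.Lock()
	for i := 0; i < batch; i++ {
		c.pos = (c.pos - 1 + cacheSize) % cacheSize
		global = append(global, c.buf[c.pos])
	}
	mu.Unlock()
	c.cnt -= batch
}

func (c *cache) alloc() []byte {
	if c.cnt == 0 {
		c.refill() // one lock acquisition per batch, not per segment
	}
	c.pos = (c.pos - 1 + cacheSize) % cacheSize
	c.cnt--
	return c.buf[c.pos]
}

func (c *cache) free(s []byte) {
	if c.cnt == cacheSize {
		c.release()
	}
	c.buf[c.pos] = s
	c.pos = (c.pos + 1) % cacheSize
	c.cnt++
}

func main() {
	var c cache
	s := c.alloc()
	c.free(s)
	fmt.Println("cached segments:", c.cnt)
}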
+void +runtime·oldstack(void) +{ + Stktop *top; + Gobuf label; + uint32 argsize; + uintptr cret; + byte *sp, *old; + uintptr *src, *dst, *dstend; + G *gp; + int64 goid; + +//printf("oldstack m->cret=%p\n", m->cret); + + gp = m->curg; + top = (Stktop*)gp->stackbase; + old = (byte*)gp->stackguard - StackGuard; + sp = (byte*)top; + argsize = top->argsize; + if(argsize > 0) { + sp -= argsize; + dst = (uintptr*)top->argp; + dstend = dst + argsize/sizeof(*dst); + src = (uintptr*)sp; + while(dst < dstend) + *dst++ = *src++; + } + goid = top->gobuf.g->goid; // fault if g is bad, before gogo + USED(goid); + + label = top->gobuf; + gp->stackbase = (uintptr)top->stackbase; + gp->stackguard = (uintptr)top->stackguard; + if(top->free != 0) + runtime·stackfree(old, top->free); + + cret = m->cret; + m->cret = 0; // drop reference + runtime·gogo(&label, cret); +} + +// Called from reflect·call or from runtime·morestack when a new +// stack segment is needed. Allocate a new stack big enough for +// m->moreframesize bytes, copy m->moreargsize bytes to the new frame, +// and then act as though runtime·lessstack called the function at +// m->morepc. +void +runtime·newstack(void) +{ + int32 framesize, minalloc, argsize; + Stktop *top; + byte *stk, *sp; + uintptr *src, *dst, *dstend; + G *gp; + Gobuf label; + bool reflectcall; + uintptr free; + + framesize = m->moreframesize; + argsize = m->moreargsize; + gp = m->curg; + + if(m->morebuf.sp < gp->stackguard - StackGuard) { + runtime·printf("runtime: split stack overflow: %p < %p\n", m->morebuf.sp, gp->stackguard - StackGuard); + runtime·throw("runtime: split stack overflow"); + } + if(argsize % sizeof(uintptr) != 0) { + runtime·printf("runtime: stack split with misaligned argsize %d\n", argsize); + runtime·throw("runtime: stack split argsize"); + } + + minalloc = 0; + reflectcall = framesize==1; + if(reflectcall) { + framesize = 0; + // moreframesize_minalloc is only set in runtime·gc(), + // that calls newstack via reflect·call(). + minalloc = m->moreframesize_minalloc; + m->moreframesize_minalloc = 0; + if(framesize < minalloc) + framesize = minalloc; + } + + if(reflectcall && minalloc == 0 && m->morebuf.sp - sizeof(Stktop) - argsize - 32 > gp->stackguard) { + // special case: called from reflect.call (framesize==1) + // to call code with an arbitrary argument size, + // and we have enough space on the current stack. + // the new Stktop* is necessary to unwind, but + // we don't need to create a new segment. + top = (Stktop*)(m->morebuf.sp - sizeof(*top)); + stk = (byte*)gp->stackguard - StackGuard; + free = 0; + } else { + // allocate new segment. + framesize += argsize; + framesize += StackExtra; // room for more functions, Stktop. 
+ if(framesize < StackMin) + framesize = StackMin; + framesize += StackSystem; + stk = runtime·stackalloc(framesize); + top = (Stktop*)(stk+framesize-sizeof(*top)); + free = framesize; + } + + if(0) { + runtime·printf("newstack framesize=%d argsize=%d morepc=%p moreargp=%p gobuf=%p, %p top=%p old=%p\n", + framesize, argsize, m->morepc, m->moreargp, m->morebuf.pc, m->morebuf.sp, top, gp->stackbase); + } + + top->stackbase = (byte*)gp->stackbase; + top->stackguard = (byte*)gp->stackguard; + top->gobuf = m->morebuf; + top->argp = m->moreargp; + top->argsize = argsize; + top->free = free; + m->moreargp = nil; + m->morebuf.pc = nil; + m->morebuf.sp = (uintptr)nil; + + // copy flag from panic + top->panic = gp->ispanic; + gp->ispanic = false; + + gp->stackbase = (uintptr)top; + gp->stackguard = (uintptr)stk + StackGuard; + + sp = (byte*)top; + if(argsize > 0) { + sp -= argsize; + dst = (uintptr*)sp; + dstend = dst + argsize/sizeof(*dst); + src = (uintptr*)top->argp; + while(dst < dstend) + *dst++ = *src++; + } + if(thechar == '5') { + // caller would have saved its LR below args. + sp -= sizeof(void*); + *(void**)sp = nil; + } + + // Continue as if lessstack had just called m->morepc + // (the PC that decided to grow the stack). + label.sp = (uintptr)sp; + label.pc = (byte*)runtime·lessstack; + label.g = m->curg; + if(reflectcall) + runtime·gogocallfn(&label, (FuncVal*)m->morepc); + else + runtime·gogocall(&label, m->morepc, m->cret); + + *(int32*)345 = 123; // never return +} diff --git a/src/pkg/runtime/stack.h b/src/pkg/runtime/stack.h index d42385d6c..06b0c568c 100644 --- a/src/pkg/runtime/stack.h +++ b/src/pkg/runtime/stack.h @@ -55,13 +55,19 @@ functions to make sure that this limit cannot be violated. enum { // StackSystem is a number of additional bytes to add // to each stack below the usual guard area for OS-specific - // purposes like signal handling. Used on Windows because - // it does not use a separate stack. + // purposes like signal handling. Used on Windows and on + // Plan 9 because they do not use a separate stack. #ifdef GOOS_windows StackSystem = 512 * sizeof(uintptr), #else +#ifdef GOOS_plan9 + // The size of the note handler frame varies among architectures, + // but 512 bytes should be enough for every implementation. + StackSystem = 512, +#else StackSystem = 0, -#endif +#endif // Plan 9 +#endif // Windows // The amount of extra stack to allocate beyond the size // needed for the single frame that triggered the split. diff --git a/src/pkg/runtime/stack_test.go b/src/pkg/runtime/stack_test.go index 01c6b6e2f..759f7c46e 100644 --- a/src/pkg/runtime/stack_test.go +++ b/src/pkg/runtime/stack_test.go @@ -2,7 +2,7 @@ // Use of this source code is governed by a BSD-style // license that can be found in the LICENSE file. -// Test stack split logic by calling functions of every frame size +// Test stack split logic by calling functions of every frame size // from near 0 up to and beyond the default segment size (4k). // Each of those functions reports its SP + stack limit, and then // the test (the caller) checks that those make sense. By not @@ -34,6 +34,7 @@ package runtime_test import ( . 
"runtime" "testing" + "time" "unsafe" ) @@ -1526,3 +1527,51 @@ func stack4988() (uintptr, uintptr) { var buf [4988]byte; use(buf[:]); return St func stack4992() (uintptr, uintptr) { var buf [4992]byte; use(buf[:]); return Stackguard() } func stack4996() (uintptr, uintptr) { var buf [4996]byte; use(buf[:]); return Stackguard() } func stack5000() (uintptr, uintptr) { var buf [5000]byte; use(buf[:]); return Stackguard() } + +// TestStackMem measures per-thread stack segment cache behavior. +// The test consumed up to 500MB in the past. +func TestStackMem(t *testing.T) { + const ( + BatchSize = 32 + BatchCount = 512 + ArraySize = 1024 + RecursionDepth = 128 + ) + if testing.Short() { + return + } + defer GOMAXPROCS(GOMAXPROCS(BatchSize)) + s0 := new(MemStats) + ReadMemStats(s0) + for b := 0; b < BatchCount; b++ { + c := make(chan bool, BatchSize) + for i := 0; i < BatchSize; i++ { + go func() { + var f func(k int, a [ArraySize]byte) + f = func(k int, a [ArraySize]byte) { + if k == 0 { + time.Sleep(time.Millisecond) + return + } + f(k-1, a) + } + f(RecursionDepth, [ArraySize]byte{}) + c <- true + }() + } + for i := 0; i < BatchSize; i++ { + <-c + } + } + s1 := new(MemStats) + ReadMemStats(s1) + consumed := s1.StackSys - s0.StackSys + t.Logf("Consumed %vMB for stack mem", consumed>>20) + estimate := uint64(8 * BatchSize * ArraySize * RecursionDepth) // 8 is to reduce flakiness. + if consumed > estimate { + t.Fatalf("Stack mem: want %v, got %v", estimate, consumed) + } + if s1.StackInuse > 4<<20 { + t.Fatalf("Stack inuse: want %v, got %v", 4<<20, s1.StackInuse) + } +} diff --git a/src/pkg/runtime/string.goc b/src/pkg/runtime/string.goc index 7e1f8a1e8..c0d3f2bde 100644 --- a/src/pkg/runtime/string.goc +++ b/src/pkg/runtime/string.goc @@ -6,6 +6,7 @@ package runtime #include "runtime.h" #include "arch_GOARCH.h" #include "malloc.h" +#include "race.h" String runtime·emptystring; @@ -33,10 +34,10 @@ runtime·findnullw(uint16 *s) return l; } -uint32 runtime·maxstring = 256; +uint32 runtime·maxstring = 256; // a hint for print static String -gostringsize(int32 l) +gostringsize(intgo l) { String s; uint32 ms; @@ -58,7 +59,7 @@ gostringsize(int32 l) String runtime·gostring(byte *str) { - int32 l; + intgo l; String s; l = runtime·findnull(str); @@ -68,7 +69,7 @@ runtime·gostring(byte *str) } String -runtime·gostringn(byte *str, int32 l) +runtime·gostringn(byte *str, intgo l) { String s; @@ -78,7 +79,7 @@ runtime·gostringn(byte *str, int32 l) } Slice -runtime·gobytes(byte *p, int32 n) +runtime·gobytes(byte *p, intgo n) { Slice sl; @@ -102,7 +103,7 @@ runtime·gostringnocopy(byte *str) String runtime·gostringw(uint16 *str) { - int32 n1, n2, i; + intgo n1, n2, i; byte buf[8]; String s; @@ -139,17 +140,26 @@ runtime·catstring(String s1, String s2) } static String -concatstring(int32 n, String *s) +concatstring(intgo n, String *s) { - int32 i, l; + intgo i, l, count; String out; l = 0; + count = 0; for(i=0; i<n; i++) { if(l + s[i].len < l) runtime·throw("string concatenation too long"); l += s[i].len; + if(s[i].len > 0) { + count++; + out = s[i]; + } } + if(count == 0) + return runtime·emptystring; + if(count == 1) // zero or one non-empty string in concatenation + return out; out = gostringsize(l); l = 0; @@ -163,14 +173,14 @@ concatstring(int32 n, String *s) #pragma textflag 7 // s1 is the first of n strings. // the output string follows. 
-func concatstring(n int32, s1 String) { +func concatstring(n int, s1 String) { (&s1)[n] = concatstring(n, &s1); } static int32 cmpstring(String s1, String s2) { - uint32 i, l; + uintgo i, l; byte c1, c2; l = s1.len; @@ -191,14 +201,34 @@ cmpstring(String s1, String s2) return 0; } -func cmpstring(s1 String, s2 String) (v int32) { +func cmpstring(s1 String, s2 String) (v int) { v = cmpstring(s1, s2); } +func eqstring(s1 String, s2 String) (v bool) { + uintgo i, l; + + if(s1.len != s2.len) { + v = false; + return; + } + if(s1.str == s2.str) { + v = true; + return; + } + l = s1.len; + for(i=0; i<l; i++) + if(s1.str[i] != s2.str[i]) { + v = false; + return; + } + v = true; +} + int32 runtime·strcmp(byte *s1, byte *s2) { - uint32 i; + uintptr i; byte c1, c2; for(i=0;; i++) { @@ -235,31 +265,6 @@ runtime·strstr(byte *s1, byte *s2) return nil; } -func slicestring(si String, lindex int32, hindex int32) (so String) { - int32 l; - - if(lindex < 0 || lindex > si.len || - hindex < lindex || hindex > si.len) { - runtime·panicslice(); - } - - l = hindex-lindex; - so.str = si.str + lindex; - so.len = l; -} - -func slicestring1(si String, lindex int32) (so String) { - int32 l; - - if(lindex < 0 || lindex > si.len) { - runtime·panicslice(); - } - - l = si.len-lindex; - so.str = si.str + lindex; - so.len = l; -} - func intstring(v int64) (s String) { s = gostringsize(8); s.len = runtime·runetochar(s.str, v); @@ -267,6 +272,12 @@ func intstring(v int64) (s String) { } func slicebytetostring(b Slice) (s String) { + void *pc; + + if(raceenabled) { + pc = runtime·getcallerpc(&b); + runtime·racereadrangepc(b.array, b.len, 1, pc, runtime·slicebytetostring); + } s = gostringsize(b.len); runtime·memmove(s.str, b.array, s.len); } @@ -279,10 +290,15 @@ func stringtoslicebyte(s String) (b Slice) { } func slicerunetostring(b Slice) (s String) { - int32 siz1, siz2, i; + intgo siz1, siz2, i; int32 *a; byte dum[8]; + void *pc; + if(raceenabled) { + pc = runtime·getcallerpc(&b); + runtime·racereadrangepc(b.array, b.len*sizeof(*a), sizeof(*a), pc, runtime·slicerunetostring); + } a = (int32*)b.array; siz1 = 0; for(i=0; i<b.len; i++) { @@ -302,7 +318,7 @@ func slicerunetostring(b Slice) (s String) { } func stringtoslicerune(s String) (b Slice) { - int32 n; + intgo n; int32 dum, *r; uint8 *p, *ep; @@ -330,7 +346,7 @@ enum Runeself = 0x80, }; -func stringiter(s String, k int32) (retk int32) { +func stringiter(s String, k int) (retk int) { int32 l; if(k >= s.len) { @@ -351,7 +367,7 @@ func stringiter(s String, k int32) (retk int32) { out: } -func stringiter2(s String, k int32) (retk int32, retv int32) { +func stringiter2(s String, k int) (retk int, retv int32) { if(k >= s.len) { // retk=0 is end of iteration retk = 0; diff --git a/src/pkg/runtime/string_test.go b/src/pkg/runtime/string_test.go new file mode 100644 index 000000000..8f13f0f42 --- /dev/null +++ b/src/pkg/runtime/string_test.go @@ -0,0 +1,45 @@ +package runtime_test + +import ( + "testing" +) + +func BenchmarkCompareStringEqual(b *testing.B) { + bytes := []byte("Hello Gophers!") + s1, s2 := string(bytes), string(bytes) + for i := 0; i < b.N; i++ { + if s1 != s2 { + b.Fatal("s1 != s2") + } + } +} + +func BenchmarkCompareStringIdentical(b *testing.B) { + s1 := "Hello Gophers!" + s2 := s1 + for i := 0; i < b.N; i++ { + if s1 != s2 { + b.Fatal("s1 != s2") + } + } +} + +func BenchmarkCompareStringSameLength(b *testing.B) { + s1 := "Hello Gophers!" 
+ s2 := "Hello, Gophers" + for i := 0; i < b.N; i++ { + if s1 == s2 { + b.Fatal("s1 == s2") + } + } +} + +func BenchmarkCompareStringDifferentLength(b *testing.B) { + s1 := "Hello Gophers!" + s2 := "Hello, Gophers!" + for i := 0; i < b.N; i++ { + if s1 == s2 { + b.Fatal("s1 == s2") + } + } +} diff --git a/src/pkg/runtime/symtab.c b/src/pkg/runtime/symtab.c index f29276bd7..d7221c476 100644 --- a/src/pkg/runtime/symtab.c +++ b/src/pkg/runtime/symtab.c @@ -2,20 +2,59 @@ // Use of this source code is governed by a BSD-style // license that can be found in the LICENSE file. -// Runtime symbol table access. Work in progress. -// The Plan 9 symbol table is not in a particularly convenient form. -// The routines here massage it into a more usable form; eventually -// we'll change 6l to do this for us, but it is easier to experiment -// here than to change 6l and all the other tools. +// Runtime symbol table parsing. // -// The symbol table also needs to be better integrated with the type -// strings table in the future. This is just a quick way to get started -// and figure out exactly what we want. +// The Go tools use a symbol table derived from the Plan 9 symbol table +// format. The symbol table is kept in its own section treated as +// read-only memory when the binary is running: the binary consults the +// table. +// +// The format used by Go 1.0 was basically the Plan 9 format. Each entry +// is variable sized but had this format: +// +// 4-byte value, big endian +// 1-byte type ([A-Za-z] + 0x80) +// name, NUL terminated (or for 'z' and 'Z' entries, double-NUL terminated) +// 4-byte Go type address, big endian (new in Go) +// +// In order to support greater interoperation with standard toolchains, +// Go 1.1 uses a more flexible yet smaller encoding of the entries. +// The overall structure is unchanged from Go 1.0 and, for that matter, +// from Plan 9. +// +// The Go 1.1 table is a re-encoding of the data in a Go 1.0 table. +// To identify a new table as new, it begins one of two eight-byte +// sequences: +// +// FF FF FF FD 00 00 00 xx - big endian new table +// FD FF FF FF 00 00 00 xx - little endian new table +// +// This sequence was chosen because old tables stop at an entry with type +// 0, so old code reading a new table will see only an empty table. The +// first four bytes are the target-endian encoding of 0xfffffffd. The +// final xx gives AddrSize, the width of a full-width address. +// +// After that header, each entry is encoded as follows. +// +// 1-byte type (0-51 + two flag bits) +// AddrSize-byte value, host-endian OR varint-encoded value +// AddrSize-byte Go type address OR nothing +// [n] name, terminated as before +// +// The type byte comes first, but 'A' encodes as 0 and 'a' as 26, so that +// the type itself is only in the low 6 bits. The upper two bits specify +// the format of the next two fields. If the 0x40 bit is set, the value +// is encoded as an full-width 4- or 8-byte target-endian word. Otherwise +// the value is a varint-encoded number. If the 0x80 bit is set, the Go +// type is present, again as a 4- or 8-byte target-endian word. If not, +// there is no Go type in this entry. The NUL-terminated name ends the +// entry. #include "runtime.h" #include "defs_GOOS_GOARCH.h" #include "os_GOOS.h" #include "arch_GOARCH.h" +#include "malloc.h" extern byte pclntab[], epclntab[], symtab[], esymtab[]; @@ -28,24 +67,100 @@ struct Sym // byte *gotype; }; +static uintptr mainoffset; + +// A dynamically allocated string containing multiple substrings. 
+// Individual strings are slices of hugestring. +static String hugestring; +static int32 hugestring_len; + +extern void main·main(void); + +static uintptr +readword(byte **pp, byte *ep) +{ + byte *p; + + p = *pp; + if(ep - p < sizeof(void*)) { + *pp = ep; + return 0; + } + *pp = p + sizeof(void*); + + // Hairy, but only one of these four cases gets compiled. + if(sizeof(void*) == 8) { + if(BigEndian) { + return ((uint64)p[0]<<56) | ((uint64)p[1]<<48) | ((uint64)p[2]<<40) | ((uint64)p[3]<<32) | + ((uint64)p[4]<<24) | ((uint64)p[5]<<16) | ((uint64)p[6]<<8) | ((uint64)p[7]); + } + return ((uint64)p[7]<<56) | ((uint64)p[6]<<48) | ((uint64)p[5]<<40) | ((uint64)p[4]<<32) | + ((uint64)p[3]<<24) | ((uint64)p[2]<<16) | ((uint64)p[1]<<8) | ((uint64)p[0]); + } + if(BigEndian) { + return ((uint32)p[0]<<24) | ((uint32)p[1]<<16) | ((uint32)p[2]<<8) | ((uint32)p[3]); + } + return ((uint32)p[3]<<24) | ((uint32)p[2]<<16) | ((uint32)p[1]<<8) | ((uint32)p[0]); +} + // Walk over symtab, calling fn(&s) for each symbol. static void walksymtab(void (*fn)(Sym*)) { byte *p, *ep, *q; Sym s; + int32 widevalue, havetype, shift; p = symtab; ep = esymtab; + + // Table must begin with correct magic number. + if(ep - p < 8 || p[4] != 0x00 || p[5] != 0x00 || p[6] != 0x00 || p[7] != sizeof(void*)) + return; + if(BigEndian) { + if(p[0] != 0xff || p[1] != 0xff || p[2] != 0xff || p[3] != 0xfd) + return; + } else { + if(p[0] != 0xfd || p[1] != 0xff || p[2] != 0xff || p[3] != 0xff) + return; + } + p += 8; + while(p < ep) { - if(p + 7 > ep) - break; - s.value = ((uint32)p[0]<<24) | ((uint32)p[1]<<16) | ((uint32)p[2]<<8) | ((uint32)p[3]); + s.symtype = p[0]&0x3F; + widevalue = p[0]&0x40; + havetype = p[0]&0x80; + if(s.symtype < 26) + s.symtype += 'A'; + else + s.symtype += 'a' - 26; + p++; + + // Value, either full-width or varint-encoded. + if(widevalue) { + s.value = readword(&p, ep); + } else { + s.value = 0; + shift = 0; + while(p < ep && (p[0]&0x80) != 0) { + s.value |= (uintptr)(p[0]&0x7F)<<shift; + shift += 7; + p++; + } + if(p >= ep) + break; + s.value |= (uintptr)p[0]<<shift; + p++; + } + + // Go type, if present. Ignored but must skip over. + if(havetype) + readword(&p, ep); - if(!(p[4]&0x80)) + // Name. + if(ep - p < 2) break; - s.symtype = p[4] & ~0x80; - p += 5; + s.name = p; if(s.symtype == 'z' || s.symtype == 'Z') { // path reference string - skip first byte, @@ -65,7 +180,7 @@ walksymtab(void (*fn)(Sym*)) break; p = q+1; } - p += 4; // go type + fn(&s); } } @@ -80,12 +195,13 @@ static int32 nfname; static uint32 funcinit; static Lock funclock; +static uintptr lastvalue; static void dofunc(Sym *sym) { Func *f; - + switch(sym->symtype) { case 't': case 'T': @@ -93,6 +209,11 @@ dofunc(Sym *sym) case 'L': if(runtime·strcmp(sym->name, (byte*)"etext") == 0) break; + if(sym->value < lastvalue) { + runtime·printf("symbols out of order: %p before %p\n", lastvalue, sym->value); + runtime·throw("malformed symbol table"); + } + lastvalue = sym->value; if(func == nil) { nfunc++; break; @@ -104,24 +225,24 @@ dofunc(Sym *sym) f->frame = -sizeof(uintptr); break; case 'm': - if(nfunc > 0 && func != nil) - func[nfunc-1].frame += sym->value; - break; - case 'p': - if(nfunc > 0 && func != nil) { - f = &func[nfunc-1]; - // args counts 32-bit words. - // sym->value is the arg's offset. - // don't know width of this arg, so assume it is 64 bits. 
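A compact decoder for the entry header described in the new symtab.c comment may help: the low six bits select the type letter, 0x40 selects a full-width value instead of a varint, and 0x80 marks a trailing Go type word. This is my sketch, not code from the tree:

package main

import "fmt"

// parseTypeByte splits the leading byte of a Go 1.1 symbol entry.
func parseTypeByte(b byte) (typ byte, wide, hasGoType bool) {
	t := b & 0x3F
	if t < 26 {
		typ = 'A' + t
	} else {
		typ = 'a' + t - 26
	}
	return typ, b&0x40 != 0, b&0x80 != 0
}

// readVarint decodes the little-endian base-128 value used when the
// 0x40 flag is clear, returning the value and the bytes consumed.
func readVarint(p []byte) (v uint64, n int) {
	var shift uint
	for i, b := range p {
		v |= uint64(b&0x7F) << shift
		if b&0x80 == 0 {
			return v, i + 1
		}
		shift += 7
	}
	return 0, 0
}

func main() {
	typ, wide, hasType := parseTypeByte(0x80 | 19) // 'T' entry with a Go type word
	fmt.Printf("type %c wide=%v gotype=%v\n", typ, wide, hasType)
	v, n := readVarint([]byte{0x96, 0x01}) // decodes to 150 in 2 bytes
	fmt.Println(v, n)
}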
- if(f->args < sym->value/4 + 2) - f->args = sym->value/4 + 2; + if(nfunc <= 0 || func == nil) + break; + if(runtime·strcmp(sym->name, (byte*)".frame") == 0) + func[nfunc-1].frame = sym->value; + else if(runtime·strcmp(sym->name, (byte*)".locals") == 0) + func[nfunc-1].locals = sym->value; + else if(runtime·strcmp(sym->name, (byte*)".args") == 0) + func[nfunc-1].args = sym->value; + else { + runtime·printf("invalid 'm' symbol named '%s'\n", sym->name); + runtime·throw("mangled symbol table"); } break; case 'f': if(fname == nil) { if(sym->value >= nfname) { if(sym->value >= 0x10000) { - runtime·printf("invalid symbol file index %p\n", sym->value); + runtime·printf("runtime: invalid symbol file index %p\n", sym->value); runtime·throw("mangled symbol table"); } nfname = sym->value+1; @@ -135,14 +256,15 @@ dofunc(Sym *sym) // put together the path name for a z entry. // the f entries have been accumulated into fname already. -static void +// returns the length of the path name. +static int32 makepath(byte *buf, int32 nbuf, byte *path) { int32 n, len; byte *p, *ep, *q; if(nbuf <= 0) - return; + return 0; p = buf; ep = buf + nbuf; @@ -163,6 +285,26 @@ makepath(byte *buf, int32 nbuf, byte *path) runtime·memmove(p, q, len+1); p += len; } + return p - buf; +} + +// appends p to hugestring +static String +gostringn(byte *p, int32 l) +{ + String s; + + if(l == 0) + return runtime·emptystring; + if(hugestring.str == nil) { + hugestring_len += l; + return runtime·emptystring; + } + s.str = hugestring.str + hugestring.len; + s.len = l; + hugestring.len += s.len; + runtime·memmove(s.str, p, l); + return s; } // walk symtab accumulating path names for use by pc/ln table. @@ -181,11 +323,13 @@ dosrcline(Sym *sym) static int32 incstart; static int32 nfunc, nfile, nhist; Func *f; - int32 i; + int32 i, l; switch(sym->symtype) { case 't': case 'T': + if(hugestring.str == nil) + break; if(runtime·strcmp(sym->name, (byte*)"etext") == 0) break; f = &func[nfunc++]; @@ -200,23 +344,23 @@ dosrcline(Sym *sym) case 'z': if(sym->value == 1) { // entry for main source file for a new object. - makepath(srcbuf, sizeof srcbuf, sym->name+1); + l = makepath(srcbuf, sizeof srcbuf, sym->name+1); nhist = 0; nfile = 0; if(nfile == nelem(files)) return; - files[nfile].srcstring = runtime·gostring(srcbuf); + files[nfile].srcstring = gostringn(srcbuf, l); files[nfile].aline = 0; files[nfile++].delta = 0; } else { // push or pop of included file. - makepath(srcbuf, sizeof srcbuf, sym->name+1); + l = makepath(srcbuf, sizeof srcbuf, sym->name+1); if(srcbuf[0] != '\0') { if(nhist++ == 0) incstart = sym->value; if(nhist == 0 && nfile < nelem(files)) { // new top-level file - files[nfile].srcstring = runtime·gostring(srcbuf); + files[nfile].srcstring = gostringn(srcbuf, l); files[nfile].aline = sym->value; // this is "line 0" files[nfile++].delta = sym->value - 1; @@ -275,7 +419,7 @@ splitpcln(void) line += *p++; else line -= *p++ - 64; - + // pc, line now match. // Because the state machine begins at pc==entry and line==0, // it can happen - just at the beginning! - that the update may @@ -297,7 +441,7 @@ splitpcln(void) while(f < ef && pc >= (f+1)->entry); f->pcln.array = p; // pc0 and ln0 are the starting values for - // the loop over f->pcln, so pc must be + // the loop over f->pcln, so pc must be // adjusted by the same pcquant update // that we're going to do as we continue our loop. 
f->pc0 = pc + pcquant; @@ -323,11 +467,11 @@ runtime·funcline(Func *f, uintptr targetpc) uintptr pc; int32 line; int32 pcquant; - + enum { debug = 0 }; - + switch(thechar) { case '5': pcquant = 4; @@ -354,7 +498,7 @@ runtime·funcline(Func *f, uintptr targetpc) if(debug && !runtime·panicking) runtime·printf("pc<%p targetpc=%p line=%d\n", pc, targetpc, line); - + // If the pc has advanced too far or we're out of data, // stop and the last known line number. if(pc > targetpc || p >= ep) @@ -382,7 +526,7 @@ runtime·funcline(Func *f, uintptr targetpc) } void -runtime·funcline_go(Func *f, uintptr targetpc, String retfile, int32 retline) +runtime·funcline_go(Func *f, uintptr targetpc, String retfile, intgo retline) { retfile = f->src; retline = runtime·funcline(f, targetpc); @@ -406,20 +550,30 @@ buildfuncs(void) // count funcs, fnames nfunc = 0; nfname = 0; + lastvalue = 0; walksymtab(dofunc); - // initialize tables - func = runtime·mal((nfunc+1)*sizeof func[0]); + // Initialize tables. + // Can use FlagNoPointers - all pointers either point into sections of the executable + // or point into hugestring. + func = runtime·mallocgc((nfunc+1)*sizeof func[0], FlagNoPointers, 0, 1); func[nfunc].entry = (uint64)etext; - fname = runtime·mal(nfname*sizeof fname[0]); + fname = runtime·mallocgc(nfname*sizeof fname[0], FlagNoPointers, 0, 1); nfunc = 0; + lastvalue = 0; walksymtab(dofunc); // split pc/ln table by func splitpcln(); // record src file and line info for each func - walksymtab(dosrcline); + walksymtab(dosrcline); // pass 1: determine hugestring_len + hugestring.str = runtime·mallocgc(hugestring_len, FlagNoPointers, 0, 0); + hugestring.len = 0; + walksymtab(dosrcline); // pass 2: fill and use hugestring + + if(hugestring.len != hugestring_len) + runtime·throw("buildfunc: problem in initialization procedure"); m->nomemprof--; } @@ -482,7 +636,7 @@ static bool hasprefix(String s, int8 *p) { int32 i; - + for(i=0; i<s.len; i++) { if(p[i] == 0) return 1; @@ -496,7 +650,7 @@ static bool contains(String s, int8 *p) { int32 i; - + if(p[0] == 0) return 1; for(i=0; i<s.len; i++) { @@ -509,11 +663,13 @@ contains(String s, int8 *p) } bool -runtime·showframe(Func *f) +runtime·showframe(Func *f, bool current) { static int32 traceback = -1; - + + if(current && m->throwing > 0) + return 1; if(traceback < 0) traceback = runtime·gotraceback(); - return traceback > 1 || contains(f->name, ".") && !hasprefix(f->name, "runtime."); + return traceback > 1 || f != nil && contains(f->name, ".") && !hasprefix(f->name, "runtime."); } diff --git a/src/pkg/runtime/sys_darwin_386.s b/src/pkg/runtime/sys_darwin_386.s index 3cf3506ad..8a938f9f4 100644 --- a/src/pkg/runtime/sys_darwin_386.s +++ b/src/pkg/runtime/sys_darwin_386.s @@ -47,8 +47,7 @@ TEXT runtime·mmap(SB),7,$0 TEXT runtime·madvise(SB),7,$0 MOVL $75, AX INT $0x80 - JAE 2(PC) - MOVL $0xf1, 0xf1 // crash + // ignore failure - maybe pages are locked RET TEXT runtime·munmap(SB),7,$0 @@ -63,40 +62,133 @@ TEXT runtime·setitimer(SB),7,$0 INT $0x80 RET -// func now() (sec int64, nsec int32) -TEXT time·now(SB), 7, $32 - LEAL 12(SP), AX // must be non-nil, unused - MOVL AX, 4(SP) - MOVL $0, 8(SP) // time zone pointer - MOVL $116, AX - INT $0x80 - MOVL DX, BX +// OS X comm page time offsets +// http://www.opensource.apple.com/source/xnu/xnu-1699.26.8/osfmk/i386/cpu_capabilities.h +#define cpu_capabilities 0x20 +#define nt_tsc_base 0x50 +#define nt_scale 0x58 +#define nt_shift 0x5c +#define nt_ns_base 0x60 +#define nt_generation 0x68 +#define gtod_generation 0x6c +#define 
gtod_ns_base 0x70 +#define gtod_sec_base 0x78 + +// called from assembly +// 64-bit unix nanoseconds returned in DX:AX. +// I'd much rather write this in C but we need +// assembly for the 96-bit multiply and RDTSC. +TEXT runtime·now(SB),7,$40 + MOVL $0xffff0000, BP /* comm page base */ + + // Test for slow CPU. If so, the math is completely + // different, and unimplemented here, so use the + // system call. + MOVL cpu_capabilities(BP), AX + TESTL $0x4000, AX + JNZ systime + + // Loop trying to take a consistent snapshot + // of the time parameters. +timeloop: + MOVL gtod_generation(BP), BX + TESTL BX, BX + JZ systime + MOVL nt_generation(BP), CX + TESTL CX, CX + JZ timeloop + RDTSC + MOVL nt_tsc_base(BP), SI + MOVL (nt_tsc_base+4)(BP), DI + MOVL SI, 0(SP) + MOVL DI, 4(SP) + MOVL nt_scale(BP), SI + MOVL SI, 8(SP) + MOVL nt_ns_base(BP), SI + MOVL (nt_ns_base+4)(BP), DI + MOVL SI, 12(SP) + MOVL DI, 16(SP) + CMPL nt_generation(BP), CX + JNE timeloop + MOVL gtod_ns_base(BP), SI + MOVL (gtod_ns_base+4)(BP), DI + MOVL SI, 20(SP) + MOVL DI, 24(SP) + MOVL gtod_sec_base(BP), SI + MOVL (gtod_sec_base+4)(BP), DI + MOVL SI, 28(SP) + MOVL DI, 32(SP) + CMPL gtod_generation(BP), BX + JNE timeloop + + // Gathered all the data we need. Compute time. + // ((tsc - nt_tsc_base) * nt_scale) >> 32 + nt_ns_base - gtod_ns_base + gtod_sec_base*1e9 + // The multiply and shift extracts the top 64 bits of the 96-bit product. + SUBL 0(SP), AX // DX:AX = (tsc - nt_tsc_base) + SBBL 4(SP), DX + + // We have x = tsc - nt_tsc_base - DX:AX to be + // multiplied by y = nt_scale = 8(SP), keeping the top 64 bits of the 96-bit product. + // x*y = (x&0xffffffff)*y + (x&0xffffffff00000000)*y + // (x*y)>>32 = ((x&0xffffffff)*y)>>32 + (x>>32)*y + MOVL DX, CX // SI = (x&0xffffffff)*y >> 32 + MOVL $0, DX + MULL 8(SP) + MOVL DX, SI - // sec is in AX, usec in BX - MOVL AX, sec+0(FP) - MOVL $0, sec+4(FP) - IMULL $1000, BX - MOVL BX, nsec+8(FP) + MOVL CX, AX // DX:AX = (x>>32)*y + MOVL $0, DX + MULL 8(SP) + + ADDL SI, AX // DX:AX += (x&0xffffffff)*y >> 32 + ADCL $0, DX + + // DX:AX is now ((tsc - nt_tsc_base) * nt_scale) >> 32. 
+ ADDL 12(SP), AX // DX:AX += nt_ns_base + ADCL 16(SP), DX + SUBL 20(SP), AX // DX:AX -= gtod_ns_base + SBBL 24(SP), DX + MOVL AX, SI // DI:SI = DX:AX + MOVL DX, DI + MOVL 28(SP), AX // DX:AX = gtod_sec_base*1e9 + MOVL 32(SP), DX + MOVL $1000000000, CX + MULL CX + ADDL SI, AX // DX:AX += DI:SI + ADCL DI, DX RET -// int64 nanotime(void) so really -// void nanotime(int64 *nsec) -TEXT runtime·nanotime(SB), 7, $32 +systime: + // Fall back to system call (usually first call in this thread) LEAL 12(SP), AX // must be non-nil, unused MOVL AX, 4(SP) MOVL $0, 8(SP) // time zone pointer MOVL $116, AX INT $0x80 - MOVL DX, BX - - // sec is in AX, usec in BX + // sec is in AX, usec in DX // convert to DX:AX nsec + MOVL DX, BX MOVL $1000000000, CX MULL CX IMULL $1000, BX ADDL BX, AX ADCL $0, DX + RET + +// func now() (sec int64, nsec int32) +TEXT time·now(SB),7,$0 + CALL runtime·now(SB) + MOVL $1000000000, CX + DIVL CX + MOVL AX, sec+0(FP) + MOVL $0, sec+4(FP) + MOVL DX, nsec+8(FP) + RET +// int64 nanotime(void) so really +// void nanotime(int64 *nsec) +TEXT runtime·nanotime(SB),7,$0 + CALL runtime·now(SB) MOVL ret+0(FP), DI MOVL AX, 0(DI) MOVL DX, 4(DI) @@ -120,8 +212,8 @@ TEXT runtime·sigaction(SB),7,$0 // It is called with the following arguments on the stack: // 0(FP) "return address" - ignored // 4(FP) actual handler -// 8(FP) siginfo style - ignored -// 12(FP) signal number +// 8(FP) signal number +// 12(FP) siginfo style // 16(FP) siginfo // 20(FP) context TEXT runtime·sigtramp(SB),7,$40 @@ -130,8 +222,11 @@ TEXT runtime·sigtramp(SB),7,$40 // check that m exists MOVL m(CX), BP CMPL BP, $0 - JNE 2(PC) + JNE 5(PC) + MOVL sig+8(FP), BX + MOVL BX, 0(SP) CALL runtime·badsignal(SB) + RET // save g MOVL g(CX), DI @@ -196,7 +291,7 @@ TEXT runtime·usleep(SB),7,$32 INT $0x80 RET -// void bsdthread_create(void *stk, M *m, G *g, void (*fn)(void)) +// void bsdthread_create(void *stk, M *mp, G *gp, void (*fn)(void)) // System call args are: func arg stack pthread flags. TEXT runtime·bsdthread_create(SB),7,$32 MOVL $360, AX @@ -268,8 +363,10 @@ TEXT runtime·bsdthread_register(SB),7,$40 MOVL $0, 20(SP) // targetconc_ptr MOVL $0, 24(SP) // dispatchqueue_offset INT $0x80 - JAE 2(PC) - MOVL $0xf1, 0xf1 // crash + JAE 3(PC) + NEGL AX + RET + MOVL $0, AX RET // Invoke Mach system call. diff --git a/src/pkg/runtime/sys_darwin_amd64.s b/src/pkg/runtime/sys_darwin_amd64.s index 90571baae..4e43a76c3 100644 --- a/src/pkg/runtime/sys_darwin_amd64.s +++ b/src/pkg/runtime/sys_darwin_amd64.s @@ -61,30 +61,63 @@ TEXT runtime·madvise(SB), 7, $0 MOVL 24(SP), DX // arg 3 advice MOVL $(0x2000000+75), AX // syscall entry madvise SYSCALL - JCC 2(PC) - MOVL $0xf1, 0xf1 // crash + // ignore failure - maybe pages are locked RET -// func now() (sec int64, nsec int32) -TEXT time·now(SB), 7, $32 - MOVQ SP, DI // must be non-nil, unused - MOVQ $0, SI - MOVL $(0x2000000+116), AX - SYSCALL - - // sec is in AX, usec in DX - MOVQ AX, sec+0(FP) - IMULQ $1000, DX - MOVL DX, nsec+8(FP) - RET +// OS X comm page time offsets +// http://www.opensource.apple.com/source/xnu/xnu-1699.26.8/osfmk/i386/cpu_capabilities.h +#define nt_tsc_base 0x50 +#define nt_scale 0x58 +#define nt_shift 0x5c +#define nt_ns_base 0x60 +#define nt_generation 0x68 +#define gtod_generation 0x6c +#define gtod_ns_base 0x70 +#define gtod_sec_base 0x78 // int64 nanotime(void) TEXT runtime·nanotime(SB), 7, $32 + MOVQ $0x7fffffe00000, BP /* comm page base */ + // Loop trying to take a consistent snapshot + // of the time parameters. 
+timeloop: + MOVL gtod_generation(BP), R8 + TESTL R8, R8 + JZ systime + MOVL nt_generation(BP), R9 + TESTL R9, R9 + JZ timeloop + RDTSC + MOVQ nt_tsc_base(BP), R10 + MOVL nt_scale(BP), R11 + MOVQ nt_ns_base(BP), R12 + CMPL nt_generation(BP), R9 + JNE timeloop + MOVQ gtod_ns_base(BP), R13 + MOVQ gtod_sec_base(BP), R14 + CMPL gtod_generation(BP), R8 + JNE timeloop + + // Gathered all the data we need. Compute time. + // ((tsc - nt_tsc_base) * nt_scale) >> 32 + nt_ns_base - gtod_ns_base + gtod_sec_base*1e9 + // The multiply and shift extracts the top 64 bits of the 96-bit product. + SHLQ $32, DX + ADDQ DX, AX + SUBQ R10, AX + MULQ R11 + SHRQ $32, AX:DX + ADDQ R12, AX + SUBQ R13, AX + IMULQ $1000000000, R14 + ADDQ R14, AX + RET + +systime: + // Fall back to system call (usually first call in this thread). MOVQ SP, DI // must be non-nil, unused MOVQ $0, SI MOVL $(0x2000000+116), AX SYSCALL - // sec is in AX, usec in DX // return nsec in AX IMULQ $1000000000, AX @@ -92,6 +125,25 @@ TEXT runtime·nanotime(SB), 7, $32 ADDQ DX, AX RET +// func now() (sec int64, nsec int32) +TEXT time·now(SB),7,$0 + CALL runtime·nanotime(SB) + + // generated code for + // func f(x uint64) (uint64, uint64) { return x/1000000000, x%1000000000 } + // adapted to reduce duplication + MOVQ AX, CX + MOVQ $1360296554856532783, AX + MULQ CX + ADDQ CX, DX + RCRQ $1, DX + SHRQ $29, DX + MOVQ DX, sec+0(FP) + IMULQ $1000000000, DX + SUBQ DX, CX + MOVL CX, nsec+8(FP) + RET + TEXT runtime·sigprocmask(SB),7,$0 MOVL 8(SP), DI MOVQ 16(SP), SI @@ -120,8 +172,10 @@ TEXT runtime·sigtramp(SB),7,$64 // check that m exists MOVQ m(BX), BP CMPQ BP, $0 - JNE 2(PC) + JNE 4(PC) + MOVL DX, 0(SP) CALL runtime·badsignal(SB) + RET // save g MOVQ g(BX), R10 @@ -199,7 +253,7 @@ TEXT runtime·usleep(SB),7,$16 SYSCALL RET -// void bsdthread_create(void *stk, M *m, G *g, void (*fn)(void)) +// void bsdthread_create(void *stk, M *mp, G *gp, void (*fn)(void)) TEXT runtime·bsdthread_create(SB),7,$0 // Set up arguments to bsdthread_create system call. // The ones in quotes pass through to the thread callback @@ -265,8 +319,10 @@ TEXT runtime·bsdthread_register(SB),7,$0 MOVQ $0, R9 // dispatchqueue_offset MOVQ $(0x2000000+366), AX // bsdthread_register SYSCALL - JCC 2(PC) - MOVL $0xf1, 0xf1 // crash + JCC 3(PC) + NEGL AX + RET + MOVL $0, AX RET // Mach system calls use 0x1000000 instead of the BSD's 0x2000000. 
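Both comm-page readers above (386 and amd64) implement the same algorithm: loop until a consistent, generation-checked snapshot of the kernel's time parameters is obtained, then compute ((tsc - nt_tsc_base) * nt_scale) >> 32 + nt_ns_base - gtod_ns_base + gtod_sec_base*1e9. A C sketch of that loop follows; the field names mirror the offsets defined above, while rdtsc() and fallback_ns() are assumed stand-ins for the RDTSC instruction and the "systime" system-call path:

#include <stdint.h>

uint64_t rdtsc(void);        // assumed: wraps the RDTSC instruction
uint64_t fallback_ns(void);  // assumed: the gettimeofday-based "systime" path

struct commpage {            // names follow the offsets defined above
	uint64_t nt_tsc_base;      // 0x50
	uint32_t nt_scale;         // 0x58
	uint32_t nt_shift;         // 0x5c (unused here)
	uint64_t nt_ns_base;       // 0x60
	uint32_t nt_generation;    // 0x68
	uint32_t gtod_generation;  // 0x6c
	uint64_t gtod_ns_base;     // 0x70
	uint64_t gtod_sec_base;    // 0x78
};

uint64_t
nanotime(volatile struct commpage *c)
{
	uint32_t g1, g2;
	uint64_t x, ns;

	for(;;) {
		g1 = c->gtod_generation;
		if(g1 == 0)
			return fallback_ns();  // comm page unusable: system call
		g2 = c->nt_generation;
		if(g2 == 0)
			continue;              // kernel mid-update: retry
		x = rdtsc() - c->nt_tsc_base;
		// Top 64 bits of the 96-bit product x*nt_scale, split into
		// two 32-bit multiplies exactly like the 386 assembly:
		ns = ((x & 0xffffffffu) * c->nt_scale >> 32) + (x >> 32) * c->nt_scale;
		ns += c->nt_ns_base;
		if(c->nt_generation != g2)
			continue;              // snapshot torn: retry
		ns -= c->gtod_ns_base;
		ns += c->gtod_sec_base * 1000000000ULL;
		if(c->gtod_generation != g1)
			continue;
		return ns;
	}
}

On 386 there is one extra guard: if cpu_capabilities marks a slow CPU (bit 0x4000), the comm-page math does not apply and the code goes straight to the system call. The amd64 time·now above then splits the nanosecond count without a DIVQ: multiplying by the precomputed reciprocal 1360296554856532783 and shifting yields sec = x/1000000000, after which nsec is recovered as x - sec*1000000000, i.e. x%1000000000.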
diff --git a/src/pkg/runtime/sys_freebsd_386.s b/src/pkg/runtime/sys_freebsd_386.s index a72d8972b..d5370267a 100644 --- a/src/pkg/runtime/sys_freebsd_386.s +++ b/src/pkg/runtime/sys_freebsd_386.s @@ -39,6 +39,7 @@ TEXT runtime·thr_start(SB),7,$0 MOVL AX, m(CX) CALL runtime·stackcheck(SB) // smashes AX CALL runtime·mstart(SB) + MOVL 0, AX // crash (not reached) // Exit the entire program (like C exit) @@ -89,7 +90,7 @@ TEXT runtime·mmap(SB),7,$32 MOVSL MOVSL MOVSL - MOVL $0, AX // top 64 bits of file offset + MOVL $0, AX // top 32 bits of file offset STOSL MOVL $477, AX INT $0x80 @@ -102,6 +103,12 @@ TEXT runtime·munmap(SB),7,$-4 MOVL $0xf1, 0xf1 // crash RET +TEXT runtime·madvise(SB),7,$-4 + MOVL $75, AX // madvise + INT $0x80 + // ignore failure - maybe pages are locked + RET + TEXT runtime·setitimer(SB), 7, $-4 MOVL $83, AX INT $0x80 @@ -109,40 +116,38 @@ TEXT runtime·setitimer(SB), 7, $-4 // func now() (sec int64, nsec int32) TEXT time·now(SB), 7, $32 - MOVL $116, AX + MOVL $232, AX LEAL 12(SP), BX - MOVL BX, 4(SP) - MOVL $0, 8(SP) + MOVL $0, 4(SP) + MOVL BX, 8(SP) INT $0x80 MOVL 12(SP), AX // sec - MOVL 16(SP), BX // usec + MOVL 16(SP), BX // nsec - // sec is in AX, usec in BX + // sec is in AX, nsec in BX MOVL AX, sec+0(FP) MOVL $0, sec+4(FP) - IMULL $1000, BX MOVL BX, nsec+8(FP) RET // int64 nanotime(void) so really // void nanotime(int64 *nsec) TEXT runtime·nanotime(SB), 7, $32 - MOVL $116, AX + MOVL $232, AX LEAL 12(SP), BX - MOVL BX, 4(SP) - MOVL $0, 8(SP) + MOVL $0, 4(SP) + MOVL BX, 8(SP) INT $0x80 MOVL 12(SP), AX // sec - MOVL 16(SP), BX // usec + MOVL 16(SP), BX // nsec - // sec is in AX, usec in BX + // sec is in AX, nsec in BX // convert to DX:AX nsec MOVL $1000000000, CX MULL CX - IMULL $1000, BX ADDL BX, AX ADCL $0, DX - + MOVL ret+0(FP), DI MOVL AX, 0(DI) MOVL DX, 4(DI) @@ -162,8 +167,11 @@ TEXT runtime·sigtramp(SB),7,$44 // check that m exists MOVL m(CX), BX CMPL BX, $0 - JNE 2(PC) + JNE 5(PC) + MOVL signo+0(FP), BX + MOVL BX, 0(SP) CALL runtime·badsignal(SB) + RET // save g MOVL g(CX), DI diff --git a/src/pkg/runtime/sys_freebsd_amd64.s b/src/pkg/runtime/sys_freebsd_amd64.s index 36e034a80..40c6237e2 100644 --- a/src/pkg/runtime/sys_freebsd_amd64.s +++ b/src/pkg/runtime/sys_freebsd_amd64.s @@ -38,8 +38,9 @@ TEXT runtime·thr_start(SB),7,$0 MOVQ m_g0(R13), DI MOVQ DI, g(CX) - CALL runtime·stackcheck(SB) - CALL runtime·mstart(SB) + CALL runtime·stackcheck(SB) + CALL runtime·mstart(SB) + MOVQ 0, AX // crash (not reached) // Exit the entire program (like C exit) @@ -94,31 +95,29 @@ TEXT runtime·setitimer(SB), 7, $-8 // func now() (sec int64, nsec int32) TEXT time·now(SB), 7, $32 - MOVL $116, AX - LEAQ 8(SP), DI - MOVQ $0, SI + MOVL $232, AX + MOVQ $0, DI + LEAQ 8(SP), SI SYSCALL MOVQ 8(SP), AX // sec - MOVL 16(SP), DX // usec + MOVQ 16(SP), DX // nsec - // sec is in AX, usec in DX + // sec is in AX, nsec in DX MOVQ AX, sec+0(FP) - IMULQ $1000, DX MOVL DX, nsec+8(FP) RET TEXT runtime·nanotime(SB), 7, $32 - MOVL $116, AX - LEAQ 8(SP), DI - MOVQ $0, SI + MOVL $232, AX + MOVQ $0, DI + LEAQ 8(SP), SI SYSCALL MOVQ 8(SP), AX // sec - MOVL 16(SP), DX // usec + MOVQ 16(SP), DX // nsec - // sec is in AX, usec in DX + // sec is in AX, nsec in DX // return nsec in AX IMULQ $1000000000, AX - IMULQ $1000, DX ADDQ DX, AX RET @@ -138,8 +137,10 @@ TEXT runtime·sigtramp(SB),7,$64 // check that m exists MOVQ m(BX), BP CMPQ BP, $0 - JNE 2(PC) + JNE 4(PC) + MOVQ DI, 0(SP) CALL runtime·badsignal(SB) + RET // save g MOVQ g(BX), R10 @@ -182,6 +183,15 @@ TEXT runtime·munmap(SB),7,$0 MOVL $0xf1, 0xf1 
// crash RET +TEXT runtime·madvise(SB),7,$0 + MOVQ 8(SP), DI + MOVQ 16(SP), SI + MOVQ 24(SP), DX + MOVQ $75, AX // madvise + SYSCALL + // ignore failure - maybe pages are locked + RET + TEXT runtime·sigaltstack(SB),7,$-8 MOVQ new+8(SP), DI MOVQ old+16(SP), SI diff --git a/src/pkg/runtime/sys_freebsd_arm.s b/src/pkg/runtime/sys_freebsd_arm.s new file mode 100644 index 000000000..77050e8d0 --- /dev/null +++ b/src/pkg/runtime/sys_freebsd_arm.s @@ -0,0 +1,260 @@ +// Copyright 2012 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. +// +// System calls and other sys.stuff for ARM, FreeBSD +// /usr/src/sys/kern/syscalls.master for syscall numbers. +// + +#include "zasm_GOOS_GOARCH.h" + +TEXT runtime·sys_umtx_op(SB),7,$0 + MOVW 0(FP), R0 + MOVW 4(FP), R1 + MOVW 8(FP), R2 + MOVW 12(FP), R3 + ADD $20, R13 // arg 5 is passed on stack + SWI $454 + SUB $20, R13 + // BCS error + RET + +TEXT runtime·thr_new(SB),7,$0 + MOVW 0(FP), R0 + MOVW 4(FP), R1 + SWI $455 + RET + +TEXT runtime·thr_start(SB),7,$0 + MOVW R0, R9 // m + + // TODO(minux): set up TLS? + + // set up g + MOVW m_g0(R9), R10 + BL runtime·emptyfunc(SB) // fault if stack check is wrong + BL runtime·mstart(SB) + + MOVW $2, R9 // crash (not reached) + MOVW R9, (R9) + RET + +// Exit the entire program (like C exit) +TEXT runtime·exit(SB),7,$-8 + MOVW 0(FP), R0 // arg 1 exit status + SWI $1 + MOVW.CS $0, R9 // crash on syscall failure + MOVW.CS R9, (R9) + RET + +TEXT runtime·exit1(SB),7,$-8 + MOVW 0(FP), R0 // arg 1 exit status + SWI $431 + MOVW.CS $0, R9 // crash on syscall failure + MOVW.CS R9, (R9) + RET + +TEXT runtime·write(SB),7,$-8 + MOVW 0(FP), R0 // arg 1 fd + MOVW 4(FP), R1 // arg 2 buf + MOVW 8(FP), R2 // arg 3 count + SWI $4 + RET + +TEXT runtime·getrlimit(SB),7,$-8 + MOVW 0(FP), R0 + MOVW 4(FP), R1 + MOVW 8(FP), R2 + SWI $194 + RET + +TEXT runtime·raisesigpipe(SB),7,$8 + // thr_self(&4(R13)) + MOVW $4(R13), R0 // arg 1 &4(R13) + SWI $432 + // thr_kill(self, SIGPIPE) + MOVW 4(R13), R0 // arg 1 id + MOVW $13, R1 // arg 2 SIGPIPE + SWI $433 + RET + +TEXT runtime·setitimer(SB), 7, $-8 + MOVW 0(FP), R0 + MOVW 4(FP), R1 + MOVW 8(FP), R2 + SWI $83 + RET + +// func now() (sec int64, nsec int32) +TEXT time·now(SB), 7, $32 + MOVW $0, R0 // CLOCK_REALTIME + MOVW $8(R13), R1 + SWI $232 // clock_gettime + + MOVW 8(R13), R0 // sec.low + MOVW 12(R13), R1 // sec.high + MOVW 16(R13), R2 // nsec + + MOVW R0, 0(FP) + MOVW R1, 4(FP) + MOVW R2, 8(FP) + RET + +// int64 nanotime(void) so really +// void nanotime(int64 *nsec) +TEXT runtime·nanotime(SB), 7, $32 + MOVW $0, R0 // CLOCK_REALTIME + MOVW $8(R13), R1 + SWI $232 // clock_gettime + + MOVW 8(R13), R0 // sec.low + MOVW 12(R13), R4 // sec.high + MOVW 16(R13), R2 // nsec + + MOVW $1000000000, R3 + MULLU R0, R3, (R1, R0) + MUL R3, R4 + ADD.S R2, R0 + ADC R4, R1 + + MOVW 0(FP), R3 + MOVW R0, 0(R3) + MOVW R1, 4(R3) + RET + +TEXT runtime·sigaction(SB),7,$-8 + MOVW 0(FP), R0 // arg 1 sig + MOVW 4(FP), R1 // arg 2 act + MOVW 8(FP), R2 // arg 3 oact + SWI $416 + MOVW.CS $0, R9 // crash on syscall failure + MOVW.CS R9, (R9) + RET + +TEXT runtime·sigtramp(SB),7,$24 + // this might be called in external code context, + // where g and m are not set. + // first save R0, because _cgo_load_gm will clobber it + // TODO(adonovan): call runtime·badsignal if m=0, like other platforms? 
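The TODO above refers to a check that every other sigtramp in this patch now performs before touching scheduler state. Schematically, in C (M, G, and the globals here are illustrative stand-ins for the runtime's per-thread registers; the real code is the assembly itself, and this sketch is not linkable as-is):

typedef struct M M;
typedef struct G G;
struct M { G *gsignal; };

extern M *m;   // nil when the signal lands on a thread Go does not own
extern G *g;

void badsignal(int signo);
void sighandler(int signo, void *info, void *ctx, G *gp);

void
sigtramp(int signo, void *info, void *ctx)
{
	G *oldg;

	if(m == 0) {
		badsignal(signo);  // now receives the signal number, then returns
		return;
	}
	oldg = g;
	g = m->gsignal;        // run the handler on the signal stack
	sighandler(signo, info, ctx, oldg);
	g = oldg;
}

That is why the JNE offsets change from 2(PC) to 4(PC) or 5(PC) throughout this patch: the extra instructions store the signal number for badsignal and return afterwards.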
+ MOVW R0, 4(R13) // signum + MOVW _cgo_load_gm(SB), R0 + CMP $0, R0 + BL.NE (R0) + + // save g + MOVW R10, R4 + MOVW R10, 20(R13) + + // g = m->signal + MOVW m_gsignal(R9), R10 + + // R0 is already saved + MOVW R1, 8(R13) // info + MOVW R2, 12(R13) // context + MOVW R4, 16(R13) // oldg + + BL runtime·sighandler(SB) + + // restore g + MOVW 20(R13), R10 + RET + +TEXT runtime·mmap(SB),7,$12 + MOVW 0(FP), R0 // arg 1 addr + MOVW 4(FP), R1 // arg 2 len + MOVW 8(FP), R2 // arg 3 prot + MOVW 12(FP), R3 // arg 4 flags + // arg 5 (fid) and arg6 (offset_lo, offset_hi) are passed on stack + // note the C runtime only passes the 32-bit offset_lo to us + MOVW 16(FP), R4 // arg 5 + MOVW R4, 4(R13) + MOVW 20(FP), R5 // arg 6 lower 32-bit + MOVW R5, 8(R13) + MOVW $0, R6 // higher 32-bit for arg 6 + MOVW R6, 12(R13) + ADD $4, R13 // pass arg 5 and arg 6 on stack + SWI $477 + SUB $4, R13 + RET + +TEXT runtime·munmap(SB),7,$0 + MOVW 0(FP), R0 // arg 1 addr + MOVW 4(FP), R1 // arg 2 len + SWI $73 + MOVW.CS $0, R9 // crash on syscall failure + MOVW.CS R9, (R9) + RET + +TEXT runtime·madvise(SB),7,$0 + MOVW 0(FP), R0 // arg 1 addr + MOVW 4(FP), R1 // arg 2 len + MOVW 8(FP), R2 // arg 3 flags + SWI $75 + // ignore failure - maybe pages are locked + RET + +TEXT runtime·sigaltstack(SB),7,$-8 + MOVW new+0(FP), R0 + MOVW old+4(FP), R1 + SWI $53 + MOVW.CS $0, R9 // crash on syscall failure + MOVW.CS R9, (R9) + RET + +TEXT runtime·usleep(SB),7,$16 + MOVW usec+0(FP), R0 + MOVW R0, R2 + MOVW $1000000, R1 + DIV R1, R0 + // 0(R13) is the saved LR, don't use it + MOVW R0, 4(R13) // tv_sec.low + MOVW $0, R0 + MOVW R0, 8(R13) // tv_sec.high + MOD R1, R2 + MOVW $1000, R1 + MUL R1, R2 + MOVW R2, 12(R13) // tv_nsec + + MOVW $4(R13), R0 // arg 1 - rqtp + MOVW $0, R1 // arg 2 - rmtp + SWI $240 // sys_nanosleep + RET + +TEXT runtime·sysctl(SB),7,$0 + MOVW 0(FP), R0 // arg 1 - name + MOVW 4(FP), R1 // arg 2 - namelen + MOVW 8(FP), R2 // arg 3 - oldp + MOVW 12(FP), R3 // arg 4 - oldlenp + // arg 5 (newp) and arg 6 (newlen) are passed on stack + ADD $20, R13 + SWI $202 // sys___sysctl + SUB.CS $0, R0, R0 + SUB $20, R13 + RET + +TEXT runtime·osyield(SB),7,$-4 + SWI $331 // sys_sched_yield + RET + +TEXT runtime·sigprocmask(SB),7,$0 + MOVW $3, R0 // arg 1 - how (SIG_SETMASK) + MOVW 0(FP), R1 // arg 2 - set + MOVW 4(FP), R2 // arg 3 - oset + SWI $340 // sys_sigprocmask + MOVW.CS $0, R9 // crash on syscall failure + MOVW.CS R9, (R9) + RET + +TEXT runtime·casp(SB),7,$0 + B runtime·cas(SB) + +// TODO(minux): this is only valid for ARMv6+ +// bool armcas(int32 *val, int32 old, int32 new) +// Atomically: +// if(*val == old){ +// *val = new; +// return 1; +// }else +// return 0; +TEXT runtime·cas(SB),7,$0 + B runtime·armcas(SB) diff --git a/src/pkg/runtime/sys_linux_386.s b/src/pkg/runtime/sys_linux_386.s index 602d9ddac..f27fd4713 100644 --- a/src/pkg/runtime/sys_linux_386.s +++ b/src/pkg/runtime/sys_linux_386.s @@ -104,40 +104,38 @@ TEXT runtime·mincore(SB),7,$0-24 // func now() (sec int64, nsec int32) TEXT time·now(SB), 7, $32 - MOVL $78, AX // syscall - gettimeofday - LEAL 8(SP), BX - MOVL $0, CX + MOVL $265, AX // syscall - clock_gettime + MOVL $0, BX + LEAL 8(SP), CX MOVL $0, DX CALL *runtime·_vdso(SB) MOVL 8(SP), AX // sec - MOVL 12(SP), BX // usec + MOVL 12(SP), BX // nsec - // sec is in AX, usec in BX + // sec is in AX, nsec in BX MOVL AX, sec+0(FP) MOVL $0, sec+4(FP) - IMULL $1000, BX MOVL BX, nsec+8(FP) RET // int64 nanotime(void) so really // void nanotime(int64 *nsec) TEXT runtime·nanotime(SB), 7, $32 - MOVL $78, AX // syscall - 
gettimeofday - LEAL 8(SP), BX - MOVL $0, CX + MOVL $265, AX // syscall - clock_gettime + MOVL $0, BX + LEAL 8(SP), CX MOVL $0, DX CALL *runtime·_vdso(SB) MOVL 8(SP), AX // sec - MOVL 12(SP), BX // usec + MOVL 12(SP), BX // nsec - // sec is in AX, usec in BX + // sec is in AX, nsec in BX // convert to DX:AX nsec MOVL $1000000000, CX MULL CX - IMULL $1000, BX ADDL BX, AX ADCL $0, DX - + MOVL ret+0(FP), DI MOVL AX, 0(DI) MOVL DX, 4(DI) @@ -170,8 +168,11 @@ TEXT runtime·sigtramp(SB),7,$44 // check that m exists MOVL m(CX), BX CMPL BX, $0 - JNE 2(PC) + JNE 5(PC) + MOVL sig+0(FP), BX + MOVL BX, 0(SP) CALL runtime·badsignal(SB) + RET // save g MOVL g(CX), DI @@ -240,9 +241,7 @@ TEXT runtime·madvise(SB),7,$0 MOVL 8(SP), CX MOVL 12(SP), DX CALL *runtime·_vdso(SB) - CMPL AX, $0xfffff001 - JLS 2(PC) - INT $3 + // ignore failure - maybe pages are locked RET // int32 futex(int32 *uaddr, int32 op, int32 val, @@ -258,7 +257,7 @@ TEXT runtime·futex(SB),7,$0 CALL *runtime·_vdso(SB) RET -// int32 clone(int32 flags, void *stack, M *m, G *g, void (*fn)(void)); +// int32 clone(int32 flags, void *stack, M *mp, G *gp, void (*fn)(void)); TEXT runtime·clone(SB),7,$0 MOVL $120, AX // clone MOVL flags+4(SP), BX @@ -266,7 +265,7 @@ TEXT runtime·clone(SB),7,$0 MOVL $0, DX // parent tid ptr MOVL $0, DI // child tid ptr - // Copy m, g, fn off parent stack for use by child. + // Copy mp, gp, fn off parent stack for use by child. SUBL $16, CX MOVL mm+12(SP), SI MOVL SI, 0(CX) @@ -423,3 +422,11 @@ TEXT runtime·osyield(SB),7,$0 MOVL $158, AX CALL *runtime·_vdso(SB) RET + +TEXT runtime·sched_getaffinity(SB),7,$0 + MOVL $242, AX // syscall - sched_getaffinity + MOVL 4(SP), BX + MOVL 8(SP), CX + MOVL 12(SP), DX + CALL *runtime·_vdso(SB) + RET diff --git a/src/pkg/runtime/sys_linux_amd64.s b/src/pkg/runtime/sys_linux_amd64.s index 657ab7e0b..e45943758 100644 --- a/src/pkg/runtime/sys_linux_amd64.s +++ b/src/pkg/runtime/sys_linux_amd64.s @@ -101,32 +101,61 @@ TEXT runtime·mincore(SB),7,$0-24 RET // func now() (sec int64, nsec int32) -TEXT time·now(SB), 7, $32 - LEAQ 8(SP), DI - MOVQ $0, SI - MOVQ $0xffffffffff600000, AX +TEXT time·now(SB),7,$16 + // Be careful. We're calling a function with gcc calling convention here. + // We're guaranteed 128 bytes on entry, and we've taken 16, and the + // call uses another 8. + // That leaves 104 for the gettime code to use. Hope that's enough! + MOVQ runtime·__vdso_clock_gettime_sym(SB), AX + CMPQ AX, $0 + JEQ fallback_gtod + MOVL $0, DI // CLOCK_REALTIME + LEAQ 0(SP), SI CALL AX - MOVQ 8(SP), AX // sec - MOVL 16(SP), DX // usec - - // sec is in AX, usec in DX + MOVQ 0(SP), AX // sec + MOVQ 8(SP), DX // nsec MOVQ AX, sec+0(FP) - IMULQ $1000, DX MOVL DX, nsec+8(FP) RET - -TEXT runtime·nanotime(SB), 7, $32 - LEAQ 8(SP), DI +fallback_gtod: + LEAQ 0(SP), DI MOVQ $0, SI - MOVQ $0xffffffffff600000, AX + MOVQ runtime·__vdso_gettimeofday_sym(SB), AX CALL AX - MOVQ 8(SP), AX // sec - MOVL 16(SP), DX // usec + MOVQ 0(SP), AX // sec + MOVL 8(SP), DX // usec + IMULQ $1000, DX + MOVQ AX, sec+0(FP) + MOVL DX, nsec+8(FP) + RET - // sec is in AX, usec in DX +TEXT runtime·nanotime(SB),7,$16 + // Duplicate time.now here to avoid using up precious stack space. + // See comment above in time.now. 
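The linux/amd64 change above stops calling the legacy vsyscall page at $0xffffffffff600000 and instead calls through function pointers resolved from the vDSO, falling back to gettimeofday when __vdso_clock_gettime is absent. Roughly, in C (the pointer resolution happens elsewhere in the runtime and is assumed here):

#include <stddef.h>
#include <stdint.h>
#include <time.h>
#include <sys/time.h>

// Assumed already resolved from the vDSO (cf. runtime·__vdso_*_sym above).
static int (*vdso_clock_gettime)(clockid_t, struct timespec *);
static int (*vdso_gettimeofday)(struct timeval *, void *);

int64_t
nanotime(void)
{
	struct timespec ts;
	struct timeval tv;

	if(vdso_clock_gettime != NULL) {
		vdso_clock_gettime(CLOCK_REALTIME, &ts);
		return ts.tv_sec*1000000000LL + ts.tv_nsec;
	}
	// gettimeofday reports microseconds, hence the extra *1000
	// (the IMULQ $1000 in fallback_gtod above).
	vdso_gettimeofday(&tv, NULL);
	return tv.tv_sec*1000000000LL + tv.tv_usec*1000LL;
}

The parallel 386 change above has the same effect through syscall 265 (clock_gettime): nanoseconds come back directly, so the old IMULL $1000 scaling of a microsecond value disappears.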
+ MOVQ runtime·__vdso_clock_gettime_sym(SB), AX + CMPQ AX, $0 + JEQ fallback_gtod_nt + MOVL $0, DI // CLOCK_REALTIME + LEAQ 0(SP), SI + CALL AX + MOVQ 0(SP), AX // sec + MOVQ 8(SP), DX // nsec + // sec is in AX, nsec in DX // return nsec in AX IMULQ $1000000000, AX + ADDQ DX, AX + RET +fallback_gtod_nt: + LEAQ 0(SP), DI + MOVQ $0, SI + MOVQ runtime·__vdso_gettimeofday_sym(SB), AX + CALL AX + MOVQ 0(SP), AX // sec + MOVL 8(SP), DX // usec IMULQ $1000, DX + // sec is in AX, nsec in DX + // return nsec in AX + IMULQ $1000000000, AX ADDQ DX, AX RET @@ -157,8 +186,10 @@ TEXT runtime·sigtramp(SB),7,$64 // check that m exists MOVQ m(BX), BP CMPQ BP, $0 - JNE 2(PC) + JNE 4(PC) + MOVQ DI, 0(SP) CALL runtime·badsignal(SB) + RET // save g MOVQ g(BX), R10 @@ -219,9 +250,7 @@ TEXT runtime·madvise(SB),7,$0 MOVQ 24(SP), DX MOVQ $28, AX // madvise SYSCALL - CMPQ AX, $0xfffffffffffff001 - JLS 2(PC) - MOVL $0xf1, 0xf1 // crash + // ignore failure - maybe pages are locked RET // int64 futex(int32 *uaddr, int32 op, int32 val, @@ -237,12 +266,12 @@ TEXT runtime·futex(SB),7,$0 SYSCALL RET -// int64 clone(int32 flags, void *stack, M *m, G *g, void (*fn)(void)); +// int64 clone(int32 flags, void *stack, M *mp, G *gp, void (*fn)(void)); TEXT runtime·clone(SB),7,$0 MOVL flags+8(SP), DI MOVQ stack+16(SP), SI - // Copy m, g, fn off parent stack for use by child. + // Copy mp, gp, fn off parent stack for use by child. // Careful: Linux system call clobbers CX and R11. MOVQ mm+24(SP), R8 MOVQ gg+32(SP), R9 @@ -310,3 +339,11 @@ TEXT runtime·osyield(SB),7,$0 MOVL $24, AX SYSCALL RET + +TEXT runtime·sched_getaffinity(SB),7,$0 + MOVQ 8(SP), DI + MOVL 16(SP), SI + MOVQ 24(SP), DX + MOVL $204, AX // syscall entry + SYSCALL + RET diff --git a/src/pkg/runtime/sys_linux_arm.s b/src/pkg/runtime/sys_linux_arm.s index 03e173d26..8bae2933f 100644 --- a/src/pkg/runtime/sys_linux_arm.s +++ b/src/pkg/runtime/sys_linux_arm.s @@ -34,9 +34,10 @@ #define SYS_sched_yield (SYS_BASE + 158) #define SYS_select (SYS_BASE + 142) // newselect #define SYS_ugetrlimit (SYS_BASE + 191) +#define SYS_sched_getaffinity (SYS_BASE + 242) +#define SYS_clock_gettime (SYS_BASE + 263) #define ARM_BASE (SYS_BASE + 0x0f0000) -#define SYS_ARM_cacheflush (ARM_BASE + 2) TEXT runtime·open(SB),7,$0 MOVW 0(FP), R0 @@ -131,10 +132,7 @@ TEXT runtime·madvise(SB),7,$0 MOVW 8(FP), R2 MOVW $SYS_madvise, R7 SWI $0 - MOVW $0xfffff001, R6 - CMP R6, R0 - MOVW.HI $0, R9 // crash on syscall failure - MOVW.HI R9, (R9) + // ignore failure - maybe pages are locked RET TEXT runtime·setitimer(SB),7,$0 @@ -154,41 +152,37 @@ TEXT runtime·mincore(SB),7,$0 RET TEXT time·now(SB), 7, $32 - MOVW $8(R13), R0 // timeval - MOVW $0, R1 // zone - MOVW $SYS_gettimeofday, R7 + MOVW $0, R0 // CLOCK_REALTIME + MOVW $8(R13), R1 // timespec + MOVW $SYS_clock_gettime, R7 SWI $0 MOVW 8(R13), R0 // sec - MOVW 12(R13), R2 // usec + MOVW 12(R13), R2 // nsec MOVW R0, 0(FP) MOVW $0, R1 MOVW R1, 4(FP) - MOVW $1000, R3 - MUL R3, R2 MOVW R2, 8(FP) RET // int64 nanotime(void) so really // void nanotime(int64 *nsec) TEXT runtime·nanotime(SB),7,$32 - MOVW $8(R13), R0 // timeval - MOVW $0, R1 // zone - MOVW $SYS_gettimeofday, R7 + MOVW $0, R0 // CLOCK_REALTIME + MOVW $8(R13), R1 // timespec + MOVW $SYS_clock_gettime, R7 SWI $0 MOVW 8(R13), R0 // sec - MOVW 12(R13), R2 // usec + MOVW 12(R13), R2 // nsec MOVW $1000000000, R3 MULLU R0, R3, (R1, R0) - MOVW $1000, R3 MOVW $0, R4 - MUL R3, R2 ADD.S R2, R0 ADC R4, R1 - + MOVW 0(FP), R3 MOVW R0, 0(R3) MOVW R1, 4(R3) @@ -208,7 +202,7 @@ TEXT runtime·futex(SB),7,$0 RET -// 
int32 clone(int32 flags, void *stack, M *m, G *g, void (*fn)(void)); +// int32 clone(int32 flags, void *stack, M *mp, G *gp, void (*fn)(void)); TEXT runtime·clone(SB),7,$0 MOVW flags+0(FP), R0 MOVW stack+4(FP), R1 @@ -217,7 +211,7 @@ TEXT runtime·clone(SB),7,$0 MOVW $0, R4 // child tid ptr MOVW $0, R5 - // Copy m, g, fn off parent stack for use by child. + // Copy mp, gp, fn off parent stack for use by child. // TODO(kaib): figure out which registers are clobbered by clone and avoid stack copying MOVW $-16(R1), R1 MOVW mm+8(FP), R6 @@ -272,15 +266,6 @@ TEXT runtime·clone(SB),7,$0 MOVW $1005, R1 MOVW R0, (R1) - -TEXT runtime·cacheflush(SB),7,$0 - MOVW 0(FP), R0 - MOVW 4(FP), R1 - MOVW $0, R2 - MOVW $SYS_ARM_cacheflush, R7 - SWI $0 - RET - TEXT runtime·sigaltstack(SB),7,$0 MOVW 0(FP), R0 MOVW 4(FP), R1 @@ -293,6 +278,15 @@ TEXT runtime·sigaltstack(SB),7,$0 RET TEXT runtime·sigtramp(SB),7,$24 + // this might be called in external code context, + // where g and m are not set. + // first save R0, because _cgo_load_gm will clobber it + // TODO(adonovan): call runtime·badsignal if m=0, like other platforms? + MOVW R0, 4(R13) + MOVW _cgo_load_gm(SB), R0 + CMP $0, R0 + BL.NE (R0) + // save g MOVW g, R3 MOVW g, 20(R13) @@ -301,7 +295,7 @@ TEXT runtime·sigtramp(SB),7,$24 MOVW m_gsignal(m), g // copy arguments for call to sighandler - MOVW R0, 4(R13) + // R0 is already saved above MOVW R1, 8(R13) MOVW R2, 12(R13) MOVW R3, 16(R13) @@ -385,3 +379,11 @@ TEXT runtime·osyield(SB),7,$0 MOVW $SYS_sched_yield, R7 SWI $0 RET + +TEXT runtime·sched_getaffinity(SB),7,$0 + MOVW 0(FP), R0 + MOVW 4(FP), R1 + MOVW 8(FP), R2 + MOVW $SYS_sched_getaffinity, R7 + SWI $0 + RET diff --git a/src/pkg/runtime/sys_netbsd_386.s b/src/pkg/runtime/sys_netbsd_386.s index 11f8c7aaa..3d3d31273 100644 --- a/src/pkg/runtime/sys_netbsd_386.s +++ b/src/pkg/runtime/sys_netbsd_386.s @@ -12,14 +12,14 @@ TEXT runtime·exit(SB),7,$-4 MOVL $1, AX INT $0x80 - MOVL $0xf1, 0xf1 // crash + MOVL $0xf1, 0xf1 // crash RET TEXT runtime·exit1(SB),7,$-4 - MOVL $302, AX // sys_threxit + MOVL $310, AX // sys__lwp_exit INT $0x80 JAE 2(PC) - MOVL $0xf1, 0xf1 // crash + MOVL $0xf1, 0xf1 // crash RET TEXT runtime·write(SB),7,$-4 @@ -27,31 +27,32 @@ TEXT runtime·write(SB),7,$-4 INT $0x80 RET -TEXT runtime·usleep(SB),7,$20 +TEXT runtime·usleep(SB),7,$24 MOVL $0, DX MOVL usec+0(FP), AX MOVL $1000000, CX DIVL CX - MOVL AX, 12(SP) // tv_sec + MOVL AX, 12(SP) // tv_sec - l32 + MOVL $0, 16(SP) // tv_sec - h32 MOVL $1000, AX MULL DX - MOVL AX, 16(SP) // tv_nsec + MOVL AX, 20(SP) // tv_nsec MOVL $0, 0(SP) LEAL 12(SP), AX MOVL AX, 4(SP) // arg 1 - rqtp MOVL $0, 8(SP) // arg 2 - rmtp - MOVL $240, AX // sys_nanosleep + MOVL $430, AX // sys_nanosleep INT $0x80 RET TEXT runtime·raisesigpipe(SB),7,$12 - MOVL $299, AX // sys_getthrid + MOVL $311, AX // sys__lwp_self INT $0x80 MOVL $0, 0(SP) - MOVL AX, 4(SP) // arg 1 - pid - MOVL $13, 8(SP) // arg 2 - signum == SIGPIPE - MOVL $37, AX // sys_kill + MOVL AX, 4(SP) // arg 1 - target + MOVL $13, 8(SP) // arg 2 - signo == SIGPIPE + MOVL $318, AX // sys__lwp_kill INT $0x80 RET @@ -67,7 +68,7 @@ TEXT runtime·mmap(SB),7,$36 MOVL $0, AX STOSL // arg 6 - pad MOVSL // arg 7 - offset - MOVL $0, AX // top 64 bits of file offset + MOVL $0, AX // top 32 bits of file offset STOSL MOVL $197, AX // sys_mmap INT $0x80 @@ -79,60 +80,100 @@ TEXT runtime·munmap(SB),7,$-4 MOVL $73, AX // sys_munmap INT $0x80 JAE 2(PC) - MOVL $0xf1, 0xf1 // crash + MOVL $0xf1, 0xf1 // crash + RET + +TEXT runtime·madvise(SB),7,$-4 + MOVL $75, AX // sys_madvise + INT 
$0x80 + // ignore failure - maybe pages are locked RET TEXT runtime·setitimer(SB),7,$-4 - MOVL $83, AX + MOVL $425, AX // sys_setitimer INT $0x80 RET // func now() (sec int64, nsec int32) TEXT time·now(SB), 7, $32 - MOVL $116, AX LEAL 12(SP), BX - MOVL BX, 4(SP) - MOVL $0, 8(SP) + MOVL $0, 4(SP) // arg 1 - clock_id + MOVL BX, 8(SP) // arg 2 - tp + MOVL $427, AX // sys_clock_gettime INT $0x80 - MOVL 12(SP), AX // sec - MOVL 16(SP), BX // usec - // sec is in AX, usec in BX + MOVL 12(SP), AX // sec - l32 MOVL AX, sec+0(FP) - MOVL $0, sec+4(FP) - IMULL $1000, BX + MOVL 16(SP), AX // sec - h32 + MOVL AX, sec+4(FP) + + MOVL 20(SP), BX // nsec MOVL BX, nsec+8(FP) RET // int64 nanotime(void) so really // void nanotime(int64 *nsec) TEXT runtime·nanotime(SB),7,$32 - MOVL $116, AX LEAL 12(SP), BX - MOVL BX, 4(SP) - MOVL $0, 8(SP) + MOVL $0, 4(SP) // arg 1 - clock_id + MOVL BX, 8(SP) // arg 2 - tp + MOVL $427, AX // sys_clock_gettime INT $0x80 - MOVL 12(SP), AX // sec - MOVL 16(SP), BX // usec - - // sec is in AX, usec in BX - // convert to DX:AX nsec - MOVL $1000000000, CX - MULL CX - IMULL $1000, BX + + MOVL 16(SP), CX // sec - h32 + IMULL $1000000000, CX + + MOVL 12(SP), AX // sec - l32 + MOVL $1000000000, BX + MULL BX // result in dx:ax + + MOVL 20(SP), BX // nsec ADDL BX, AX - ADCL $0, DX - + ADCL CX, DX // add high bits with carry + MOVL ret+0(FP), DI MOVL AX, 0(DI) MOVL DX, 4(DI) RET -TEXT runtime·sigaction(SB),7,$-4 - MOVL $46, AX // sys_sigaction +TEXT runtime·getcontext(SB),7,$-4 + MOVL $307, AX // sys_getcontext + INT $0x80 + JAE 2(PC) + MOVL $0xf1, 0xf1 // crash + RET + +TEXT runtime·sigprocmask(SB),7,$-4 + MOVL $293, AX // sys_sigprocmask + INT $0x80 + JAE 2(PC) + MOVL $0xf1, 0xf1 // crash + RET + +TEXT runtime·sigreturn_tramp(SB),7,$0 + LEAL 140(SP), AX // Load address of ucontext + MOVL AX, 4(SP) + MOVL $308, AX // sys_setcontext + INT $0x80 + MOVL $-1, 4(SP) // Something failed... + MOVL $1, AX // sys_exit + INT $0x80 + +TEXT runtime·sigaction(SB),7,$24 + LEAL arg0+0(FP), SI + LEAL 4(SP), DI + CLD + MOVSL // arg 1 - sig + MOVSL // arg 2 - act + MOVSL // arg 3 - oact + LEAL runtime·sigreturn_tramp(SB), AX + STOSL // arg 4 - tramp + MOVL $2, AX + STOSL // arg 5 - vers + MOVL $340, AX // sys___sigaction_sigtramp INT $0x80 JAE 2(PC) - MOVL $0xf1, 0xf1 // crash + MOVL $0xf1, 0xf1 // crash RET TEXT runtime·sigtramp(SB),7,$44 @@ -141,13 +182,16 @@ TEXT runtime·sigtramp(SB),7,$44 // check that m exists MOVL m(CX), BX CMPL BX, $0 - JNE 2(PC) + JNE 5(PC) + MOVL signo+0(FP), BX + MOVL BX, 0(SP) CALL runtime·badsignal(SB) + RET // save g MOVL g(CX), DI MOVL DI, 20(SP) - + // g = m->gsignal MOVL m_gsignal(BX), BX MOVL BX, g(CX) @@ -167,67 +211,24 @@ TEXT runtime·sigtramp(SB),7,$44 get_tls(CX) MOVL 20(SP), BX MOVL BX, g(CX) - - // call sigreturn - MOVL context+8(FP), AX - MOVL $0, 0(SP) // syscall gap - MOVL AX, 4(SP) // arg 1 - sigcontext - MOVL $103, AX // sys_sigreturn - INT $0x80 - MOVL $0xf1, 0xf1 // crash RET -// int32 rfork_thread(int32 flags, void *stack, M *m, G *g, void (*fn)(void)); -TEXT runtime·rfork_thread(SB),7,$8 - MOVL flags+8(SP), AX - MOVL stack+12(SP), CX - - // Copy m, g, fn off parent stack for use by child. 
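The NetBSD 386 nanotime above now handles a genuinely 64-bit tv_sec, so sec*1e9 is computed in two halves: MULL forms the full 64-bit product of the low word in DX:AX, IMULL multiplies the high word (keeping only its low 32 bits), and ADDL/ADCL fold in the nanoseconds with carry. The same arithmetic in C:

#include <stdint.h>

// sec*1000000000 + nsec using only 32x32-bit multiplies, as the
// 386 assembly above does (sketch).
uint64_t
to_ns(uint32_t sec_lo, uint32_t sec_hi, uint32_t nsec)
{
	uint64_t lo = (uint64_t)sec_lo * 1000000000u;  // MULL: full 64-bit product
	uint32_t hi = sec_hi * 1000000000u;            // IMULL: low 32 bits suffice
	return lo + ((uint64_t)hi << 32) + nsec;       // ADDL/ADCL pair
}

Bits above 2^64 are simply discarded; a 64-bit nanosecond count does not overflow until roughly the year 2554.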
- SUBL $16, CX - MOVL mm+16(SP), SI - MOVL SI, 0(CX) - MOVL gg+20(SP), SI - MOVL SI, 4(CX) - MOVL fn+24(SP), SI - MOVL SI, 8(CX) - MOVL $1234, 12(CX) - MOVL CX, SI - - MOVL $0, 0(SP) // syscall gap - MOVL AX, 4(SP) // arg 1 - flags - MOVL $251, AX // sys_rfork +// int32 lwp_create(void *context, uintptr flags, void *lwpid); +TEXT runtime·lwp_create(SB),7,$16 + MOVL $0, 0(SP) + MOVL context+0(FP), AX + MOVL AX, 4(SP) // arg 1 - context + MOVL flags+4(FP), AX + MOVL AX, 8(SP) // arg 2 - flags + MOVL lwpid+8(FP), AX + MOVL AX, 12(SP) // arg 3 - lwpid + MOVL $309, AX // sys__lwp_create INT $0x80 - - // Return if rfork syscall failed - JCC 4(PC) + JCC 2(PC) NEGL AX - MOVL AX, 48(SP) RET - // In parent, return. - CMPL AX, $0 - JEQ 3(PC) - MOVL AX, 48(SP) - RET - - // In child, on new stack. - MOVL SI, SP - - // Paranoia: check that SP is as we expect. - MOVL 12(SP), BP - CMPL BP, $1234 - JEQ 2(PC) - INT $3 - - // Reload registers - MOVL 0(SP), BX // m - MOVL 4(SP), DX // g - MOVL 8(SP), SI // fn - - // Initialize m->procid to thread ID - MOVL $299, AX // sys_getthrid - INT $0x80 - MOVL AX, m_procid(BX) +TEXT runtime·lwp_tramp(SB),7,$0 // Set FS to point at m->tls LEAL m_tls(BX), BP @@ -236,7 +237,7 @@ TEXT runtime·rfork_thread(SB),7,$8 CALL runtime·settls(SB) POPL AX POPAL - + // Now segment is established. Initialize m, g. get_tls(AX) MOVL DX, g(AX) @@ -259,7 +260,7 @@ TEXT runtime·rfork_thread(SB),7,$8 RET TEXT runtime·sigaltstack(SB),7,$-8 - MOVL $288, AX // sys_sigaltstack + MOVL $281, AX // sys___sigaltstack14 MOVL new+4(SP), BX MOVL old+8(SP), CX INT $0x80 @@ -277,30 +278,33 @@ TEXT runtime·setldt(SB),7,$8 TEXT runtime·settls(SB),7,$16 // adjust for ELF: wants to use -8(GS) and -4(GS) for g and m - MOVL 20(SP), CX + MOVL base+0(FP), CX ADDL $8, CX - MOVL CX, 0(CX) MOVL $0, 0(SP) // syscall gap - MOVL $9, 4(SP) // I386_SET_GSBASE (machine/sysarch.h) - MOVL CX, 8(SP) // pointer to base - MOVL $165, AX // sys_sysarch + MOVL CX, 4(SP) // arg 1 - ptr + MOVL $317, AX // sys__lwp_setprivate INT $0x80 JCC 2(PC) - MOVL $0xf1, 0xf1 // crash + MOVL $0xf1, 0xf1 // crash RET TEXT runtime·osyield(SB),7,$-4 - MOVL $298, AX // sys_sched_yield + MOVL $350, AX // sys_sched_yield + INT $0x80 + RET + +TEXT runtime·lwp_park(SB),7,$-4 + MOVL $434, AX // sys__lwp_park INT $0x80 RET -TEXT runtime·thrsleep(SB),7,$-4 - MOVL $300, AX // sys_thrsleep +TEXT runtime·lwp_unpark(SB),7,$-4 + MOVL $321, AX // sys__lwp_unpark INT $0x80 RET -TEXT runtime·thrwakeup(SB),7,$-4 - MOVL $301, AX // sys_thrwakeup +TEXT runtime·lwp_self(SB),7,$-4 + MOVL $311, AX // sys__lwp_self INT $0x80 RET diff --git a/src/pkg/runtime/sys_netbsd_amd64.s b/src/pkg/runtime/sys_netbsd_amd64.s index 0b83cd4d8..e73e83ded 100644 --- a/src/pkg/runtime/sys_netbsd_amd64.s +++ b/src/pkg/runtime/sys_netbsd_amd64.s @@ -8,42 +8,24 @@ #include "zasm_GOOS_GOARCH.h" -// int64 rfork_thread(int32 flags, void *stack, M *m, G *g, void (*fn)(void)); -TEXT runtime·rfork_thread(SB),7,$0 - MOVL flags+8(SP), DI - MOVQ stack+16(SP), SI - - // Copy m, g, fn off parent stack for use by child. - MOVQ mm+24(SP), R8 - MOVQ gg+32(SP), R9 - MOVQ fn+40(SP), R12 - - MOVL $251, AX // sys_rfork +// int32 lwp_create(void *context, uintptr flags, void *lwpid) +TEXT runtime·lwp_create(SB),7,$0 + MOVQ context+0(FP), DI + MOVQ flags+8(FP), SI + MOVQ lwpid+16(FP), DX + MOVL $309, AX // sys__lwp_create SYSCALL - - // Return if rfork syscall failed - JCC 3(PC) + JCC 2(PC) NEGL AX RET - // In parent, return. - CMPL AX, $0 - JEQ 2(PC) - RET - - // In child, on new stack. 
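The deleted rfork_thread/getthrid path is replaced by NetBSD's ucontext-based LWP interface: the parent prepares a context, sys__lwp_create starts the thread on it, and lwp_tramp finishes the setup (TLS via _lwp_setprivate, then m, g, and the thread function). A toy user-level equivalent, assuming NetBSD's _lwp_makecontext(3) helper behaves as documented (not the runtime's code; the runtime seeds the context's register slots by hand in its C parts, which this diff does not show):

#include <stdio.h>
#include <ucontext.h>
#include <lwp.h>

static char stk[64*1024];

static void
start(void *arg)
{
	// Toy only: a real program would pass a TLS block as the
	// "private" argument below before calling libc freely.
	printf("hello from the new lwp\n");
	_lwp_exit();
}

int
main(void)
{
	ucontext_t uc;
	lwpid_t lid;

	getcontext(&uc);                     // sys_getcontext, as above
	_lwp_makecontext(&uc, start, NULL, NULL, stk, sizeof stk);
	_lwp_create(&uc, 0, &lid);           // sys__lwp_create
	_lwp_wait(lid, NULL);                // reap the LWP
	return 0;
}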
- MOVQ SI, SP - - // Initialize m->procid to thread ID - MOVL $299, AX // sys_getthrid - SYSCALL - MOVQ AX, m_procid(R8) - +TEXT runtime·lwp_tramp(SB),7,$0 + // Set FS to point at m->tls. LEAQ m_tls(R8), DI CALL runtime·settls(SB) - // In child, set up new stack + // Set up new stack. get_tls(CX) MOVQ R8, m(CX) MOVQ R9, g(CX) @@ -52,29 +34,34 @@ TEXT runtime·rfork_thread(SB),7,$0 // Call fn CALL R12 - // It shouldn't return. If it does, exit - MOVL $302, AX // sys_threxit + // It shouldn't return. If it does, exit. + MOVL $310, AX // sys__lwp_exit SYSCALL JMP -3(PC) // keep exiting TEXT runtime·osyield(SB),7,$0 - MOVL $298, AX // sys_sched_yield + MOVL $350, AX // sys_sched_yield SYSCALL RET -TEXT runtime·thrsleep(SB),7,$0 - MOVQ 8(SP), DI // arg 1 - ident - MOVL 16(SP), SI // arg 2 - clock_id - MOVQ 24(SP), DX // arg 3 - tp - MOVQ 32(SP), R10 // arg 4 - lock - MOVL $300, AX // sys_thrsleep +TEXT runtime·lwp_park(SB),7,$0 + MOVQ 8(SP), DI // arg 1 - abstime + MOVL 16(SP), SI // arg 2 - unpark + MOVQ 24(SP), DX // arg 3 - hint + MOVQ 32(SP), R10 // arg 4 - unparkhint + MOVL $434, AX // sys__lwp_park SYSCALL RET -TEXT runtime·thrwakeup(SB),7,$0 - MOVQ 8(SP), DI // arg 1 - ident - MOVL 16(SP), SI // arg 2 - n - MOVL $301, AX // sys_thrwakeup +TEXT runtime·lwp_unpark(SB),7,$0 + MOVQ 8(SP), DI // arg 1 - lwp + MOVL 16(SP), SI // arg 2 - hint + MOVL $321, AX // sys__lwp_unpark + SYSCALL + RET + +TEXT runtime·lwp_self(SB),7,$0 + MOVL $311, AX // sys__lwp_self SYSCALL RET @@ -83,13 +70,13 @@ TEXT runtime·exit(SB),7,$-8 MOVL 8(SP), DI // arg 1 - exit status MOVL $1, AX // sys_exit SYSCALL - MOVL $0xf1, 0xf1 // crash + MOVL $0xf1, 0xf1 // crash RET TEXT runtime·exit1(SB),7,$-8 - MOVL $302, AX // sys_threxit + MOVL $310, AX // sys__lwp_exit SYSCALL - MOVL $0xf1, 0xf1 // crash + MOVL $0xf1, 0xf1 // crash RET TEXT runtime·write(SB),7,$-8 @@ -112,16 +99,16 @@ TEXT runtime·usleep(SB),7,$16 MOVQ SP, DI // arg 1 - rqtp MOVQ $0, SI // arg 2 - rmtp - MOVL $240, AX // sys_nanosleep + MOVL $430, AX // sys_nanosleep SYSCALL RET TEXT runtime·raisesigpipe(SB),7,$16 - MOVL $299, AX // sys_getthrid + MOVL $311, AX // sys__lwp_self SYSCALL - MOVQ AX, DI // arg 1 - pid - MOVQ $13, SI // arg 2 - signum == SIGPIPE - MOVL $37, AX // sys_kill + MOVQ AX, DI // arg 1 - target + MOVQ $13, SI // arg 2 - signo == SIGPIPE + MOVL $318, AX // sys__lwp_kill SYSCALL RET @@ -129,72 +116,101 @@ TEXT runtime·setitimer(SB),7,$-8 MOVL 8(SP), DI // arg 1 - which MOVQ 16(SP), SI // arg 2 - itv MOVQ 24(SP), DX // arg 3 - oitv - MOVL $83, AX // sys_setitimer + MOVL $425, AX // sys_setitimer SYSCALL RET // func now() (sec int64, nsec int32) TEXT time·now(SB), 7, $32 - LEAQ 8(SP), DI // arg 1 - tp - MOVQ $0, SI // arg 2 - tzp - MOVL $116, AX // sys_gettimeofday + MOVQ $0, DI // arg 1 - clock_id + LEAQ 8(SP), SI // arg 2 - tp + MOVL $427, AX // sys_clock_gettime SYSCALL MOVQ 8(SP), AX // sec - MOVL 16(SP), DX // usec + MOVL 16(SP), DX // nsec - // sec is in AX, usec in DX + // sec is in AX, nsec in DX MOVQ AX, sec+0(FP) - IMULQ $1000, DX MOVL DX, nsec+8(FP) RET TEXT runtime·nanotime(SB),7,$32 - LEAQ 8(SP), DI // arg 1 - tp - MOVQ $0, SI // arg 2 - tzp - MOVL $116, AX // sys_gettimeofday + MOVQ $0, DI // arg 1 - clock_id + LEAQ 8(SP), SI // arg 2 - tp + MOVL $427, AX // sys_clock_gettime SYSCALL MOVQ 8(SP), AX // sec - MOVL 16(SP), DX // usec + MOVL 16(SP), DX // nsec - // sec is in AX, usec in DX + // sec is in AX, nsec in DX // return nsec in AX IMULQ $1000000000, AX - IMULQ $1000, DX ADDQ DX, AX RET +TEXT runtime·getcontext(SB),7,$-8 + MOVQ 
8(SP), DI // arg 1 - context + MOVL $307, AX // sys_getcontext + SYSCALL + JCC 2(PC) + MOVL $0xf1, 0xf1 // crash + RET + +TEXT runtime·sigprocmask(SB),7,$0 + MOVL 8(SP), DI // arg 1 - how + MOVQ 16(SP), SI // arg 2 - set + MOVQ 24(SP), DX // arg 3 - oset + MOVL $293, AX // sys_sigprocmask + SYSCALL + JCC 2(PC) + MOVL $0xf1, 0xf1 // crash + RET + +TEXT runtime·sigreturn_tramp(SB),7,$-8 + MOVQ R15, DI // Load address of ucontext + MOVQ $308, AX // sys_setcontext + SYSCALL + MOVQ $-1, DI // Something failed... + MOVL $1, AX // sys_exit + SYSCALL + TEXT runtime·sigaction(SB),7,$-8 MOVL 8(SP), DI // arg 1 - signum MOVQ 16(SP), SI // arg 2 - nsa MOVQ 24(SP), DX // arg 3 - osa - MOVL $46, AX + // arg 4 - tramp + LEAQ runtime·sigreturn_tramp(SB), R10 + MOVQ $2, R8 // arg 5 - vers + MOVL $340, AX // sys___sigaction_sigtramp SYSCALL JCC 2(PC) - MOVL $0xf1, 0xf1 // crash + MOVL $0xf1, 0xf1 // crash RET TEXT runtime·sigtramp(SB),7,$64 get_tls(BX) - + // check that m exists MOVQ m(BX), BP CMPQ BP, $0 - JNE 2(PC) + JNE 4(PC) + MOVQ DI, 0(SP) CALL runtime·badsignal(SB) + RET // save g MOVQ g(BX), R10 MOVQ R10, 40(SP) - + // g = m->signal MOVQ m_gsignal(BP), BP MOVQ BP, g(BX) - + MOVQ DI, 0(SP) MOVQ SI, 8(SP) MOVQ DX, 16(SP) MOVQ R10, 24(SP) - + CALL runtime·sighandler(SB) // restore g @@ -213,7 +229,7 @@ TEXT runtime·mmap(SB),7,$0 SUBQ $16, SP MOVQ R9, 8(SP) // arg 7 - offset (passed on stack) MOVQ $0, R9 // arg 6 - pad - MOVL $197, AX + MOVL $197, AX // sys_mmap SYSCALL JCC 2(PC) NEGL AX @@ -226,29 +242,36 @@ TEXT runtime·munmap(SB),7,$0 MOVL $73, AX // sys_munmap SYSCALL JCC 2(PC) - MOVL $0xf1, 0xf1 // crash + MOVL $0xf1, 0xf1 // crash + RET + + +TEXT runtime·madvise(SB),7,$0 + MOVQ addr+0(FP), DI // arg 1 - addr + MOVQ len+8(FP), SI // arg 2 - len + MOVQ behav+16(FP), DX // arg 3 - behav + MOVQ $75, AX // sys_madvise + SYSCALL + // ignore failure - maybe pages are locked RET TEXT runtime·sigaltstack(SB),7,$-8 MOVQ new+8(SP), DI // arg 1 - nss MOVQ old+16(SP), SI // arg 2 - oss - MOVQ $288, AX // sys_sigaltstack + MOVQ $281, AX // sys___sigaltstack14 SYSCALL JCC 2(PC) - MOVL $0xf1, 0xf1 // crash + MOVL $0xf1, 0xf1 // crash RET // set tls base to DI TEXT runtime·settls(SB),7,$8 // adjust for ELF: wants to use -16(FS) and -8(FS) for g and m - ADDQ $16, DI - MOVQ DI, 0(SP) - MOVQ SP, SI - MOVQ $12, DI // AMD64_SET_FSBASE (machine/sysarch.h) - MOVQ $165, AX // sys_sysarch + ADDQ $16, DI // arg 1 - ptr + MOVQ $317, AX // sys__lwp_setprivate SYSCALL JCC 2(PC) - MOVL $0xf1, 0xf1 // crash + MOVL $0xf1, 0xf1 // crash RET TEXT runtime·sysctl(SB),7,$0 diff --git a/src/pkg/runtime/sys_netbsd_arm.s b/src/pkg/runtime/sys_netbsd_arm.s new file mode 100644 index 000000000..4a119c5de --- /dev/null +++ b/src/pkg/runtime/sys_netbsd_arm.s @@ -0,0 +1,280 @@ +// Copyright 2013 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. +// +// System calls and other sys.stuff for ARM, NetBSD +// /usr/src/sys/kern/syscalls.master for syscall numbers. 
+// + +#include "zasm_GOOS_GOARCH.h" + +// Exit the entire program (like C exit) +TEXT runtime·exit(SB),7,$-4 + MOVW 0(FP), R0 // arg 1 exit status + SWI $0xa00001 + MOVW.CS $0, R9 // crash on syscall failure + MOVW.CS R9, (R9) + RET + +TEXT runtime·exit1(SB),7,$-4 + SWI $0xa00136 // sys__lwp_exit + MOVW $1, R9 // crash + MOVW R9, (R9) + RET + +TEXT runtime·write(SB),7,$-4 + MOVW 0(FP), R0 // arg 1 - fd + MOVW 4(FP), R1 // arg 2 - buf + MOVW 8(FP), R2 // arg 3 - nbyte + SWI $0xa00004 // sys_write + RET + +// int32 lwp_create(void *context, uintptr flags, void *lwpid) +TEXT runtime·lwp_create(SB),7,$0 + MOVW context+0(FP), R0 + MOVW flags+4(FP), R1 + MOVW lwpid+8(FP), R2 + SWI $0xa00135 // sys__lwp_create + RET + +TEXT runtime·osyield(SB),7,$0 + SWI $0xa0015e // sys_sched_yield + RET + +TEXT runtime·lwp_park(SB),7,$0 + MOVW 0(FP), R0 // arg 1 - abstime + MOVW 4(FP), R1 // arg 2 - unpark + MOVW 8(FP), R2 // arg 3 - hint + MOVW 12(FP), R3 // arg 4 - unparkhint + SWI $0xa001b2 // sys__lwp_park + RET + +TEXT runtime·lwp_unpark(SB),7,$0 + MOVW 0(FP), R0 // arg 1 - lwp + MOVW 4(FP), R1 // arg 2 - hint + SWI $0xa00141 // sys__lwp_unpark + RET + +TEXT runtime·lwp_self(SB),7,$0 + SWI $0xa00137 // sys__lwp_self + RET + +TEXT runtime·lwp_tramp(SB),7,$0 + MOVW R0, R9 // m + MOVW R1, R10 // g + + BL runtime·emptyfunc(SB) // fault if stack check is wrong + BL (R2) + MOVW $2, R9 // crash (not reached) + MOVW R9, (R9) + RET + +TEXT runtime·usleep(SB),7,$16 + MOVW usec+0(FP), R0 + MOVW R0, R2 + MOVW $1000000, R1 + DIV R1, R0 + // 0(R13) is the saved LR, don't use it + MOVW R0, 4(R13) // tv_sec.low + MOVW $0, R0 + MOVW R0, 8(R13) // tv_sec.high + MOD R1, R2 + MOVW $1000, R1 + MUL R1, R2 + MOVW R2, 12(R13) // tv_nsec + + MOVW $4(R13), R0 // arg 1 - rqtp + MOVW $0, R1 // arg 2 - rmtp + SWI $0xa001ae // sys_nanosleep + RET + +TEXT runtime·raisesigpipe(SB),7,$16 + SWI $0xa00137 // sys__lwp_self, the returned R0 is arg 1 + MOVW $13, R1 // arg 2 - signo == SIGPIPE + SWI $0xa0013e // sys__lwp_kill + RET + +TEXT runtime·setitimer(SB),7,$-4 + MOVW 0(FP), R0 // arg 1 - which + MOVW 4(FP), R1 // arg 2 - itv + MOVW 8(FP), R2 // arg 3 - oitv + SWI $0xa001a9 // sys_setitimer + RET + +// func now() (sec int64, nsec int32) +TEXT time·now(SB), 7, $32 + MOVW $0, R0 // CLOCK_REALTIME + MOVW $8(R13), R1 + SWI $0xa001ab // clock_gettime + + MOVW 8(R13), R0 // sec.low + MOVW 12(R13), R1 // sec.high + MOVW 16(R13), R2 // nsec + + MOVW R0, 0(FP) + MOVW R1, 4(FP) + MOVW R2, 8(FP) + RET + +// int64 nanotime(void) so really +// void nanotime(int64 *nsec) +TEXT runtime·nanotime(SB), 7, $32 + MOVW $0, R0 // CLOCK_REALTIME + MOVW $8(R13), R1 + SWI $0xa001ab // clock_gettime + + MOVW 8(R13), R0 // sec.low + MOVW 12(R13), R4 // sec.high + MOVW 16(R13), R2 // nsec + + MOVW $1000000000, R3 + MULLU R0, R3, (R1, R0) + MUL R3, R4 + ADD.S R2, R0 + ADC R4, R1 + + MOVW 0(FP), R3 + MOVW R0, 0(R3) + MOVW R1, 4(R3) + RET + +TEXT runtime·getcontext(SB),7,$-4 + MOVW 0(FP), R0 // arg 1 - context + SWI $0xa00133 // sys_getcontext + MOVW.CS $0, R9 // crash on syscall failure + MOVW.CS R9, (R9) + RET + +TEXT runtime·sigprocmask(SB),7,$0 + MOVW 0(FP), R0 // arg 1 - how + MOVW 4(FP), R1 // arg 2 - set + MOVW 8(FP), R2 // arg 3 - oset + SWI $0xa00125 // sys_sigprocmask + MOVW.CS $0, R9 // crash on syscall failure + MOVW.CS R9, (R9) + RET + +TEXT runtime·sigreturn_tramp(SB),7,$-4 + // in runtime·sigtramp, we saved ucontext into m->tls[0], + // here we just load it and call sys_setcontext + MOVW m_tls(m), R0 + SWI $0xa00134 // sys_setcontext + // something 
failed, we have to exit + MOVW $0x4242, R0 // magic return number + SWI $0xa00001 // sys_exit + B -2(PC) // continue exit + +TEXT runtime·sigaction(SB),7,$4 + MOVW 0(FP), R0 // arg 1 - signum + MOVW 4(FP), R1 // arg 2 - nsa + MOVW 8(FP), R2 // arg 3 - osa + MOVW $runtime·sigreturn_tramp(SB), R3 // arg 4 - tramp + MOVW $2, R4 // arg 5 - vers + MOVW R4, 4(R13) + ADD $4, R13 // pass arg 5 on stack + SWI $0xa00154 // sys___sigaction_sigtramp + SUB $4, R13 + MOVW.CS $3, R9 // crash on syscall failure + MOVW.CS R9, (R9) + RET + +TEXT runtime·sigtramp(SB),7,$24 + // this might be called in external code context, + // where g and m are not set. + // first save R0, because _cgo_load_gm will clobber it + // TODO(adonovan): call runtime·badsignal if m=0, like other platforms? + MOVW R0, 4(R13) // signum + MOVW _cgo_load_gm(SB), R0 + CMP $0, R0 + BL.NE (R0) + + // save g + MOVW R10, R4 + MOVW R10, 20(R13) + + // g = m->signal + MOVW m_gsignal(R9), R10 + + // R0 is already saved + MOVW R1, 8(R13) // info + MOVW R2, 12(R13) // context + MOVW R4, 16(R13) // gp + // we also save the ucontext into m->tls[0] for easy + // signal return + MOVW R2, m_tls(m) + + BL runtime·sighandler(SB) + + // restore g + MOVW 20(R13), R10 + RET + +TEXT runtime·mmap(SB),7,$12 + MOVW 0(FP), R0 // arg 1 - addr + MOVW 4(FP), R1 // arg 2 - len + MOVW 8(FP), R2 // arg 3 - prot + MOVW 12(FP), R3 // arg 4 - flags + // arg 5 (fid) and arg6 (offset_lo, offset_hi) are passed on stack + // note the C runtime only passes the 32-bit offset_lo to us + MOVW 16(FP), R4 // arg 5 + MOVW R4, 4(R13) + MOVW 20(FP), R5 // arg 6 lower 32-bit + MOVW R5, 8(R13) + MOVW $0, R6 // higher 32-bit for arg 6 + MOVW R6, 12(R13) + ADD $4, R13 // pass arg 5 and arg 6 on stack + SWI $0xa000c5 // sys_mmap + SUB $4, R13 + RET + +TEXT runtime·munmap(SB),7,$0 + MOVW 0(FP), R0 // arg 1 - addr + MOVW 4(FP), R1 // arg 2 - len + SWI $0xa00049 // sys_munmap + MOVW.CS $0, R9 // crash on syscall failure + MOVW.CS R9, (R9) + RET + +TEXT runtime·madvise(SB),7,$0 + MOVW 0(FP), R0 // arg 1 - addr + MOVW 4(FP), R1 // arg 2 - len + MOVW 8(FP), R2 // arg 3 - behav + SWI $0xa0004b // sys_madvise + // ignore failure - maybe pages are locked + RET + +TEXT runtime·sigaltstack(SB),7,$-4 + MOVW 0(FP), R0 // arg 1 - nss + MOVW 4(FP), R1 // arg 2 - oss + SWI $0xa00119 // sys___sigaltstack14 + MOVW.CS $0, R9 // crash on syscall failure + MOVW.CS R9, (R9) + RET + +TEXT runtime·sysctl(SB),7,$8 + MOVW 0(FP), R0 // arg 1 - name + MOVW 4(FP), R1 // arg 2 - namelen + MOVW 8(FP), R2 // arg 3 - oldp + MOVW 12(FP), R3 // arg 4 - oldlenp + MOVW 16(FP), R4 // arg 5 - newp + MOVW R4, 4(R13) + MOVW 20(FP), R4 // arg 6 - newlen + MOVW R4, 8(R13) + ADD $4, R13 // pass arg 5 and 6 on stack + SWI $0xa000ca // sys___sysctl + SUB $4, R13 + RET + +TEXT runtime·casp(SB),7,$0 + B runtime·cas(SB) + +// TODO(minux): this is only valid for ARMv6+ +// bool armcas(int32 *val, int32 old, int32 new) +// Atomically: +// if(*val == old){ +// *val = new; +// return 1; +// }else +// return 0; +TEXT runtime·cas(SB),7,$0 + B runtime·armcas(SB) diff --git a/src/pkg/runtime/sys_openbsd_386.s b/src/pkg/runtime/sys_openbsd_386.s index 593b4a9df..c62e0f949 100644 --- a/src/pkg/runtime/sys_openbsd_386.s +++ b/src/pkg/runtime/sys_openbsd_386.s @@ -12,14 +12,16 @@ TEXT runtime·exit(SB),7,$-4 MOVL $1, AX INT $0x80 - MOVL $0xf1, 0xf1 // crash + MOVL $0xf1, 0xf1 // crash RET -TEXT runtime·exit1(SB),7,$-4 - MOVL $302, AX // sys_threxit +TEXT runtime·exit1(SB),7,$8 + MOVL $0, 0(SP) + MOVL $0, 4(SP) // arg 1 - notdead + MOVL $302, AX 
// sys___threxit INT $0x80 JAE 2(PC) - MOVL $0xf1, 0xf1 // crash + MOVL $0xf1, 0xf1 // crash RET TEXT runtime·write(SB),7,$-4 @@ -67,7 +69,7 @@ TEXT runtime·mmap(SB),7,$36 MOVL $0, AX STOSL // arg 6 - pad MOVSL // arg 7 - offset - MOVL $0, AX // top 64 bits of file offset + MOVL $0, AX // top 32 bits of file offset STOSL MOVL $197, AX // sys_mmap INT $0x80 @@ -79,7 +81,14 @@ TEXT runtime·munmap(SB),7,$-4 MOVL $73, AX // sys_munmap INT $0x80 JAE 2(PC) - MOVL $0xf1, 0xf1 // crash + MOVL $0xf1, 0xf1 // crash + RET + +TEXT runtime·madvise(SB),7,$-4 + MOVL $75, AX // sys_madvise + INT $0x80 + JAE 2(PC) + MOVL $0xf1, 0xf1 // crash RET TEXT runtime·setitimer(SB),7,$-4 @@ -89,40 +98,38 @@ TEXT runtime·setitimer(SB),7,$-4 // func now() (sec int64, nsec int32) TEXT time·now(SB), 7, $32 - MOVL $116, AX + MOVL $232, AX LEAL 12(SP), BX - MOVL BX, 4(SP) - MOVL $0, 8(SP) + MOVL $0, 4(SP) + MOVL BX, 8(SP) INT $0x80 MOVL 12(SP), AX // sec - MOVL 16(SP), BX // usec + MOVL 16(SP), BX // nsec - // sec is in AX, usec in BX + // sec is in AX, nsec in BX MOVL AX, sec+0(FP) MOVL $0, sec+4(FP) - IMULL $1000, BX MOVL BX, nsec+8(FP) RET // int64 nanotime(void) so really // void nanotime(int64 *nsec) TEXT runtime·nanotime(SB),7,$32 - MOVL $116, AX + MOVL $232, AX LEAL 12(SP), BX - MOVL BX, 4(SP) - MOVL $0, 8(SP) + MOVL $0, 4(SP) + MOVL BX, 8(SP) INT $0x80 MOVL 12(SP), AX // sec - MOVL 16(SP), BX // usec + MOVL 16(SP), BX // nsec - // sec is in AX, usec in BX + // sec is in AX, nsec in BX // convert to DX:AX nsec MOVL $1000000000, CX MULL CX - IMULL $1000, BX ADDL BX, AX ADCL $0, DX - + MOVL ret+0(FP), DI MOVL AX, 0(DI) MOVL DX, 4(DI) @@ -132,7 +139,15 @@ TEXT runtime·sigaction(SB),7,$-4 MOVL $46, AX // sys_sigaction INT $0x80 JAE 2(PC) - MOVL $0xf1, 0xf1 // crash + MOVL $0xf1, 0xf1 // crash + RET + +TEXT runtime·sigprocmask(SB),7,$-4 + MOVL $48, AX // sys_sigprocmask + INT $0x80 + JAE 2(PC) + MOVL $0xf1, 0xf1 // crash + MOVL AX, oset+0(FP) RET TEXT runtime·sigtramp(SB),7,$44 @@ -141,8 +156,11 @@ TEXT runtime·sigtramp(SB),7,$44 // check that m exists MOVL m(CX), BX CMPL BX, $0 - JNE 2(PC) + JNE 5(PC) + MOVL signo+0(FP), BX + MOVL BX, 0(SP) CALL runtime·badsignal(SB) + RET // save g MOVL g(CX), DI @@ -174,62 +192,59 @@ TEXT runtime·sigtramp(SB),7,$44 MOVL AX, 4(SP) // arg 1 - sigcontext MOVL $103, AX // sys_sigreturn INT $0x80 - MOVL $0xf1, 0xf1 // crash + MOVL $0xf1, 0xf1 // crash RET -// int32 rfork_thread(int32 flags, void *stack, M *m, G *g, void (*fn)(void)); -TEXT runtime·rfork_thread(SB),7,$8 - MOVL flags+8(SP), AX - MOVL stack+12(SP), CX +// int32 tfork(void *param, uintptr psize, M *mp, G *gp, void (*fn)(void)); +TEXT runtime·tfork(SB),7,$12 - // Copy m, g, fn off parent stack for use by child. + // Copy mp, gp and fn from the parent stack onto the child stack. + MOVL params+4(FP), AX + MOVL 8(AX), CX // tf_stack SUBL $16, CX - MOVL mm+16(SP), SI + MOVL CX, 8(AX) + MOVL mm+12(FP), SI MOVL SI, 0(CX) - MOVL gg+20(SP), SI + MOVL gg+16(FP), SI MOVL SI, 4(CX) - MOVL fn+24(SP), SI + MOVL fn+20(FP), SI MOVL SI, 8(CX) MOVL $1234, 12(CX) - MOVL CX, SI MOVL $0, 0(SP) // syscall gap - MOVL AX, 4(SP) // arg 1 - flags - MOVL $251, AX // sys_rfork + MOVL params+4(FP), AX + MOVL AX, 4(SP) // arg 1 - param + MOVL psize+8(FP), AX + MOVL AX, 8(SP) // arg 2 - psize + MOVL $8, AX // sys___tfork INT $0x80 - // Return if rfork syscall failed - JCC 4(PC) + // Return if tfork syscall failed. + JCC 5(PC) NEGL AX - MOVL AX, 48(SP) + MOVL ret+0(FP), DX + MOVL AX, 0(DX) RET // In parent, return. 
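For reference, the parameter block behind the new sys___tfork call, with the layout inferred from the offsets this file uses (the 386 code reads the child stack at 8(AX), matching its tf_stack comment; treat the exact declaration as an assumption rather than the kernel's header):

// Passed as __tfork(&param, sizeof param); returns 0 in the child,
// the new thread id in the parent.
struct tfork {
	void *tf_tcb;    // +0: thread control block (TLS base)
	int  *tf_tid;    // +4 on 386: where the kernel writes the tid
	void *tf_stack;  // +8 on 386: initial stack pointer for the child
};

Unlike the old rfork path, the kernel now places the child on its stack itself, which is why the hand-rolled stack switch and the sys_getthrid call disappear from both the 386 and amd64 files.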
CMPL AX, $0 - JEQ 3(PC) - MOVL AX, 48(SP) + JEQ 4(PC) + MOVL ret+0(FP), DX + MOVL AX, 0(DX) RET - // In child, on new stack. - MOVL SI, SP - // Paranoia: check that SP is as we expect. MOVL 12(SP), BP CMPL BP, $1234 JEQ 2(PC) INT $3 - // Reload registers + // Reload registers. MOVL 0(SP), BX // m MOVL 4(SP), DX // g MOVL 8(SP), SI // fn - // Initialize m->procid to thread ID - MOVL $299, AX // sys_getthrid - INT $0x80 - MOVL AX, m_procid(BX) - - // Set FS to point at m->tls + // Set FS to point at m->tls. LEAL m_tls(BX), BP PUSHAL // save registers PUSHL BP @@ -246,12 +261,12 @@ TEXT runtime·rfork_thread(SB),7,$8 MOVL 0(DX), DX // paranoia; check they are not nil MOVL 0(BX), BX - // more paranoia; check that stack splitting code works + // More paranoia; check that stack splitting code works. PUSHAL CALL runtime·emptyfunc(SB) POPAL - // Call fn + // Call fn. CALL SI CALL runtime·exit1(SB) @@ -268,25 +283,23 @@ TEXT runtime·sigaltstack(SB),7,$-8 INT $3 RET -TEXT runtime·setldt(SB),7,$8 +TEXT runtime·setldt(SB),7,$4 // Under OpenBSD we set the GS base instead of messing with the LDT. - MOVL 16(SP), AX // tls0 + MOVL tls0+4(FP), AX MOVL AX, 0(SP) CALL runtime·settls(SB) RET -TEXT runtime·settls(SB),7,$16 +TEXT runtime·settls(SB),7,$8 // adjust for ELF: wants to use -8(GS) and -4(GS) for g and m - MOVL 20(SP), CX + MOVL tlsbase+0(FP), CX ADDL $8, CX - MOVL CX, 0(CX) MOVL $0, 0(SP) // syscall gap - MOVL $9, 4(SP) // I386_SET_GSBASE (machine/sysarch.h) - MOVL CX, 8(SP) // pointer to base - MOVL $165, AX // sys_sysarch + MOVL CX, 4(SP) // arg 1 - tcb + MOVL $329, AX // sys___set_tcb INT $0x80 JCC 2(PC) - MOVL $0xf1, 0xf1 // crash + MOVL $0xf1, 0xf1 // crash RET TEXT runtime·osyield(SB),7,$-4 @@ -295,12 +308,12 @@ TEXT runtime·osyield(SB),7,$-4 RET TEXT runtime·thrsleep(SB),7,$-4 - MOVL $300, AX // sys_thrsleep + MOVL $300, AX // sys___thrsleep INT $0x80 RET TEXT runtime·thrwakeup(SB),7,$-4 - MOVL $301, AX // sys_thrwakeup + MOVL $301, AX // sys___thrwakeup INT $0x80 RET diff --git a/src/pkg/runtime/sys_openbsd_amd64.s b/src/pkg/runtime/sys_openbsd_amd64.s index d2d48e6b5..8a736507f 100644 --- a/src/pkg/runtime/sys_openbsd_amd64.s +++ b/src/pkg/runtime/sys_openbsd_amd64.s @@ -8,20 +8,20 @@ #include "zasm_GOOS_GOARCH.h" -// int64 rfork_thread(int32 flags, void *stack, M *m, G *g, void (*fn)(void)); -TEXT runtime·rfork_thread(SB),7,$0 - MOVL flags+8(SP), DI - MOVQ stack+16(SP), SI +// int64 tfork(void *param, uintptr psize, M *mp, G *gp, void (*fn)(void)); +TEXT runtime·tfork(SB),7,$32 - // Copy m, g, fn off parent stack for use by child. - MOVQ mm+24(SP), R8 - MOVQ gg+32(SP), R9 - MOVQ fn+40(SP), R12 + // Copy mp, gp and fn off parent stack for use by child. + MOVQ mm+16(FP), R8 + MOVQ gg+24(FP), R9 + MOVQ fn+32(FP), R12 - MOVL $251, AX // sys_rfork + MOVQ param+0(FP), DI + MOVQ psize+8(FP), SI + MOVL $8, AX // sys___tfork SYSCALL - // Return if rfork syscall failed + // Return if tfork syscall failed. JCC 3(PC) NEGL AX RET @@ -31,19 +31,11 @@ TEXT runtime·rfork_thread(SB),7,$0 JEQ 2(PC) RET - // In child, on new stack. - MOVQ SI, SP - - // Initialize m->procid to thread ID - MOVL $299, AX // sys_getthrid - SYSCALL - MOVQ AX, m_procid(R8) - // Set FS to point at m->tls. LEAQ m_tls(R8), DI CALL runtime·settls(SB) - // In child, set up new stack + // In child, set up new stack. get_tls(CX) MOVQ R8, m(CX) MOVQ R9, g(CX) @@ -53,12 +45,13 @@ TEXT runtime·rfork_thread(SB),7,$0 CALL R12 // It shouldn't return. 
If it does, exit - MOVL $302, AX // sys_threxit + MOVQ $0, DI // arg 1 - notdead + MOVL $302, AX // sys___threxit SYSCALL JMP -3(PC) // keep exiting TEXT runtime·osyield(SB),7,$0 - MOVL $298, AX // sys_sched_yield + MOVL $298, AX // sys_sched_yield SYSCALL RET @@ -67,14 +60,15 @@ TEXT runtime·thrsleep(SB),7,$0 MOVL 16(SP), SI // arg 2 - clock_id MOVQ 24(SP), DX // arg 3 - tp MOVQ 32(SP), R10 // arg 4 - lock - MOVL $300, AX // sys_thrsleep + MOVQ 40(SP), R8 // arg 5 - abort + MOVL $300, AX // sys___thrsleep SYSCALL RET TEXT runtime·thrwakeup(SB),7,$0 MOVQ 8(SP), DI // arg 1 - ident MOVL 16(SP), SI // arg 2 - n - MOVL $301, AX // sys_thrwakeup + MOVL $301, AX // sys___thrwakeup SYSCALL RET @@ -83,13 +77,14 @@ TEXT runtime·exit(SB),7,$-8 MOVL 8(SP), DI // arg 1 - exit status MOVL $1, AX // sys_exit SYSCALL - MOVL $0xf1, 0xf1 // crash + MOVL $0xf1, 0xf1 // crash RET TEXT runtime·exit1(SB),7,$-8 - MOVL $302, AX // sys_threxit + MOVQ $0, DI // arg 1 - notdead + MOVL $302, AX // sys___threxit SYSCALL - MOVL $0xf1, 0xf1 // crash + MOVL $0xf1, 0xf1 // crash RET TEXT runtime·write(SB),7,$-8 @@ -135,31 +130,29 @@ TEXT runtime·setitimer(SB),7,$-8 // func now() (sec int64, nsec int32) TEXT time·now(SB), 7, $32 - LEAQ 8(SP), DI // arg 1 - tp - MOVQ $0, SI // arg 2 - tzp - MOVL $116, AX // sys_gettimeofday + MOVQ $0, DI // arg 1 - clock_id + LEAQ 8(SP), SI // arg 2 - tp + MOVL $232, AX // sys_clock_gettime SYSCALL - MOVQ 8(SP), AX // sec - MOVL 16(SP), DX // usec + MOVL 8(SP), AX // sec + MOVQ 16(SP), DX // nsec - // sec is in AX, usec in DX + // sec is in AX, nsec in DX MOVQ AX, sec+0(FP) - IMULQ $1000, DX MOVL DX, nsec+8(FP) RET TEXT runtime·nanotime(SB),7,$32 - LEAQ 8(SP), DI // arg 1 - tp - MOVQ $0, SI // arg 2 - tzp - MOVL $116, AX // sys_gettimeofday + MOVQ $0, DI // arg 1 - clock_id + LEAQ 8(SP), SI // arg 2 - tp + MOVL $232, AX // sys_clock_gettime SYSCALL - MOVQ 8(SP), AX // sec - MOVL 16(SP), DX // usec + MOVL 8(SP), AX // sec + MOVQ 16(SP), DX // nsec - // sec is in AX, usec in DX + // sec is in AX, nsec in DX // return nsec in AX IMULQ $1000000000, AX - IMULQ $1000, DX ADDQ DX, AX RET @@ -170,7 +163,17 @@ TEXT runtime·sigaction(SB),7,$-8 MOVL $46, AX SYSCALL JCC 2(PC) - MOVL $0xf1, 0xf1 // crash + MOVL $0xf1, 0xf1 // crash + RET + +TEXT runtime·sigprocmask(SB),7,$0 + MOVL 8(SP), DI // arg 1 - how + MOVL 12(SP), SI // arg 2 - set + MOVL $48, AX // sys_sigprocmask + SYSCALL + JCC 2(PC) + MOVL $0xf1, 0xf1 // crash + MOVL AX, oset+0(FP) // Return oset RET TEXT runtime·sigtramp(SB),7,$64 @@ -179,8 +182,10 @@ TEXT runtime·sigtramp(SB),7,$64 // check that m exists MOVQ m(BX), BP CMPQ BP, $0 - JNE 2(PC) + JNE 4(PC) + MOVQ DI, 0(SP) CALL runtime·badsignal(SB) + RET // save g MOVQ g(BX), R10 @@ -226,7 +231,16 @@ TEXT runtime·munmap(SB),7,$0 MOVL $73, AX // sys_munmap SYSCALL JCC 2(PC) - MOVL $0xf1, 0xf1 // crash + MOVL $0xf1, 0xf1 // crash + RET + +TEXT runtime·madvise(SB),7,$0 + MOVQ addr+0(FP), DI // arg 1 - addr + MOVQ len+8(FP), SI // arg 2 - len + MOVQ behav+16(FP), DX // arg 3 - behav + MOVQ $75, AX // sys_madvise + SYSCALL + // ignore failure - maybe pages are locked RET TEXT runtime·sigaltstack(SB),7,$-8 @@ -235,20 +249,17 @@ TEXT runtime·sigaltstack(SB),7,$-8 MOVQ $288, AX // sys_sigaltstack SYSCALL JCC 2(PC) - MOVL $0xf1, 0xf1 // crash + MOVL $0xf1, 0xf1 // crash RET // set tls base to DI -TEXT runtime·settls(SB),7,$8 +TEXT runtime·settls(SB),7,$0 // adjust for ELF: wants to use -16(FS) and -8(FS) for g and m ADDQ $16, DI - MOVQ DI, 0(SP) - MOVQ SP, SI - MOVQ $12, DI // AMD64_SET_FSBASE 
(machine/sysarch.h) - MOVQ $165, AX // sys_sysarch + MOVQ $329, AX // sys___settcb SYSCALL JCC 2(PC) - MOVL $0xf1, 0xf1 // crash + MOVL $0xf1, 0xf1 // crash RET TEXT runtime·sysctl(SB),7,$0 @@ -260,7 +271,7 @@ TEXT runtime·sysctl(SB),7,$0 MOVQ 48(SP), R9 // arg 6 - newlen MOVQ $202, AX // sys___sysctl SYSCALL - JCC 3(PC) + JCC 3(PC) NEGL AX RET MOVL $0, AX diff --git a/src/pkg/runtime/sys_plan9_386.s b/src/pkg/runtime/sys_plan9_386.s index 94c36aa41..3385b083a 100644 --- a/src/pkg/runtime/sys_plan9_386.s +++ b/src/pkg/runtime/sys_plan9_386.s @@ -2,7 +2,6 @@ // Use of this source code is governed by a BSD-style // license that can be found in the LICENSE file. -#include "defs_GOOS_GOARCH.h" #include "zasm_GOOS_GOARCH.h" // setldt(int entry, int address, int limit) @@ -24,9 +23,19 @@ TEXT runtime·pwrite(SB),7,$0 INT $64 RET +TEXT runtime·seek(SB),7,$0 + MOVL $39, AX + INT $64 + CMPL AX, $-1 + JNE 4(PC) + MOVL a+0(FP), CX + MOVL AX, 0(CX) + MOVL AX, 4(CX) + RET + TEXT runtime·close(SB),7,$0 MOVL $4, AX - INT $64 + INT $64 RET TEXT runtime·exits(SB),7,$0 @@ -48,6 +57,21 @@ TEXT runtime·plan9_semacquire(SB),7,$0 MOVL $37, AX INT $64 RET + +TEXT runtime·plan9_tsemacquire(SB),7,$0 + MOVL $52, AX + INT $64 + RET + +TEXT runtime·notify(SB),7,$0 + MOVL $28, AX + INT $64 + RET + +TEXT runtime·noted(SB),7,$0 + MOVL $29, AX + INT $64 + RET TEXT runtime·plan9_semrelease(SB),7,$0 MOVL $38, AX @@ -77,9 +101,8 @@ TEXT runtime·rfork(SB),7,$0 MOVL DX, g(AX) MOVL BX, m(AX) - // Initialize AX from _tos->pid - MOVL _tos(SB), AX - MOVL tos_pid(AX), AX + // Initialize AX from TOS struct. + MOVL procid(AX), AX MOVL AX, m_procid(BX) // save pid as m->procid CALL runtime·stackcheck(SB) // smashes AX, CX @@ -95,3 +118,55 @@ TEXT runtime·rfork(SB),7,$0 CALL SI // fn() CALL runtime·exit(SB) RET + +// void sigtramp(void *ureg, int8 *note) +TEXT runtime·sigtramp(SB),7,$0 + get_tls(AX) + + // check that m exists + MOVL m(AX), BX + CMPL BX, $0 + JNE 3(PC) + CALL runtime·badsignal(SB) // will exit + RET + + // save args + MOVL ureg+4(SP), CX + MOVL note+8(SP), DX + + // change stack + MOVL m_gsignal(BX), BP + MOVL g_stackbase(BP), BP + MOVL BP, SP + + // make room for args and g + SUBL $16, SP + + // save g + MOVL g(AX), BP + MOVL BP, 12(SP) + + // g = m->gsignal + MOVL m_gsignal(BX), DI + MOVL DI, g(AX) + + // load args and call sighandler + MOVL CX, 0(SP) + MOVL DX, 4(SP) + MOVL BP, 8(SP) + + CALL runtime·sighandler(SB) + + // restore g + get_tls(BX) + MOVL 12(SP), BP + MOVL BP, g(BX) + + // call noted(AX) + MOVL AX, 0(SP) + CALL runtime·noted(SB) + RET + +// Only used by the 64-bit runtime. +TEXT runtime·setfpmasks(SB),7,$0 + RET diff --git a/src/pkg/runtime/sys_plan9_amd64.s b/src/pkg/runtime/sys_plan9_amd64.s new file mode 100644 index 000000000..b34f98a68 --- /dev/null +++ b/src/pkg/runtime/sys_plan9_amd64.s @@ -0,0 +1,208 @@ +// Copyright 2010 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. 
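// Every stub in this new file follows the same Plan 9 amd64 system
// call pattern: the constant $0x8000 is loaded into AX (a kernel ABI
// detail), the system call number into BP, and SYSCALL traps into the
// kernel with the arguments already laid out on the stack by the
// caller. A stub for a hypothetical syscall number NNN would look
// like this (a sketch of the convention, not part of this patch):
//
//	TEXT runtime·mycall(SB),7,$0
//		MOVQ	$0x8000, AX
//		MOVQ	$NNN, BP
//		SYSCALL
//		RET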
+ +#include "zasm_GOOS_GOARCH.h" + +// setldt(int entry, int address, int limit) +TEXT runtime·setldt(SB),7,$0 + RET + +TEXT runtime·open(SB),7,$0 + MOVQ $0x8000, AX + MOVQ $14, BP + SYSCALL + RET + +TEXT runtime·pread(SB),7,$0 + MOVQ $0x8000, AX + MOVQ $50, BP + SYSCALL + RET + +TEXT runtime·pwrite(SB),7,$0 + MOVQ $0x8000, AX + MOVQ $51, BP + SYSCALL + RET + +// int32 _seek(int64*, int32, int64, int32) +TEXT _seek<>(SB),7,$0 + MOVQ $0x8000, AX + MOVQ $39, BP + SYSCALL + RET + +// int64 seek(int32, int64, int32) +TEXT runtime·seek(SB),7,$56 + LEAQ new+48(SP), CX + MOVQ CX, 0(SP) + MOVQ fd+0(FP), CX + MOVQ CX, 8(SP) + MOVQ off+8(FP), CX + MOVQ CX, 16(SP) + MOVQ whence+16(FP), CX + MOVQ CX, 24(SP) + CALL _seek<>(SB) + CMPL AX, $0 + JGE 2(PC) + MOVQ $-1, new+48(SP) + MOVQ new+48(SP), AX + RET + +TEXT runtime·close(SB),7,$0 + MOVQ $0x8000, AX + MOVQ $4, BP + SYSCALL + RET + +TEXT runtime·exits(SB),7,$0 + MOVQ $0x8000, AX + MOVQ $8, BP + SYSCALL + RET + +TEXT runtime·brk_(SB),7,$0 + MOVQ $0x8000, AX + MOVQ $24, BP + SYSCALL + RET + +TEXT runtime·sleep(SB),7,$0 + MOVQ $0x8000, AX + MOVQ $17, BP + SYSCALL + RET + +TEXT runtime·plan9_semacquire(SB),7,$0 + MOVQ $0x8000, AX + MOVQ $37, BP + SYSCALL + RET + +TEXT runtime·plan9_tsemacquire(SB),7,$0 + MOVQ $0x8000, AX + MOVQ $52, BP + SYSCALL + RET + +TEXT runtime·notify(SB),7,$0 + MOVQ $0x8000, AX + MOVQ $28, BP + SYSCALL + RET + +TEXT runtime·noted(SB),7,$0 + MOVQ $0x8000, AX + MOVQ $29, BP + SYSCALL + RET + +TEXT runtime·plan9_semrelease(SB),7,$0 + MOVQ $0x8000, AX + MOVQ $38, BP + SYSCALL + RET + +TEXT runtime·nanotime(SB),7,$0 + MOVQ $0x8000, AX + MOVQ $60, BP + SYSCALL + RET + +TEXT runtime·rfork(SB),7,$0 + MOVQ $0x8000, AX + MOVQ $19, BP // rfork + SYSCALL + + // In parent, return. + CMPQ AX, $0 + JEQ 2(PC) + RET + + // In child on forked stack. + MOVQ mm+24(SP), BX // m + MOVQ gg+32(SP), DX // g + MOVQ fn+40(SP), SI // fn + + // set SP to be on the new child stack + MOVQ stack+16(SP), CX + MOVQ CX, SP + + // Initialize m, g. + get_tls(AX) + MOVQ DX, g(AX) + MOVQ BX, m(AX) + + // Initialize AX from pid in TLS. 
+ MOVQ procid(AX), AX + MOVQ AX, m_procid(BX) // save pid as m->procid + + CALL runtime·stackcheck(SB) // smashes AX, CX + + MOVQ 0(DX), DX // paranoia; check they are not nil + MOVQ 0(BX), BX + + CALL SI // fn() + CALL runtime·exit(SB) + RET + +// This is needed by asm_amd64.s +TEXT runtime·settls(SB),7,$0 + RET + +// void sigtramp(void *ureg, int8 *note) +TEXT runtime·sigtramp(SB),7,$0 + get_tls(AX) + + // check that m exists + MOVQ m(AX), BX + CMPQ BX, $0 + JNE 3(PC) + CALL runtime·badsignal(SB) // will exit + RET + + // save args + MOVQ ureg+8(SP), CX + MOVQ note+16(SP), DX + + // change stack + MOVQ m_gsignal(BX), R10 + MOVQ g_stackbase(R10), BP + MOVQ BP, SP + + // make room for args and g + SUBQ $32, SP + + // save g + MOVQ g(AX), BP + MOVQ BP, 24(SP) + + // g = m->gsignal + MOVQ R10, g(AX) + + // load args and call sighandler + MOVQ CX, 0(SP) + MOVQ DX, 8(SP) + MOVQ BP, 16(SP) + + CALL runtime·sighandler(SB) + + // restore g + get_tls(BX) + MOVQ 24(SP), R10 + MOVQ R10, g(BX) + + // call noted(AX) + MOVQ AX, 0(SP) + CALL runtime·noted(SB) + RET + +TEXT runtime·setfpmasks(SB),7,$8 + STMXCSR 0(SP) + MOVL 0(SP), AX + ANDL $~0x3F, AX + ORL $(0x3F<<7), AX + MOVL AX, 0(SP) + LDMXCSR 0(SP) + RET diff --git a/src/pkg/runtime/sys_windows_386.s b/src/pkg/runtime/sys_windows_386.s index d5646bfea..ca59f0a1d 100644 --- a/src/pkg/runtime/sys_windows_386.s +++ b/src/pkg/runtime/sys_windows_386.s @@ -216,7 +216,7 @@ TEXT runtime·callbackasm+0(SB),7,$0 CLD - CALL runtime·cgocallback(SB) + CALL runtime·cgocallback_gofunc(SB) POPL AX POPL CX @@ -243,11 +243,6 @@ TEXT runtime·tstart(SB),7,$0 MOVL newm+4(SP), CX // m MOVL m_g0(CX), DX // g - // Set up SEH frame - PUSHL $runtime·sigtramp(SB) - PUSHL 0(FS) - MOVL SP, 0(FS) - // Layout new m scheduler stack on os stack. MOVL SP, AX MOVL AX, g_stackbase(DX) @@ -264,14 +259,8 @@ TEXT runtime·tstart(SB),7,$0 CLD CALL runtime·stackcheck(SB) // clobbers AX,CX - CALL runtime·mstart(SB) - // Pop SEH frame - MOVL 0(FS), SP - POPL 0(FS) - POPL CX - RET // uint32 tstart_stdcall(M *newm); @@ -296,3 +285,32 @@ TEXT runtime·setldt(SB),7,$0 MOVL address+4(FP), CX MOVL CX, 0x14(FS) RET + +// void install_exception_handler() +TEXT runtime·install_exception_handler(SB),7,$0 + get_tls(CX) + MOVL m(CX), CX // m + + // Set up SEH frame + MOVL m_seh(CX), DX + MOVL $runtime·sigtramp(SB), AX + MOVL AX, seh_handler(DX) + MOVL 0(FS), AX + MOVL AX, seh_prev(DX) + + // Install it + MOVL DX, 0(FS) + + RET + +// void remove_exception_handler() +TEXT runtime·remove_exception_handler(SB),7,$0 + get_tls(CX) + MOVL m(CX), CX // m + + // Remove SEH frame + MOVL m_seh(CX), DX + MOVL seh_prev(DX), AX + MOVL AX, 0(FS) + + RET diff --git a/src/pkg/runtime/sys_windows_amd64.s b/src/pkg/runtime/sys_windows_amd64.s index 11909cda2..fe88f3b75 100644 --- a/src/pkg/runtime/sys_windows_amd64.s +++ b/src/pkg/runtime/sys_windows_amd64.s @@ -272,13 +272,13 @@ TEXT runtime·callbackasm(SB),7,$0 MOVQ R15, 0(SP) // prepare call stack. use SUBQ to hide from stack frame checks - // cgocallback(void (*fn)(void*), void *frame, uintptr framesize) + // cgocallback(Go func, void *frame, uintptr framesize) SUBQ $24, SP MOVQ DX, 16(SP) // uintptr framesize MOVQ CX, 8(SP) // void *frame - MOVQ AX, 0(SP) // void (*fn)(void*) + MOVQ AX, 0(SP) // Go func CLD - CALL runtime·cgocallback(SB) + CALL runtime·cgocallback_gofunc(SB) MOVQ 0(SP), AX MOVQ 8(SP), CX MOVQ 16(SP), DX @@ -328,7 +328,6 @@ TEXT runtime·tstart_stdcall(SB),7,$0 // Someday the convention will be D is always cleared. 
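// CLD matters here because Go's generated code and the C calling
// convention both assume the direction flag is clear, while a foreign
// Windows caller may leave it set; clearing it once on every entry
// into Go code keeps string instructions moving forward. The entry
// sequence is the pattern visible below (sketch):
//
//	CLD
//	CALL	runtime·stackcheck(SB)	// clobbers AX,CX
//	CALL	runtime·mstart(SB)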
CLD - CALL runtime·setstacklimits(SB) CALL runtime·stackcheck(SB) // clobbers AX,CX CALL runtime·mstart(SB) @@ -337,6 +336,13 @@ TEXT runtime·tstart_stdcall(SB),7,$0 // set tls base to DI TEXT runtime·settls(SB),7,$0 - CALL runtime·setstacklimits(SB) MOVQ DI, 0x28(GS) RET + +// void install_exception_handler() +TEXT runtime·install_exception_handler(SB),7,$0 + CALL runtime·setstacklimits(SB) + RET + +TEXT runtime·remove_exception_handler(SB),7,$0 + RET diff --git a/src/pkg/runtime/syscall_windows_test.go b/src/pkg/runtime/syscall_windows_test.go index c8327fdef..f04d2cd54 100644 --- a/src/pkg/runtime/syscall_windows_test.go +++ b/src/pkg/runtime/syscall_windows_test.go @@ -112,9 +112,10 @@ func Test64BitReturnStdCall(t *testing.T) { func TestCDecl(t *testing.T) { var buf [50]byte + fmtp, _ := syscall.BytePtrFromString("%d %d %d") a, _, _ := GetDLL(t, "user32.dll").Proc("wsprintfA").Call( uintptr(unsafe.Pointer(&buf[0])), - uintptr(unsafe.Pointer(syscall.StringBytePtr("%d %d %d"))), + uintptr(unsafe.Pointer(fmtp)), 1000, 2000, 3000) if string(buf[:a]) != "1000 2000 3000" { t.Error("cdecl USER32.wsprintfA returns", a, "buf=", buf[:a]) diff --git a/src/pkg/runtime/thread_darwin.c b/src/pkg/runtime/thread_darwin.c index 6a83e48a3..adb1ffe6a 100644 --- a/src/pkg/runtime/thread_darwin.c +++ b/src/pkg/runtime/thread_darwin.c @@ -9,8 +9,8 @@ extern SigTab runtime·sigtab[]; -static Sigset sigset_all = ~(Sigset)0; static Sigset sigset_none; +static Sigset sigset_all = ~(Sigset)0; static Sigset sigset_prof = 1<<(SIGPROF-1); static void @@ -50,11 +50,8 @@ runtime·semacreate(void) void runtime·osinit(void) { - // Register our thread-creation callback (see sys_darwin_{amd64,386}.s) - // but only if we're not using cgo. If we are using cgo we need - // to let the C pthread libary install its own thread-creation callback. - if(!runtime·iscgo) - runtime·bsdthread_register(); + // bsdthread_register delayed until end of goenvs so that we + // can look at the environment first. // Use sysctl to fetch hw.ncpu. uint32 mib[2]; @@ -75,22 +72,34 @@ void runtime·goenvs(void) { runtime·goenvs_unix(); + + // Register our thread-creation callback (see sys_darwin_{amd64,386}.s) + // but only if we're not using cgo. If we are using cgo we need + // to let the C pthread libary install its own thread-creation callback. + if(!runtime·iscgo) { + if(runtime·bsdthread_register() != 0) { + if(runtime·getenv("DYLD_INSERT_LIBRARIES")) + runtime·throw("runtime: bsdthread_register error (unset DYLD_INSERT_LIBRARIES)"); + runtime·throw("runtime: bsdthread_register error"); + } + } + } void -runtime·newosproc(M *m, G *g, void *stk, void (*fn)(void)) +runtime·newosproc(M *mp, void *stk) { int32 errno; Sigset oset; - m->tls[0] = m->id; // so 386 asm can find it + mp->tls[0] = mp->id; // so 386 asm can find it if(0){ - runtime·printf("newosproc stk=%p m=%p g=%p fn=%p id=%d/%d ostk=%p\n", - stk, m, g, fn, m->id, m->tls[0], &m); + runtime·printf("newosproc stk=%p m=%p g=%p id=%d/%d ostk=%p\n", + stk, mp, mp->g0, mp->id, (int32)mp->tls[0], &mp); } runtime·sigprocmask(SIG_SETMASK, &sigset_all, &oset); - errno = runtime·bsdthread_create(stk, m, g, fn); + errno = runtime·bsdthread_create(stk, mp, mp->g0, runtime·mstart); runtime·sigprocmask(SIG_SETMASK, &oset, nil); if(errno < 0) { @@ -100,17 +109,30 @@ runtime·newosproc(M *m, G *g, void *stk, void (*fn)(void)) } // Called to initialize a new m (including the bootstrap m). +// Called on the parent thread (main thread in case of bootstrap), can allocate memory. 
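// The mpreinit/minit split introduced by this patch separates
// allocation from thread-local setup: mpreinit runs on the parent
// thread while malloc is still usable, minit runs first thing on the
// new thread where allocation is forbidden, and unminit reverses minit
// when a thread is dropped. The resulting lifecycle is roughly this
// (a sketch; only mpreinit/minit/unminit/newosproc are real names):
//
//	runtime·mpreinit(mp);		// parent: allocate gsignal stack etc.
//	runtime·newosproc(mp, stk);	// kernel thread enters runtime·mstart
//	// ...on the new thread, from mstart:
//	runtime·minit();		// install signal stack/masks; no malloc
//	// ...later, from dropm:
//	runtime·unminit();		// tear down what minit installed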
+void +runtime·mpreinit(M *mp) +{ + mp->gsignal = runtime·malg(32*1024); // OS X wants >=8K, Linux >=2K +} + +// Called to initialize a new m (including the bootstrap m). +// Called on the new thread, can not allocate memory. void runtime·minit(void) { // Initialize signal handling. - m->gsignal = runtime·malg(32*1024); // OS X wants >=8K, Linux >=2K - runtime·signalstack(m->gsignal->stackguard - StackGuard, 32*1024); + runtime·signalstack((byte*)m->gsignal->stackguard - StackGuard, 32*1024); - if(m->profilehz > 0) - runtime·sigprocmask(SIG_SETMASK, &sigset_none, nil); - else - runtime·sigprocmask(SIG_SETMASK, &sigset_prof, nil); + runtime·sigprocmask(SIG_SETMASK, &sigset_none, nil); + runtime·setprof(m->profilehz > 0); +} + +// Called from dropm to undo the effect of an minit. +void +runtime·unminit(void) +{ + runtime·signalstack(nil, 0); } // Mach IPC, to get at semaphores @@ -431,10 +453,11 @@ runtime·sigpanic(void) runtime·panicstring(runtime·sigtab[g->sig].name); } -// TODO(rsc): place holder to fix build. +#pragma textflag 7 void runtime·osyield(void) { + runtime·usleep(1); } uintptr @@ -488,12 +511,22 @@ runtime·badcallback(void) runtime·write(2, badcallback, sizeof badcallback - 1); } -static int8 badsignal[] = "runtime: signal received on thread not created by Go.\n"; +static int8 badsignal[] = "runtime: signal received on thread not created by Go: "; // This runs on a foreign stack, without an m or a g. No stack split. #pragma textflag 7 void -runtime·badsignal(void) +runtime·badsignal(int32 sig) { + if (sig == SIGPROF) { + return; // Ignore SIGPROFs intended for a non-Go thread. + } runtime·write(2, badsignal, sizeof badsignal - 1); + if (0 <= sig && sig < NSIG) { + // Call runtime·findnull dynamically to circumvent static stack size check. + static int32 (*findnull)(byte*) = runtime·findnull; + runtime·write(2, runtime·sigtab[sig].name, findnull((byte*)runtime·sigtab[sig].name)); + } + runtime·write(2, "\n", 1); + runtime·exit(1); } diff --git a/src/pkg/runtime/thread_freebsd.c b/src/pkg/runtime/thread_freebsd.c index 4c546178f..3ae14ee0a 100644 --- a/src/pkg/runtime/thread_freebsd.c +++ b/src/pkg/runtime/thread_freebsd.c @@ -13,8 +13,8 @@ extern int32 runtime·sys_umtx_op(uint32*, int32, uint32, void*, void*); #define CTL_HW 6 #define HW_NCPU 3 +static Sigset sigset_none; static Sigset sigset_all = { ~(uint32)0, ~(uint32)0, ~(uint32)0, ~(uint32)0, }; -static Sigset sigset_none = { 0, 0, 0, 0, }; static int32 getncpu(void) @@ -77,32 +77,33 @@ runtime·futexwakeup(uint32 *addr, uint32 cnt) void runtime·thr_start(void*); void -runtime·newosproc(M *m, G *g, void *stk, void (*fn)(void)) +runtime·newosproc(M *mp, void *stk) { ThrParam param; Sigset oset; - USED(fn); // thr_start assumes fn == mstart - USED(g); // thr_start assumes g == m->g0 - if(0){ - runtime·printf("newosproc stk=%p m=%p g=%p fn=%p id=%d/%d ostk=%p\n", - stk, m, g, fn, m->id, m->tls[0], &m); + runtime·printf("newosproc stk=%p m=%p g=%p id=%d/%d ostk=%p\n", + stk, mp, mp->g0, mp->id, (int32)mp->tls[0], &mp); } runtime·sigprocmask(&sigset_all, &oset); runtime·memclr((byte*)¶m, sizeof param); param.start_func = runtime·thr_start; - param.arg = m; - param.stack_base = (int8*)g->stackbase; - param.stack_size = (byte*)stk - (byte*)g->stackbase; - param.child_tid = (intptr*)&m->procid; + param.arg = (byte*)mp; + + // NOTE(rsc): This code is confused. stackbase is the top of the stack + // and is equal to stk. However, it's working, so I'm not changing it. 
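// For reference, thr_new(2) expects stack_base to be the *low* address
// of the new thread's stack and stack_size its extent, so a
// conventional setup would look like the following (a sketch assuming
// the stack occupies StackSize bytes ending at stk; the code below
// keeps the old, admittedly confused, values because they work in
// practice):
//
//	param.stack_base = (void*)((byte*)stk - StackSize);
//	param.stack_size = StackSize;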
+ param.stack_base = (void*)mp->g0->stackbase; + param.stack_size = (byte*)stk - (byte*)mp->g0->stackbase; + + param.child_tid = (intptr*)&mp->procid; param.parent_tid = nil; - param.tls_base = (int8*)&m->tls[0]; - param.tls_size = sizeof m->tls; + param.tls_base = (void*)&mp->tls[0]; + param.tls_size = sizeof mp->tls; - m->tls[0] = m->id; // so 386 asm can find it + mp->tls[0] = mp->id; // so 386 asm can find it runtime·thr_new(¶m, sizeof param); runtime·sigprocmask(&oset, nil); @@ -121,15 +122,30 @@ runtime·goenvs(void) } // Called to initialize a new m (including the bootstrap m). +// Called on the parent thread (main thread in case of bootstrap), can allocate memory. +void +runtime·mpreinit(M *mp) +{ + mp->gsignal = runtime·malg(32*1024); +} + +// Called to initialize a new m (including the bootstrap m). +// Called on the new thread, can not allocate memory. void runtime·minit(void) { // Initialize signal handling - m->gsignal = runtime·malg(32*1024); - runtime·signalstack(m->gsignal->stackguard - StackGuard, 32*1024); + runtime·signalstack((byte*)m->gsignal->stackguard - StackGuard, 32*1024); runtime·sigprocmask(&sigset_none, nil); } +// Called from dropm to undo the effect of an minit. +void +runtime·unminit(void) +{ + runtime·signalstack(nil, 0); +} + void runtime·sigpanic(void) { @@ -206,12 +222,22 @@ runtime·badcallback(void) runtime·write(2, badcallback, sizeof badcallback - 1); } -static int8 badsignal[] = "runtime: signal received on thread not created by Go.\n"; +static int8 badsignal[] = "runtime: signal received on thread not created by Go: "; // This runs on a foreign stack, without an m or a g. No stack split. #pragma textflag 7 void -runtime·badsignal(void) +runtime·badsignal(int32 sig) { + if (sig == SIGPROF) { + return; // Ignore SIGPROFs intended for a non-Go thread. + } runtime·write(2, badsignal, sizeof badsignal - 1); + if (0 <= sig && sig < NSIG) { + // Call runtime·findnull dynamically to circumvent static stack size check. + static int32 (*findnull)(byte*) = runtime·findnull; + runtime·write(2, runtime·sigtab[sig].name, findnull((byte*)runtime·sigtab[sig].name)); + } + runtime·write(2, "\n", 1); + runtime·exit(1); } diff --git a/src/pkg/runtime/thread_linux.c b/src/pkg/runtime/thread_linux.c index 858be7036..78ddef878 100644 --- a/src/pkg/runtime/thread_linux.c +++ b/src/pkg/runtime/thread_linux.c @@ -13,8 +13,8 @@ int32 runtime·open(uint8*, int32, int32); int32 runtime·close(int32); int32 runtime·read(int32, void*, int32); -static Sigset sigset_all = { ~(uint32)0, ~(uint32)0 }; static Sigset sigset_none; +static Sigset sigset_all = { ~(uint32)0, ~(uint32)0 }; // Linux futex. 
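// The getproccount rewrite below drops the /proc/stat parser in favor
// of asking the kernel directly: sched_getaffinity fills a bitmask of
// usable CPUs, and each word of the mask is widened and counted with
// the classic SWAR population count. Extracted as a standalone helper
// the same identity reads (a sketch; the loop below inlines it):
//
//	static int32
//	popcount64(uint64 t)
//	{
//		t = t - ((t >> 1) & 0x5555555555555555ULL);
//		t = (t & 0x3333333333333333ULL) + ((t >> 2) & 0x3333333333333333ULL);
//		return (int32)((((t + (t >> 4)) & 0xF0F0F0F0F0F0F0FULL) *
//			0x101010101010101ULL) >> 56);
//	}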
// @@ -80,33 +80,23 @@ runtime·futexwakeup(uint32 *addr, uint32 cnt) *(int32*)0x1006 = 0x1006; } +extern runtime·sched_getaffinity(uintptr pid, uintptr len, uintptr *buf); static int32 getproccount(void) { - int32 fd, rd, cnt, cpustrlen; - byte *cpustr, *pos, *bufpos; - byte buf[256]; + uintptr buf[16], t; + int32 r, cnt, i; - fd = runtime·open((byte*)"/proc/stat", O_RDONLY|O_CLOEXEC, 0); - if(fd == -1) - return 1; cnt = 0; - bufpos = buf; - cpustr = (byte*)"\ncpu"; - cpustrlen = runtime·findnull(cpustr); - for(;;) { - rd = runtime·read(fd, bufpos, sizeof(buf)-cpustrlen); - if(rd == -1) - break; - bufpos[rd] = 0; - for(pos=buf; pos=runtime·strstr(pos, cpustr); cnt++, pos++) { - } - if(rd < cpustrlen) - break; - runtime·memmove(buf, bufpos+rd-cpustrlen+1, cpustrlen-1); - bufpos = buf+cpustrlen-1; + r = runtime·sched_getaffinity(0, sizeof(buf), buf); + if(r > 0) + for(i = 0; i < r/sizeof(buf[0]); i++) { + t = buf[i]; + t = t - ((t >> 1) & 0x5555555555555555ULL); + t = (t & 0x3333333333333333ULL) + ((t >> 2) & 0x3333333333333333ULL); + cnt += (int32)((((t + (t >> 4)) & 0xF0F0F0F0F0F0F0FULL) * 0x101010101010101ULL) >> 56); } - runtime·close(fd); + return cnt ? cnt : 1; } @@ -134,7 +124,7 @@ enum }; void -runtime·newosproc(M *m, G *g, void *stk, void (*fn)(void)) +runtime·newosproc(M *mp, void *stk) { int32 ret; int32 flags; @@ -150,16 +140,16 @@ runtime·newosproc(M *m, G *g, void *stk, void (*fn)(void)) | CLONE_THREAD /* revisit - okay for now */ ; - m->tls[0] = m->id; // so 386 asm can find it + mp->tls[0] = mp->id; // so 386 asm can find it if(0){ - runtime·printf("newosproc stk=%p m=%p g=%p fn=%p clone=%p id=%d/%d ostk=%p\n", - stk, m, g, fn, runtime·clone, m->id, m->tls[0], &m); + runtime·printf("newosproc stk=%p m=%p g=%p clone=%p id=%d/%d ostk=%p\n", + stk, mp, mp->g0, runtime·clone, mp->id, (int32)mp->tls[0], &mp); } // Disable signals during clone, so that the new thread starts // with signals disabled. It will enable them in minit. runtime·rtsigprocmask(SIG_SETMASK, &sigset_all, &oset, sizeof oset); - ret = runtime·clone(flags, stk, m, g, fn); + ret = runtime·clone(flags, stk, mp, mp->g0, runtime·mstart); runtime·rtsigprocmask(SIG_SETMASK, &oset, nil, sizeof oset); if(ret < 0) { @@ -181,13 +171,28 @@ runtime·goenvs(void) } // Called to initialize a new m (including the bootstrap m). +// Called on the parent thread (main thread in case of bootstrap), can allocate memory. +void +runtime·mpreinit(M *mp) +{ + mp->gsignal = runtime·malg(32*1024); // OS X wants >=8K, Linux >=2K +} + +// Called to initialize a new m (including the bootstrap m). +// Called on the new thread, can not allocate memory. void runtime·minit(void) { // Initialize signal handling. - m->gsignal = runtime·malg(32*1024); // OS X wants >=8K, Linux >=2K - runtime·signalstack(m->gsignal->stackguard - StackGuard, 32*1024); - runtime·rtsigprocmask(SIG_SETMASK, &sigset_none, nil, sizeof sigset_none); + runtime·signalstack((byte*)m->gsignal->stackguard - StackGuard, 32*1024); + runtime·rtsigprocmask(SIG_SETMASK, &sigset_none, nil, sizeof(Sigset)); +} + +// Called from dropm to undo the effect of an minit. +void +runtime·unminit(void) +{ + runtime·signalstack(nil, 0); } void @@ -266,12 +271,22 @@ runtime·badcallback(void) runtime·write(2, badcallback, sizeof badcallback - 1); } -static int8 badsignal[] = "runtime: signal received on thread not created by Go.\n"; +static int8 badsignal[] = "runtime: signal received on thread not created by Go: "; // This runs on a foreign stack, without an m or a g. No stack split. 
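// "#pragma textflag 7" marks the function that follows as NOSPLIT: the
// compiler emits no stack-split prologue, which is what makes such a
// function legal to run on a foreign thread's stack where there is no
// m or g to consult. The cost is that its whole call tree must fit in
// whatever stack happens to be there, which is why badsignal below
// avoids deep calls. The same pattern appears elsewhere in this patch,
// for example:
//
//	#pragma textflag 7
//	void
//	runtime·osyield(void)
//	{
//		runtime·usleep(1);
//	}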
#pragma textflag 7 void -runtime·badsignal(void) +runtime·badsignal(int32 sig) { + if (sig == SIGPROF) { + return; // Ignore SIGPROFs intended for a non-Go thread. + } runtime·write(2, badsignal, sizeof badsignal - 1); + if (0 <= sig && sig < NSIG) { + // Call runtime·findnull dynamically to circumvent static stack size check. + static int32 (*findnull)(byte*) = runtime·findnull; + runtime·write(2, runtime·sigtab[sig].name, findnull((byte*)runtime·sigtab[sig].name)); + } + runtime·write(2, "\n", 1); + runtime·exit(1); } diff --git a/src/pkg/runtime/thread_netbsd.c b/src/pkg/runtime/thread_netbsd.c index 1b2df85cd..f333c6dd8 100644 --- a/src/pkg/runtime/thread_netbsd.c +++ b/src/pkg/runtime/thread_netbsd.c @@ -11,7 +11,7 @@ enum ESRCH = 3, ENOTSUP = 91, - // From NetBSD's sys/time.h + // From NetBSD's <sys/time.h> CLOCK_REALTIME = 0, CLOCK_VIRTUAL = 1, CLOCK_PROF = 2, @@ -20,9 +20,15 @@ enum extern SigTab runtime·sigtab[]; -extern int64 runtime·rfork_thread(int32 flags, void *stack, M *m, G *g, void (*fn)(void)); -extern int32 runtime·thrsleep(void *ident, int32 clock_id, void *tsp, void *lock); -extern int32 runtime·thrwakeup(void *ident, int32 n); +static Sigset sigset_none; +static Sigset sigset_all = { ~(uint32)0, ~(uint32)0, ~(uint32)0, ~(uint32)0, }; + +extern void runtime·getcontext(UcontextT *context); +extern int32 runtime·lwp_create(UcontextT *context, uintptr flags, void *lwpid); +extern void runtime·lwp_mcontext_init(void *mc, void *stack, M *mp, G *gp, void (*fn)(void)); +extern int32 runtime·lwp_park(Timespec *abstime, int32 unpark, void *hint, void *unparkhint); +extern int32 runtime·lwp_unpark(int32 lwp, void *hint); +extern int32 runtime·lwp_self(void); // From NetBSD's <sys/sysctl.h> #define CTL_HW 6 @@ -68,13 +74,30 @@ runtime·semasleep(int64 ns) if(m->waitsemacount == 0) { // sleep until semaphore != 0 or timeout. // thrsleep unlocks m->waitsemalock. - if(ns < 0) - runtime·thrsleep(&m->waitsemacount, 0, nil, &m->waitsemalock); - else { + if(ns < 0) { + // TODO(jsing) - potential deadlock! + // + // There is a potential deadlock here since we + // have to release the waitsemalock mutex + // before we call lwp_park() to suspend the + // thread. This allows another thread to + // release the lock and call lwp_unpark() + // before the thread is actually suspended. + // If this occurs the current thread will end + // up sleeping indefinitely. Unfortunately + // the NetBSD kernel does not appear to provide + // a mechanism for unlocking the userspace + // mutex once the thread is actually parked. + runtime·atomicstore(&m->waitsemalock, 0); + runtime·lwp_park(nil, 0, &m->waitsemacount, nil); + } else { ns += runtime·nanotime(); ts.tv_sec = ns/1000000000LL; ts.tv_nsec = ns%1000000000LL; - runtime·thrsleep(&m->waitsemacount, CLOCK_REALTIME, &ts, &m->waitsemalock); + // TODO(jsing) - potential deadlock! + // See above for details. + runtime·atomicstore(&m->waitsemalock, 0); + runtime·lwp_park(&ts, 0, &m->waitsemacount, nil); } // reacquire lock while(runtime·xchg(&m->waitsemalock, 1)) @@ -112,39 +135,41 @@ runtime·semawakeup(M *mp) while(runtime·xchg(&mp->waitsemalock, 1)) runtime·osyield(); mp->waitsemacount++; - ret = runtime·thrwakeup(&mp->waitsemacount, 1); + // TODO(jsing) - potential deadlock, see semasleep() for details. + // Confirm that LWP is parked before unparking... 
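// The deadlock the TODO above describes is a classic lost wakeup; the
// window between dropping the spin-mutex and parking looks like this
// (sleeper S, waker W; a timeline sketch of the code in this file):
//
//	S: runtime·atomicstore(&m->waitsemalock, 0);	// lock released
//	W:	runtime·xchg(&mp->waitsemalock, 1);	// W acquires lock
//	W:	mp->waitsemacount++;
//	W:	runtime·lwp_unpark(...);		// S is not parked yet
//	S: runtime·lwp_park(...);			// wakeup already gone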
+ ret = runtime·lwp_unpark(mp->procid, &mp->waitsemacount); if(ret != 0 && ret != ESRCH) runtime·printf("thrwakeup addr=%p sem=%d ret=%d\n", &mp->waitsemacount, mp->waitsemacount, ret); // spin-mutex unlock runtime·atomicstore(&mp->waitsemalock, 0); } -// From NetBSD's sys/param.h -#define RFPROC (1<<4) /* change child (else changes curproc) */ -#define RFMEM (1<<5) /* share `address space' */ -#define RFNOWAIT (1<<6) /* parent need not wait() on child */ -#define RFTHREAD (1<<13) /* create a thread, not a process */ - void -runtime·newosproc(M *m, G *g, void *stk, void (*fn)(void)) +runtime·newosproc(M *mp, void *stk) { - int32 flags; + UcontextT uc; int32 ret; - flags = RFPROC | RFTHREAD | RFMEM | RFNOWAIT; - - if (0) { + if(0) { runtime·printf( - "newosproc stk=%p m=%p g=%p fn=%p id=%d/%d ostk=%p\n", - stk, m, g, fn, m->id, m->tls[0], &m); + "newosproc stk=%p m=%p g=%p id=%d/%d ostk=%p\n", + stk, mp, mp->g0, mp->id, (int32)mp->tls[0], &mp); } - m->tls[0] = m->id; // so 386 asm can find it + mp->tls[0] = mp->id; // so 386 asm can find it + + runtime·getcontext(&uc); + + uc.uc_flags = _UC_SIGMASK | _UC_CPU; + uc.uc_link = nil; + uc.uc_sigmask = sigset_all; + + runtime·lwp_mcontext_init(&uc.uc_mcontext, stk, mp, mp->g0, runtime·mstart); - if((ret = runtime·rfork_thread(flags, stk, m, g, fn)) < 0) { + ret = runtime·lwp_create(&uc, 0, &mp->procid); + + if(ret < 0) { runtime·printf("runtime: failed to create new OS thread (have %d already; errno=%d)\n", runtime·mcount() - 1, -ret); - if (ret == -ENOTSUP) - runtime·printf("runtime: is kern.rthreads disabled?\n"); runtime·throw("runtime.newosproc"); } } @@ -162,12 +187,30 @@ runtime·goenvs(void) } // Called to initialize a new m (including the bootstrap m). +// Called on the parent thread (main thread in case of bootstrap), can allocate memory. +void +runtime·mpreinit(M *mp) +{ + mp->gsignal = runtime·malg(32*1024); +} + +// Called to initialize a new m (including the bootstrap m). +// Called on the new thread, can not allocate memory. void runtime·minit(void) { + m->procid = runtime·lwp_self(); + // Initialize signal handling - m->gsignal = runtime·malg(32*1024); - runtime·signalstack(m->gsignal->stackguard - StackGuard, 32*1024); + runtime·signalstack((byte*)m->gsignal->stackguard - StackGuard, 32*1024); + runtime·sigprocmask(SIG_SETMASK, &sigset_none, nil); +} + +// Called from dropm to undo the effect of an minit. +void +runtime·unminit(void) +{ + runtime·signalstack(nil, 0); } void @@ -224,12 +267,22 @@ runtime·badcallback(void) runtime·write(2, badcallback, sizeof badcallback - 1); } -static int8 badsignal[] = "runtime: signal received on thread not created by Go.\n"; +static int8 badsignal[] = "runtime: signal received on thread not created by Go: "; // This runs on a foreign stack, without an m or a g. No stack split. #pragma textflag 7 void -runtime·badsignal(void) +runtime·badsignal(int32 sig) { + if (sig == SIGPROF) { + return; // Ignore SIGPROFs intended for a non-Go thread. + } runtime·write(2, badsignal, sizeof badsignal - 1); + if (0 <= sig && sig < NSIG) { + // Call runtime·findnull dynamically to circumvent static stack size check. 
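// Calling runtime·findnull through a function pointer, as the line
// below does, is deliberate: the linker statically bounds the stack
// use of NOSPLIT functions by walking the call graph, and a direct
// call here would trip that check. An indirect call through a static
// variable is invisible to the walker, at the price of the programmer
// vouching for the stack depth:
//
//	static int32 (*findnull)(byte*) = runtime·findnull;
//	len = findnull((byte*)msg);	// same as runtime·findnull(msg)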
+ static int32 (*findnull)(byte*) = runtime·findnull; + runtime·write(2, runtime·sigtab[sig].name, findnull((byte*)runtime·sigtab[sig].name)); + } + runtime·write(2, "\n", 1); + runtime·exit(1); } diff --git a/src/pkg/runtime/thread_openbsd.c b/src/pkg/runtime/thread_openbsd.c index d0f947210..700c48147 100644 --- a/src/pkg/runtime/thread_openbsd.c +++ b/src/pkg/runtime/thread_openbsd.c @@ -20,8 +20,11 @@ enum extern SigTab runtime·sigtab[]; -extern int64 runtime·rfork_thread(int32 flags, void *stack, M *m, G *g, void (*fn)(void)); -extern int32 runtime·thrsleep(void *ident, int32 clock_id, void *tsp, void *lock); +static Sigset sigset_none; +static Sigset sigset_all = ~(Sigset)0; + +extern int64 runtime·tfork(void *param, uintptr psize, M *mp, G *gp, void (*fn)(void)); +extern int32 runtime·thrsleep(void *ident, int32 clock_id, void *tsp, void *lock, const int32 *abort); extern int32 runtime·thrwakeup(void *ident, int32 n); // From OpenBSD's <sys/sysctl.h> @@ -69,12 +72,12 @@ runtime·semasleep(int64 ns) // sleep until semaphore != 0 or timeout. // thrsleep unlocks m->waitsemalock. if(ns < 0) - runtime·thrsleep(&m->waitsemacount, 0, nil, &m->waitsemalock); + runtime·thrsleep(&m->waitsemacount, 0, nil, &m->waitsemalock, nil); else { ns += runtime·nanotime(); ts.tv_sec = ns/1000000000LL; ts.tv_nsec = ns%1000000000LL; - runtime·thrsleep(&m->waitsemacount, CLOCK_REALTIME, &ts, &m->waitsemalock); + runtime·thrsleep(&m->waitsemacount, CLOCK_REALTIME, &ts, &m->waitsemalock, nil); } // reacquire lock while(runtime·xchg(&m->waitsemalock, 1)) @@ -119,29 +122,30 @@ runtime·semawakeup(M *mp) runtime·atomicstore(&mp->waitsemalock, 0); } -// From OpenBSD's sys/param.h -#define RFPROC (1<<4) /* change child (else changes curproc) */ -#define RFMEM (1<<5) /* share `address space' */ -#define RFNOWAIT (1<<6) /* parent need not wait() on child */ -#define RFTHREAD (1<<13) /* create a thread, not a process */ - void -runtime·newosproc(M *m, G *g, void *stk, void (*fn)(void)) +runtime·newosproc(M *mp, void *stk) { - int32 flags; + Tfork param; + Sigset oset; int32 ret; - flags = RFPROC | RFTHREAD | RFMEM | RFNOWAIT; - - if (0) { + if(0) { runtime·printf( - "newosproc stk=%p m=%p g=%p fn=%p id=%d/%d ostk=%p\n", - stk, m, g, fn, m->id, m->tls[0], &m); + "newosproc stk=%p m=%p g=%p id=%d/%d ostk=%p\n", + stk, mp, mp->g0, mp->id, (int32)mp->tls[0], &mp); } - m->tls[0] = m->id; // so 386 asm can find it + mp->tls[0] = mp->id; // so 386 asm can find it + + param.tf_tcb = (byte*)&mp->tls[0]; + param.tf_tid = (int32*)&mp->procid; + param.tf_stack = stk; - if((ret = runtime·rfork_thread(flags, stk, m, g, fn)) < 0) { + oset = runtime·sigprocmask(SIG_SETMASK, sigset_all); + ret = runtime·tfork((byte*)¶m, sizeof(param), mp, mp->g0, runtime·mstart); + runtime·sigprocmask(SIG_SETMASK, oset); + + if(ret < 0) { runtime·printf("runtime: failed to create new OS thread (have %d already; errno=%d)\n", runtime·mcount() - 1, -ret); if (ret == -ENOTSUP) runtime·printf("runtime: is kern.rthreads disabled?\n"); @@ -162,12 +166,28 @@ runtime·goenvs(void) } // Called to initialize a new m (including the bootstrap m). +// Called on the parent thread (main thread in case of bootstrap), can allocate memory. +void +runtime·mpreinit(M *mp) +{ + mp->gsignal = runtime·malg(32*1024); +} + +// Called to initialize a new m (including the bootstrap m). +// Called on the new thread, can not allocate memory. 
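// On every Unix port in this patch, minit's job is to install
// mp->gsignal's stack as the alternate signal stack, so handlers run
// on a known-good stack even when the interrupted goroutine is near
// overflow; unminit disables it again before the thread is dropped.
// The paired calls, visible below and in the other thread_*.c files:
//
//	runtime·signalstack((byte*)m->gsignal->stackguard - StackGuard, 32*1024);
//	...
//	runtime·signalstack(nil, 0);	// from unminit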
void runtime·minit(void) { // Initialize signal handling - m->gsignal = runtime·malg(32*1024); - runtime·signalstack(m->gsignal->stackguard - StackGuard, 32*1024); + runtime·signalstack((byte*)m->gsignal->stackguard - StackGuard, 32*1024); + runtime·sigprocmask(SIG_SETMASK, sigset_none); +} + +// Called from dropm to undo the effect of an minit. +void +runtime·unminit(void) +{ + runtime·signalstack(nil, 0); } void @@ -224,12 +244,22 @@ runtime·badcallback(void) runtime·write(2, badcallback, sizeof badcallback - 1); } -static int8 badsignal[] = "runtime: signal received on thread not created by Go.\n"; +static int8 badsignal[] = "runtime: signal received on thread not created by Go: "; // This runs on a foreign stack, without an m or a g. No stack split. #pragma textflag 7 void -runtime·badsignal(void) +runtime·badsignal(int32 sig) { + if (sig == SIGPROF) { + return; // Ignore SIGPROFs intended for a non-Go thread. + } runtime·write(2, badsignal, sizeof badsignal - 1); + if (0 <= sig && sig < NSIG) { + // Call runtime·findnull dynamically to circumvent static stack size check. + static int32 (*findnull)(byte*) = runtime·findnull; + runtime·write(2, runtime·sigtab[sig].name, findnull((byte*)runtime·sigtab[sig].name)); + } + runtime·write(2, "\n", 1); + runtime·exit(1); } diff --git a/src/pkg/runtime/thread_plan9.c b/src/pkg/runtime/thread_plan9.c index 3b0dca69f..7f94623e7 100644 --- a/src/pkg/runtime/thread_plan9.c +++ b/src/pkg/runtime/thread_plan9.c @@ -7,12 +7,37 @@ #include "arch_GOARCH.h" int8 *goos = "plan9"; +extern SigTab runtime·sigtab[]; +int32 runtime·postnote(int32, int8*); + +// Called to initialize a new m (including the bootstrap m). +// Called on the parent thread (main thread in case of bootstrap), can allocate memory. +void +runtime·mpreinit(M *mp) +{ + // Initialize stack and goroutine for note handling. + mp->gsignal = runtime·malg(32*1024); + mp->notesig = (int8*)runtime·malloc(ERRMAX*sizeof(int8)); +} + +// Called to initialize a new m (including the bootstrap m). +// Called on the new thread, can not allocate memory. void runtime·minit(void) { + // Mask all SSE floating-point exceptions + // when running on the 64-bit kernel. + runtime·setfpmasks(); +} + +// Called from dropm to undo the effect of an minit. +void +runtime·unminit(void) +{ } + static int32 getproccount(void) { @@ -36,10 +61,30 @@ getproccount(void) return ncpu > 0 ? ncpu : 1; } +static int32 +getpid(void) +{ + byte b[20], *c; + int32 fd; + + runtime·memclr(b, sizeof(b)); + fd = runtime·open((byte*)"#c/pid", 0); + if(fd >= 0) { + runtime·read(fd, b, sizeof(b)); + runtime·close(fd); + } + c = b; + while(*c == ' ' || *c == '\t') + c++; + return runtime·atoi(c); +} + void runtime·osinit(void) { runtime·ncpu = getproccount(); + m->procid = getpid(); + runtime·notify(runtime·sigtramp); } void @@ -52,6 +97,7 @@ runtime·initsig(void) { } +#pragma textflag 7 void runtime·osyield(void) { @@ -69,34 +115,6 @@ runtime·usleep(uint32 µs) runtime·sleep(ms); } -int64 -runtime·nanotime(void) -{ - static int32 fd = -1; - byte b[8]; - uint32 hi, lo; - - // As long as all goroutines share the same file - // descriptor table we can get away with using - // just a static fd. Without a lock the file can - // be opened twice but that's okay. - // - // Using /dev/bintime gives us a latency on the - // order of ten microseconds between two calls. - // - // The naïve implementation (without the cached - // file descriptor) is roughly four times slower - // in 9vx on a 2.16 GHz Intel Core 2 Duo. 
- - if(fd < 0 && (fd = runtime·open((byte*)"/dev/bintime", OREAD|OCEXEC)) < 0) - return 0; - if(runtime·pread(fd, b, sizeof b, 0) != sizeof b) - return 0; - hi = b[0]<<24 | b[1]<<16 | b[2]<<8 | b[3]; - lo = b[4]<<24 | b[5]<<16 | b[6]<<8 | b[7]; - return (int64)hi<<32 | (int64)lo; -} - void time·now(int64 sec, int32 nsec) { @@ -109,48 +127,111 @@ time·now(int64 sec, int32 nsec) FLUSH(&nsec); } -extern Tos *_tos; void -runtime·exit(int32) +runtime·itoa(int32 n, byte *p, uint32 len) { - int32 fd; + byte *q, c; + uint32 i; + + if(len <= 1) + return; + + runtime·memclr(p, len); + q = p; + + if(n==0) { + *q++ = '0'; + USED(q); + return; + } + if(n < 0) { + *q++ = '-'; + p++; + n = -n; + } + for(i=0; n > 0 && i < len; i++) { + *q++ = '0' + (n%10); + n = n/10; + } + for(q--; q >= p; ) { + c = *p; + *p++ = *q; + *q-- = c; + } +} + +void +runtime·goexitsall(int8 *status) +{ + M *mp; + int32 pid; + + pid = getpid(); + for(mp=runtime·atomicloadp(&runtime·allm); mp; mp=mp->alllink) + if(mp->procid != pid) + runtime·postnote(mp->procid, status); +} + +int32 +runtime·postnote(int32 pid, int8* msg) +{ + int32 fd, len; uint8 buf[128]; uint8 tmp[16]; uint8 *p, *q; - int32 pid; runtime·memclr(buf, sizeof buf); - runtime·memclr(tmp, sizeof tmp); - pid = _tos->pid; - /* build path string /proc/pid/notepg */ - for(q=tmp; pid > 0;) { - *q++ = '0' + (pid%10); - pid = pid/10; - } + /* build path string /proc/pid/note */ + q = tmp; p = buf; + runtime·itoa(pid, tmp, sizeof tmp); runtime·memmove((void*)p, (void*)"/proc/", 6); - p += 6; - for(q--; q >= tmp;) - *p++ = *q--; - runtime·memmove((void*)p, (void*)"/notepg", 7); + for(p += 6; *p++ = *q++; ); + p--; + runtime·memmove((void*)p, (void*)"/note", 5); - /* post interrupt note */ fd = runtime·open(buf, OWRITE); - runtime·write(fd, "interrupt", 9); - runtime·exits(nil); + if(fd < 0) + return -1; + + len = runtime·findnull((byte*)msg); + if(runtime·write(fd, msg, len) != len) { + runtime·close(fd); + return -1; + } + runtime·close(fd); + return 0; } void -runtime·newosproc(M *m, G *g, void *stk, void (*fn)(void)) +runtime·exit(int32 e) { - m->tls[0] = m->id; // so 386 asm can find it + byte tmp[16]; + int8 *status; + + if(e == 0) + status = ""; + else { + /* build error string */ + runtime·itoa(e, tmp, sizeof tmp); + status = (int8*)tmp; + } + + runtime·goexitsall(status); + runtime·exits(status); +} + +void +runtime·newosproc(M *mp, void *stk) +{ + mp->tls[0] = mp->id; // so 386 asm can find it if(0){ - runtime·printf("newosproc stk=%p m=%p g=%p fn=%p rfork=%p id=%d/%d ostk=%p\n", - stk, m, g, fn, runtime·rfork, m->id, m->tls[0], &m); + runtime·printf("newosproc stk=%p m=%p g=%p rfork=%p id=%d/%d ostk=%p\n", + stk, mp, mp->g0, runtime·rfork, mp->id, (int32)mp->tls[0], &mp); } - if(runtime·rfork(RFPROC|RFMEM|RFNOWAIT, stk, m, g, fn) < 0) + if(runtime·rfork(RFPROC|RFMEM|RFNOWAIT, stk, mp, mp->g0, runtime·mstart) < 0) runtime·throw("newosproc: rfork failed"); } @@ -167,36 +248,18 @@ runtime·semasleep(int64 ns) int32 ms; if(ns >= 0) { - // TODO: Plan 9 needs a new system call, tsemacquire. - // The kernel implementation is the same as semacquire - // except with a tsleep and check for timeout. - // It would be great if the implementation returned the - // value that was added to the semaphore, so that on - // timeout the return value would be 0, on success 1. - // Then the error string does not have to be parsed - // to detect timeout. 
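// With tsemacquire now available in the Plan 9 kernel, the replacement
// code below converts the nanosecond timeout to milliseconds and
// clamps it to a 32-bit value (0x7fffffff ms is roughly 24.8 days), so
// the old "not implemented" throw can finally go:
//
//	if(ns/1000000 > 0x7fffffffll)
//		ms = 0x7fffffff;
//	else
//		ms = ns/1000000;
//	ret = runtime·plan9_tsemacquire(&m->waitsemacount, ms);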
- // - // If a negative time indicates no timeout, then - // semacquire can be implemented (in the kernel) - // as tsemacquire(p, v, -1). - runtime·throw("semasleep: timed sleep not implemented on Plan 9"); - - /* - if(ns < 0) - ms = -1; - else if(ns/1000 > 0x7fffffffll) + if(ns/1000000 > 0x7fffffffll) ms = 0x7fffffff; else - ms = ns/1000; - ret = runtime·plan9_tsemacquire(&m->waitsemacount, 1, ms); + ms = ns/1000000; + ret = runtime·plan9_tsemacquire(&m->waitsemacount, ms); if(ret == 1) return 0; // success return -1; // timeout or interrupted - */ } while(runtime·plan9_semacquire(&m->waitsemacount, 1) < 0) { - /* interrupted; try again */ + /* interrupted; try again (c.f. lock_sema.c) */ } return 0; // success } @@ -213,15 +276,15 @@ os·sigpipe(void) runtime·throw("too many writes on closed pipe"); } -/* - * placeholder - once notes are implemented, - * a signal generating a panic must appear as - * a call to this function for correct handling by - * traceback. - */ void runtime·sigpanic(void) { + if(g->sigpc == 0) + runtime·panicstring("call of nil func value"); + runtime·panicstring(m->notesig); + + if(g->sig == 1 || g->sig == 2) + runtime·throw("fault"); } int32 @@ -266,4 +329,5 @@ void runtime·badsignal(void) { runtime·pwrite(2, badsignal, sizeof badsignal - 1, -1LL); + runtime·exits(badsignal); } diff --git a/src/pkg/runtime/thread_windows.c b/src/pkg/runtime/thread_windows.c index f684d3733..ae4e82e50 100644 --- a/src/pkg/runtime/thread_windows.c +++ b/src/pkg/runtime/thread_windows.c @@ -135,6 +135,7 @@ runtime·write(int32 fd, void *buf, int32 n) return written; } +#pragma textflag 7 void runtime·osyield(void) { @@ -186,28 +187,43 @@ runtime·semacreate(void) #define STACK_SIZE_PARAM_IS_A_RESERVATION ((uintptr)0x00010000) void -runtime·newosproc(M *m, G *g, void *stk, void (*fn)(void)) +runtime·newosproc(M *mp, void *stk) { void *thandle; USED(stk); - USED(g); // assuming g = m->g0 - USED(fn); // assuming fn = mstart thandle = runtime·stdcall(runtime·CreateThread, 6, - nil, (uintptr)0x20000, runtime·tstart_stdcall, m, + nil, (uintptr)0x20000, runtime·tstart_stdcall, mp, STACK_SIZE_PARAM_IS_A_RESERVATION, nil); if(thandle == nil) { runtime·printf("runtime: failed to create new OS thread (have %d already; errno=%d)\n", runtime·mcount(), runtime·getlasterror()); runtime·throw("runtime.newosproc"); } - runtime·atomicstorep(&m->thread, thandle); + runtime·atomicstorep(&mp->thread, thandle); +} + +// Called to initialize a new m (including the bootstrap m). +// Called on the parent thread (main thread in case of bootstrap), can allocate memory. +void +runtime·mpreinit(M *mp) +{ + USED(mp); } // Called to initialize a new m (including the bootstrap m). +// Called on the new thread, can not allocate memory. void runtime·minit(void) { + runtime·install_exception_handler(); +} + +// Called from dropm to undo the effect of an minit. +void +runtime·unminit(void) +{ + runtime·remove_exception_handler(); } int64 diff --git a/src/pkg/runtime/time.goc b/src/pkg/runtime/time.goc index b18902f00..2babb173d 100644 --- a/src/pkg/runtime/time.goc +++ b/src/pkg/runtime/time.goc @@ -11,6 +11,7 @@ package time #include "os_GOOS.h" #include "arch_GOARCH.h" #include "malloc.h" +#include "race.h" static Timers timers; static void addtimer(Timer*); @@ -23,14 +24,16 @@ static bool deltimer(Timer*); // Sleep puts the current goroutine to sleep for at least ns nanoseconds. 
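// Sleep is now expressed through the new park API: tsleep takes the
// timers lock, queues the timer, and then calls runtime·park, which
// marks the goroutine as waiting before the unlock callback runs.
// That closes the race where a wakeup could slip in between unlocking
// and sleeping (sketch of the sequence implemented in runtime·tsleep
// below):
//
//	runtime·lock(&timers);
//	addtimer(&t);
//	runtime·park(runtime·unlock, &timers, reason);	// atomic unlock+sleep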
func Sleep(ns int64) { - g->status = Gwaiting; - g->waitreason = "sleep"; - runtime·tsleep(ns); + runtime·tsleep(ns, "sleep"); } // startTimer adds t to the timer heap. func startTimer(t *Timer) { + if(raceenabled) + runtime·racerelease(t); + runtime·lock(&timers); addtimer(t); + runtime·unlock(&timers); } // stopTimer removes t from the timer heap if it is there. @@ -54,27 +57,28 @@ ready(int64 now, Eface e) runtime·ready(e.data); } +static FuncVal readyv = {(void(*)(void))ready}; + // Put the current goroutine to sleep for ns nanoseconds. -// The caller must have set g->status and g->waitreason. void -runtime·tsleep(int64 ns) +runtime·tsleep(int64 ns, int8 *reason) { Timer t; - if(ns <= 0) { - g->status = Grunning; - g->waitreason = nil; + if(ns <= 0) return; - } t.when = runtime·nanotime() + ns; t.period = 0; - t.f = ready; + t.fv = &readyv; t.arg.data = g; + runtime·lock(&timers); addtimer(&t); - runtime·gosched(); + runtime·park(runtime·unlock, &timers, reason); } +static FuncVal timerprocv = {timerproc}; + // Add a timer to the heap and start or kick the timer proc // if the new timer is earlier than any of the others. static void @@ -83,7 +87,6 @@ addtimer(Timer *t) int32 n; Timer **nt; - runtime·lock(&timers); if(timers.len >= timers.cap) { // Grow slice. n = 16; @@ -109,9 +112,10 @@ addtimer(Timer *t) runtime·ready(timers.timerproc); } } - if(timers.timerproc == nil) - timers.timerproc = runtime·newproc1((byte*)timerproc, nil, 0, 0, addtimer); - runtime·unlock(&timers); + if(timers.timerproc == nil) { + timers.timerproc = runtime·newproc1(&timerprocv, nil, 0, 0, addtimer); + timers.timerproc->issystem = true; + } } // Delete timer t from the heap. @@ -182,26 +186,25 @@ timerproc(void) siftdown(0); t->i = -1; // mark as removed } - f = t->f; + f = (void*)t->fv->fn; arg = t->arg; runtime·unlock(&timers); + if(raceenabled) + runtime·raceacquire(t); f(now, arg); runtime·lock(&timers); } if(delta < 0) { // No timers left - put goroutine to sleep. timers.rescheduling = true; - g->status = Gwaiting; - g->waitreason = "timer goroutine (idle)"; - runtime·unlock(&timers); - runtime·gosched(); + runtime·park(runtime·unlock, &timers, "timer goroutine (idle)"); continue; } // At least one timer pending. Sleep until then. timers.sleeping = true; runtime·noteclear(&timers.waitnote); runtime·unlock(&timers); - runtime·entersyscall(); + runtime·entersyscallblock(); runtime·notetsleep(&timers.waitnote, delta); runtime·exitsyscall(); } diff --git a/src/pkg/runtime/time_plan9_386.c b/src/pkg/runtime/time_plan9_386.c new file mode 100644 index 000000000..a29d45715 --- /dev/null +++ b/src/pkg/runtime/time_plan9_386.c @@ -0,0 +1,34 @@ +// Copyright 2010 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#include "runtime.h" +#include "os_GOOS.h" + +int64 +runtime·nanotime(void) +{ + static int32 fd = -1; + byte b[8]; + uint32 hi, lo; + + // As long as all goroutines share the same file + // descriptor table we can get away with using + // just a static fd. Without a lock the file can + // be opened twice but that's okay. + // + // Using /dev/bintime gives us a latency on the + // order of ten microseconds between two calls. + // + // The naïve implementation (without the cached + // file descriptor) is roughly four times slower + // in 9vx on a 2.16 GHz Intel Core 2 Duo. 
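// A read of /dev/bintime returns the current time as an 8-byte,
// big-endian count of nanoseconds since the epoch; the shifts below
// assemble the two 32-bit halves without depending on host byte
// order. The equivalent single-loop form would be (sketch):
//
//	uint64 t = 0;
//	for(i = 0; i < 8; i++)
//		t = t<<8 | b[i];
//	return (int64)t;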
+ + if(fd < 0 && (fd = runtime·open((byte*)"/dev/bintime", OREAD|OCEXEC)) < 0) + return 0; + if(runtime·pread(fd, b, sizeof b, 0) != sizeof b) + return 0; + hi = b[0]<<24 | b[1]<<16 | b[2]<<8 | b[3]; + lo = b[4]<<24 | b[5]<<16 | b[6]<<8 | b[7]; + return (int64)hi<<32 | (int64)lo; +} diff --git a/src/pkg/runtime/traceback_arm.c b/src/pkg/runtime/traceback_arm.c index 22e0bc3a6..dd85cc02c 100644 --- a/src/pkg/runtime/traceback_arm.c +++ b/src/pkg/runtime/traceback_arm.c @@ -17,23 +17,23 @@ void _divu(void); void _modu(void); int32 -runtime·gentraceback(byte *pc0, byte *sp, byte *lr0, G *g, int32 skip, uintptr *pcbuf, int32 max) +runtime·gentraceback(byte *pc0, byte *sp, byte *lr0, G *gp, int32 skip, uintptr *pcbuf, int32 max) { int32 i, n, iter; uintptr pc, lr, tracepc, x; - byte *fp, *p; + byte *fp; bool waspanic; Stktop *stk; Func *f; - + pc = (uintptr)pc0; lr = (uintptr)lr0; fp = nil; waspanic = false; // If the PC is goexit, the goroutine hasn't started yet. - if(pc == (uintptr)runtime·goexit) { - pc = (uintptr)g->entry; + if(pc == (uintptr)runtime·goexit && gp->fnstart != nil) { + pc = (uintptr)gp->fnstart->fn; lr = (uintptr)runtime·goexit; } @@ -45,7 +45,7 @@ runtime·gentraceback(byte *pc0, byte *sp, byte *lr0, G *g, int32 skip, uintptr } n = 0; - stk = (Stktop*)g->stackbase; + stk = (Stktop*)gp->stackbase; for(iter = 0; iter < 100 && n < max; iter++) { // iter avoids looping forever // Typically: // pc is the PC of the running function. @@ -57,52 +57,17 @@ runtime·gentraceback(byte *pc0, byte *sp, byte *lr0, G *g, int32 skip, uintptr if(pc == (uintptr)runtime·lessstack) { // Hit top of stack segment. Unwind to next segment. pc = (uintptr)stk->gobuf.pc; - sp = stk->gobuf.sp; + sp = (byte*)stk->gobuf.sp; lr = 0; fp = nil; - if(pcbuf == nil) + if(pcbuf == nil && runtime·showframe(nil, gp == m->curg)) runtime·printf("----- stack segment boundary -----\n"); stk = (Stktop*)stk->stackbase; continue; } - if(pc <= 0x1000 || (f = runtime·findfunc(pc)) == nil) { - // Dangerous, but worthwhile: see if this is a closure by - // decoding the instruction stream. - // - // We check p < p+4 to avoid wrapping and faulting if - // we have lost track of where we are. - p = (byte*)pc; - if((pc&3) == 0 && p < p+4 && - runtime·mheap.arena_start < p && - p+4 < runtime·mheap.arena_used) { - x = *(uintptr*)p; - if((x&0xfffff000) == 0xe49df000) { - // End of closure: - // MOVW.P frame(R13), R15 - pc = *(uintptr*)sp; - lr = 0; - sp += x & 0xfff; - fp = nil; - continue; - } - if((x&0xfffff000) == 0xe52de000 && lr == (uintptr)runtime·goexit) { - // Beginning of closure. - // Closure at top of stack, not yet started. - p += 5*4; - if((x&0xfff) != 4) { - // argument copying - p += 7*4; - } - if((byte*)pc < p && p < p+4 && p+4 < runtime·mheap.arena_used) { - pc = *(uintptr*)p; - fp = nil; - continue; - } - } - } + if(pc <= 0x1000 || (f = runtime·findfunc(pc)) == nil) break; - } // Found an actual function. if(lr == 0) @@ -118,15 +83,17 @@ runtime·gentraceback(byte *pc0, byte *sp, byte *lr0, G *g, int32 skip, uintptr else if(pcbuf != nil) pcbuf[n++] = pc; else { - if(runtime·showframe(f)) { + if(runtime·showframe(f, gp == m->curg)) { // Print during crash. // main(0x1, 0x2, 0x3) // /home/rsc/go/src/runtime/x.go:23 +0xf tracepc = pc; // back up to CALL instruction for funcline. 
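// The return address on the stack points just *past* the CALL, which
// can land on the next source line (or even the next function);
// backing tracepc up by one instruction makes the line-number lookup
// report the call site itself. On ARM every instruction is 4 bytes,
// hence the fixed-width adjustment below:
//
//	tracepc = pc;
//	if(n > 0 && pc > f->entry && !waspanic)
//		tracepc -= sizeof(uintptr);	// one fixed-width instruction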
if(n > 0 && pc > f->entry && !waspanic) tracepc -= sizeof(uintptr); + if(m->throwing && gp == m->curg) + runtime·printf("[fp=%p] ", fp); runtime·printf("%S(", f->name); - for(i = 0; i < f->args; i++) { + for(i = 0; i < f->args/sizeof(uintptr); i++) { if(i != 0) runtime·prints(", "); runtime·printhex(((uintptr*)fp)[1+i]); @@ -146,22 +113,22 @@ runtime·gentraceback(byte *pc0, byte *sp, byte *lr0, G *g, int32 skip, uintptr waspanic = f->entry == (uintptr)runtime·sigpanic; - if(pcbuf == nil && f->entry == (uintptr)runtime·newstack && g == m->g0) { - runtime·printf("----- newstack called from goroutine %d -----\n", m->curg->goid); + if(pcbuf == nil && f->entry == (uintptr)runtime·newstack && gp == m->g0) { + runtime·printf("----- newstack called from goroutine %D -----\n", m->curg->goid); pc = (uintptr)m->morepc; sp = (byte*)m->moreargp - sizeof(void*); lr = (uintptr)m->morebuf.pc; - fp = m->morebuf.sp; - g = m->curg; - stk = (Stktop*)g->stackbase; + fp = (byte*)m->morebuf.sp; + gp = m->curg; + stk = (Stktop*)gp->stackbase; continue; } - if(pcbuf == nil && f->entry == (uintptr)runtime·lessstack && g == m->g0) { - runtime·printf("----- lessstack called from goroutine %d -----\n", m->curg->goid); - g = m->curg; - stk = (Stktop*)g->stackbase; - sp = stk->gobuf.sp; + if(pcbuf == nil && f->entry == (uintptr)runtime·lessstack && gp == m->g0) { + runtime·printf("----- lessstack called from goroutine %D -----\n", m->curg->goid); + gp = m->curg; + stk = (Stktop*)gp->stackbase; + sp = (byte*)stk->gobuf.sp; pc = (uintptr)stk->gobuf.pc; fp = nil; lr = 0; @@ -182,9 +149,21 @@ runtime·gentraceback(byte *pc0, byte *sp, byte *lr0, G *g, int32 skip, uintptr // If this was deferproc or newproc, the caller had an extra 12. if(f->entry == (uintptr)runtime·deferproc || f->entry == (uintptr)runtime·newproc) sp += 12; + + // sighandler saves the lr on stack before faking a call to sigpanic + if(waspanic) { + x = *(uintptr *)sp; + sp += 4; + f = runtime·findfunc(pc); + if (f == nil) { + pc = x; + } else if (f->frame == 0) + lr = x; + } } - if(pcbuf == nil && (pc = g->gopc) != 0 && (f = runtime·findfunc(pc)) != nil && g->goid != 1) { + if(pcbuf == nil && (pc = gp->gopc) != 0 && (f = runtime·findfunc(pc)) != nil + && runtime·showframe(f, gp == m->curg) && gp->goid != 1) { runtime·printf("created by %S\n", f->name); tracepc = pc; // back up to CALL instruction for funcline. if(n > 0 && pc > f->entry) @@ -199,9 +178,15 @@ runtime·gentraceback(byte *pc0, byte *sp, byte *lr0, G *g, int32 skip, uintptr } void -runtime·traceback(byte *pc0, byte *sp, byte *lr, G *g) +runtime·traceback(byte *pc0, byte *sp, byte *lr, G *gp) { - runtime·gentraceback(pc0, sp, lr, g, 0, nil, 100); + if(gp->status == Gsyscall) { + // Override signal registers if blocked in system call. + pc0 = gp->sched.pc; + sp = (byte*)gp->sched.sp; + lr = nil; + } + runtime·gentraceback(pc0, sp, lr, gp, 0, nil, 100); } // func caller(n int) (pc uintptr, file string, line int, ok bool) diff --git a/src/pkg/runtime/traceback_x86.c b/src/pkg/runtime/traceback_x86.c index be35bab00..72603ae8e 100644 --- a/src/pkg/runtime/traceback_x86.c +++ b/src/pkg/runtime/traceback_x86.c @@ -8,7 +8,6 @@ #include "arch_GOARCH.h" #include "malloc.h" -static uintptr isclosureentry(uintptr); void runtime·deferproc(void); void runtime·newproc(void); void runtime·newstack(void); @@ -23,9 +22,8 @@ void runtime·sigpanic(void); // A little clunky to merge the two but avoids duplicating // the code and all its subtlety. 
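// Concretely, gentraceback prints a human-readable trace when pcbuf is
// nil and silently records PCs when it is not, so the two entry points
// in this file each reduce to a single call (both visible below):
//
//	runtime·gentraceback(pc0, sp, nil, gp, 0, nil, 100);		// traceback
//	return runtime·gentraceback(pc, sp, nil, g, skip, pcbuf, m);	// callers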
int32 -runtime·gentraceback(byte *pc0, byte *sp, byte *lr0, G *g, int32 skip, uintptr *pcbuf, int32 max) +runtime·gentraceback(byte *pc0, byte *sp, byte *lr0, G *gp, int32 skip, uintptr *pcbuf, int32 max) { - byte *p; int32 i, n, iter, sawnewstack; uintptr pc, lr, tracepc; byte *fp; @@ -40,29 +38,22 @@ runtime·gentraceback(byte *pc0, byte *sp, byte *lr0, G *g, int32 skip, uintptr waspanic = false; // If the PC is goexit, the goroutine hasn't started yet. - if(pc0 == g->sched.pc && sp == g->sched.sp && pc0 == (byte*)runtime·goexit) { + if(pc0 == gp->sched.pc && sp == (byte*)gp->sched.sp && pc0 == (byte*)runtime·goexit && gp->fnstart != nil) { fp = sp; lr = pc; - pc = (uintptr)g->entry; + pc = (uintptr)gp->fnstart->fn; } // If the PC is zero, it's likely a nil function call. // Start in the caller's frame. if(pc == 0) { - pc = lr; - lr = 0; - } - - // If the PC is zero, it's likely a nil function call. - // Start in the caller's frame. - if(pc == 0) { pc = *(uintptr*)sp; sp += sizeof(uintptr); } n = 0; sawnewstack = 0; - stk = (Stktop*)g->stackbase; + stk = (Stktop*)gp->stackbase; for(iter = 0; iter < 100 && n < max; iter++) { // iter avoids looping forever // Typically: // pc is the PC of the running function. @@ -74,41 +65,16 @@ runtime·gentraceback(byte *pc0, byte *sp, byte *lr0, G *g, int32 skip, uintptr if(pc == (uintptr)runtime·lessstack) { // Hit top of stack segment. Unwind to next segment. pc = (uintptr)stk->gobuf.pc; - sp = stk->gobuf.sp; + sp = (byte*)stk->gobuf.sp; lr = 0; fp = nil; - if(pcbuf == nil) + if(pcbuf == nil && runtime·showframe(nil, gp == m->curg)) runtime·printf("----- stack segment boundary -----\n"); stk = (Stktop*)stk->stackbase; continue; } - if(pc <= 0x1000 || (f = runtime·findfunc(pc)) == nil) { - // Dangerous, but worthwhile: see if this is a closure: - // ADDQ $wwxxyyzz, SP; RET - // [48] 81 c4 zz yy xx ww c3 - // The 0x48 byte is only on amd64. - p = (byte*)pc; - // We check p < p+8 to avoid wrapping and faulting if we lose track. - if(runtime·mheap.arena_start < p && p < p+8 && p+8 < runtime·mheap.arena_used && // pointer in allocated memory - (sizeof(uintptr) != 8 || *p++ == 0x48) && // skip 0x48 byte on amd64 - p[0] == 0x81 && p[1] == 0xc4 && p[6] == 0xc3) { - sp += *(uint32*)(p+2); - pc = *(uintptr*)sp; - sp += sizeof(uintptr); - lr = 0; - fp = nil; - continue; - } - - // Closure at top of stack, not yet started. - if(lr == (uintptr)runtime·goexit && (pc = isclosureentry(pc)) != 0) { - fp = sp; - continue; - } - - // Unknown pc: stop. + if(pc <= 0x1000 || (f = runtime·findfunc(pc)) == nil) break; - } // Found an actual function. if(fp == nil) { @@ -126,7 +92,7 @@ runtime·gentraceback(byte *pc0, byte *sp, byte *lr0, G *g, int32 skip, uintptr else if(pcbuf != nil) pcbuf[n++] = pc; else { - if(runtime·showframe(f)) { + if(runtime·showframe(f, gp == m->curg)) { // Print during crash. // main(0x1, 0x2, 0x3) // /home/rsc/go/src/runtime/x.go:23 +0xf @@ -134,8 +100,10 @@ runtime·gentraceback(byte *pc0, byte *sp, byte *lr0, G *g, int32 skip, uintptr tracepc = pc; // back up to CALL instruction for funcline. 
if(n > 0 && pc > f->entry && !waspanic) tracepc--; + if(m->throwing && gp == m->curg) + runtime·printf("[fp=%p] ", fp); runtime·printf("%S(", f->name); - for(i = 0; i < f->args; i++) { + for(i = 0; i < f->args/sizeof(uintptr); i++) { if(i != 0) runtime·prints(", "); runtime·printhex(((uintptr*)fp)[i]); @@ -161,27 +129,27 @@ runtime·gentraceback(byte *pc0, byte *sp, byte *lr0, G *g, int32 skip, uintptr if(f->entry == (uintptr)runtime·newstack) sawnewstack = 1; - if(pcbuf == nil && f->entry == (uintptr)runtime·morestack && g == m->g0 && sawnewstack) { + if(pcbuf == nil && f->entry == (uintptr)runtime·morestack && gp == m->g0 && sawnewstack) { // The fact that we saw newstack means that morestack // has managed to record its information in m, so we can // use it to keep unwinding the stack. - runtime·printf("----- morestack called from goroutine %d -----\n", m->curg->goid); + runtime·printf("----- morestack called from goroutine %D -----\n", m->curg->goid); pc = (uintptr)m->morepc; - sp = m->morebuf.sp - sizeof(void*); + sp = (byte*)m->morebuf.sp - sizeof(void*); lr = (uintptr)m->morebuf.pc; - fp = m->morebuf.sp; + fp = (byte*)m->morebuf.sp; sawnewstack = 0; - g = m->curg; - stk = (Stktop*)g->stackbase; + gp = m->curg; + stk = (Stktop*)gp->stackbase; continue; } - if(pcbuf == nil && f->entry == (uintptr)runtime·lessstack && g == m->g0) { + if(pcbuf == nil && f->entry == (uintptr)runtime·lessstack && gp == m->g0) { // Lessstack is running on scheduler stack. Switch to original goroutine. - runtime·printf("----- lessstack called from goroutine %d -----\n", m->curg->goid); - g = m->curg; - stk = (Stktop*)g->stackbase; - sp = stk->gobuf.sp; + runtime·printf("----- lessstack called from goroutine %D -----\n", m->curg->goid); + gp = m->curg; + stk = (Stktop*)gp->stackbase; + sp = (byte*)stk->gobuf.sp; pc = (uintptr)stk->gobuf.pc; fp = nil; lr = 0; @@ -196,7 +164,8 @@ runtime·gentraceback(byte *pc0, byte *sp, byte *lr0, G *g, int32 skip, uintptr } // Show what created goroutine, except main goroutine (goid 1). - if(pcbuf == nil && (pc = g->gopc) != 0 && (f = runtime·findfunc(pc)) != nil && g->goid != 1) { + if(pcbuf == nil && (pc = gp->gopc) != 0 && (f = runtime·findfunc(pc)) != nil + && runtime·showframe(f, gp == m->curg) && gp->goid != 1) { runtime·printf("created by %S\n", f->name); tracepc = pc; // back up to CALL instruction for funcline. if(n > 0 && pc > f->entry) @@ -211,9 +180,14 @@ runtime·gentraceback(byte *pc0, byte *sp, byte *lr0, G *g, int32 skip, uintptr } void -runtime·traceback(byte *pc0, byte *sp, byte*, G *g) +runtime·traceback(byte *pc0, byte *sp, byte*, G *gp) { - runtime·gentraceback(pc0, sp, nil, g, 0, nil, 100); + if(gp->status == Gsyscall) { + // Override signal registers if blocked in system call. 
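// gp->sched was saved by entersyscall, so it records the last Go frame
// before the goroutine handed control to the kernel; starting the
// unwind there yields a meaningful Go stack instead of whatever the
// signal context captured mid-syscall:
//
//	if(gp->status == Gsyscall) {
//		pc0 = gp->sched.pc;
//		sp = (byte*)gp->sched.sp;
//	}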
+ pc0 = gp->sched.pc; + sp = (byte*)gp->sched.sp; + } + runtime·gentraceback(pc0, sp, nil, gp, 0, nil, 100); } int32 @@ -227,77 +201,3 @@ runtime·callers(int32 skip, uintptr *pcbuf, int32 m) return runtime·gentraceback(pc, sp, nil, g, skip, pcbuf, m); } - -static uintptr -isclosureentry(uintptr pc) -{ - byte *p; - int32 i, siz; - - p = (byte*)pc; - if(p < runtime·mheap.arena_start || p+32 > runtime·mheap.arena_used) - return 0; - - if(*p == 0xe8) { - // CALL fn - return pc+5+*(int32*)(p+1); - } - - if(sizeof(uintptr) == 8 && p[0] == 0x48 && p[1] == 0xb9 && p[10] == 0xff && p[11] == 0xd1) { - // MOVQ $fn, CX; CALL *CX - return *(uintptr*)(p+2); - } - - // SUBQ $siz, SP - if((sizeof(uintptr) == 8 && *p++ != 0x48) || *p++ != 0x81 || *p++ != 0xec) - return 0; - siz = *(uint32*)p; - p += 4; - - // MOVQ $q, SI - if((sizeof(uintptr) == 8 && *p++ != 0x48) || *p++ != 0xbe) - return 0; - p += sizeof(uintptr); - - // MOVQ SP, DI - if((sizeof(uintptr) == 8 && *p++ != 0x48) || *p++ != 0x89 || *p++ != 0xe7) - return 0; - - // CLD on 32-bit - if(sizeof(uintptr) == 4 && *p++ != 0xfc) - return 0; - - if(siz <= 4*sizeof(uintptr)) { - // MOVSQ... - for(i=0; i<siz; i+=sizeof(uintptr)) - if((sizeof(uintptr) == 8 && *p++ != 0x48) || *p++ != 0xa5) - return 0; - } else { - // MOVQ $(siz/8), CX [32-bit immediate siz/8] - if((sizeof(uintptr) == 8 && *p++ != 0x48) || *p++ != 0xc7 || *p++ != 0xc1) - return 0; - p += 4; - - // REP MOVSQ - if(*p++ != 0xf3 || (sizeof(uintptr) == 8 && *p++ != 0x48) || *p++ != 0xa5) - return 0; - } - - // CALL fn - if(*p == 0xe8) { - p++; - return (uintptr)p+4 + *(int32*)p; - } - - // MOVQ $fn, CX; CALL *CX - if(sizeof(uintptr) != 8 || *p++ != 0x48 || *p++ != 0xb9) - return 0; - - pc = *(uintptr*)p; - p += 8; - - if(*p++ != 0xff || *p != 0xd1) - return 0; - - return pc; -} diff --git a/src/pkg/runtime/type.go b/src/pkg/runtime/type.go index 6af6b237f..374754afa 100644 --- a/src/pkg/runtime/type.go +++ b/src/pkg/runtime/type.go @@ -14,24 +14,25 @@ package runtime import "unsafe" -type commonType struct { +type rtype struct { size uintptr hash uint32 _ uint8 align uint8 fieldAlign uint8 kind uint8 - alg *uintptr + alg unsafe.Pointer + gc unsafe.Pointer string *string *uncommonType - ptrToThis *interface{} + ptrToThis *rtype } type _method struct { name *string pkgPath *string - mtyp *interface{} - typ *interface{} + mtyp *rtype + typ *rtype ifn unsafe.Pointer tfn unsafe.Pointer } @@ -45,10 +46,10 @@ type uncommonType struct { type _imethod struct { name *string pkgPath *string - typ *interface{} + typ *rtype } type interfaceType struct { - commonType + rtype methods []_imethod } diff --git a/src/pkg/runtime/type.h b/src/pkg/runtime/type.h index c1d9facd1..769a8071b 100644 --- a/src/pkg/runtime/type.h +++ b/src/pkg/runtime/type.h @@ -5,21 +5,20 @@ /* * Runtime type representation; master is type.go * - * The *Types here correspond 1-1 to type.go's *Type's, but are - * prefixed with an extra header of 2 pointers, corresponding to the - * interface{} structure, which itself is called type Type again on - * the Go side. + * The Type*s here correspond 1-1 to type.go's *rtype. 
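The runtime·callers entry point above is what backs the public runtime.Callers API: it captures the current pc/sp and hands them to gentraceback with a pcbuf to fill instead of printing. A short user-level example of the same machinery, runnable as-is:

	package main

	import (
		"fmt"
		"runtime"
	)

	func main() {
		pc := make([]uintptr, 8)
		n := runtime.Callers(0, pc) // fill pc with up to 8 return addresses
		for _, p := range pc[:n] {
			if f := runtime.FuncForPC(p); f != nil {
				fmt.Println(f.Name()) // symbolize each frame
			}
		}
	}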
*/ -typedef struct CommonType CommonType; +typedef struct Type Type; typedef struct UncommonType UncommonType; typedef struct InterfaceType InterfaceType; typedef struct Method Method; typedef struct IMethod IMethod; typedef struct SliceType SliceType; typedef struct FuncType FuncType; +typedef struct PtrType PtrType; -struct CommonType +// Needs to be in sync with typekind.h/CommonSize +struct Type { uintptr size; uint32 hash; @@ -28,42 +27,12 @@ struct CommonType uint8 fieldAlign; uint8 kind; Alg *alg; + void *gc; String *string; UncommonType *x; Type *ptrto; }; -enum { - KindBool = 1, - KindInt, - KindInt8, - KindInt16, - KindInt32, - KindInt64, - KindUint, - KindUint8, - KindUint16, - KindUint32, - KindUint64, - KindUintptr, - KindFloat32, - KindFloat64, - KindComplex64, - KindComplex128, - KindArray, - KindChan, - KindFunc, - KindInterface, - KindMap, - KindPtr, - KindSlice, - KindString, - KindStruct, - KindUnsafePointer, - - KindNoPointers = 1<<7, -}; - struct Method { String *name; @@ -82,13 +51,6 @@ struct UncommonType Method m[]; }; -struct Type -{ - void *type; // interface{} value - void *ptr; - CommonType; -}; - struct IMethod { String *name; @@ -130,3 +92,9 @@ struct FuncType Slice in; Slice out; }; + +struct PtrType +{ + Type; + Type *elem; +}; diff --git a/src/pkg/runtime/typekind.h b/src/pkg/runtime/typekind.h new file mode 100644 index 000000000..9bae2a871 --- /dev/null +++ b/src/pkg/runtime/typekind.h @@ -0,0 +1,41 @@ +// Copyright 2012 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// PtrSize vs sizeof(void*): This file is also included from src/cmd/ld/... +// which defines PtrSize to be different from sizeof(void*) when crosscompiling. + +enum { + KindBool = 1, + KindInt, + KindInt8, + KindInt16, + KindInt32, + KindInt64, + KindUint, + KindUint8, + KindUint16, + KindUint32, + KindUint64, + KindUintptr, + KindFloat32, + KindFloat64, + KindComplex64, + KindComplex128, + KindArray, + KindChan, + KindFunc, + KindInterface, + KindMap, + KindPtr, + KindSlice, + KindString, + KindStruct, + KindUnsafePointer, + + KindNoPointers = 1<<7, + + // size of Type structure. + CommonSize = 6*PtrSize + 8, +}; + diff --git a/src/pkg/runtime/vdso_linux_amd64.c b/src/pkg/runtime/vdso_linux_amd64.c new file mode 100644 index 000000000..ab68c23c3 --- /dev/null +++ b/src/pkg/runtime/vdso_linux_amd64.c @@ -0,0 +1,331 @@ +// Copyright 2012 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#include "runtime.h" + +#define AT_SYSINFO_EHDR 33 +#define AT_NULL 0 /* End of vector */ +#define PT_LOAD 1 /* Loadable program segment */ +#define PT_DYNAMIC 2 /* Dynamic linking information */ +#define DT_NULL 0 /* Marks end of dynamic section */ +#define DT_STRTAB 5 /* Address of string table */ +#define DT_SYMTAB 6 /* Address of symbol table */ +#define DT_VERSYM 0x6ffffff0 +#define DT_VERDEF 0x6ffffffc + +#define VER_FLG_BASE 0x1 /* Version definition of file itself */ +#define SHN_UNDEF 0 /* Undefined section */ +#define SHT_DYNSYM 11 /* Dynamic linker symbol table */ +#define STT_FUNC 2 /* Symbol is a code object */ +#define STB_GLOBAL 1 /* Global symbol */ +#define STB_WEAK 2 /* Weak symbol */ + +/* How to extract and insert information held in the st_info field. 
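The CommonSize constant above exists so the linker, which includes typekind.h but not type.h, can know sizeof(struct Type) without seeing the definition. The formula works out because the uint32 hash plus the four uint8 fields always pack into exactly 8 bytes, leaving six pointer-sized words (size plus the five pointers alg, gc, string, x, ptrto). A quick self-check with a mirrored layout in Go (rtypeMirror is a stand-in, not the real runtime type):

	package main

	import (
		"fmt"
		"unsafe"
	)

	// rtypeMirror copies the field order of struct Type / type.go's rtype.
	type rtypeMirror struct {
		size       uintptr
		hash       uint32
		_          uint8
		align      uint8
		fieldAlign uint8
		kind       uint8
		alg        unsafe.Pointer
		gc         unsafe.Pointer
		str        *string
		x          unsafe.Pointer // uncommonType
		ptrToThis  unsafe.Pointer
	}

	func main() {
		ptrSize := unsafe.Sizeof(uintptr(0))
		fmt.Println(unsafe.Sizeof(rtypeMirror{}) == 6*ptrSize+8) // true on 32- and 64-bit
	}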
*/ +#define ELF64_ST_BIND(val) (((byte) (val)) >> 4) +#define ELF64_ST_TYPE(val) ((val) & 0xf) + +#define EI_NIDENT (16) + +typedef uint16 Elf64_Half; +typedef uint32 Elf64_Word; +typedef int32 Elf64_Sword; +typedef uint64 Elf64_Xword; +typedef int64 Elf64_Sxword; +typedef uint64 Elf64_Addr; +typedef uint64 Elf64_Off; +typedef uint16 Elf64_Section; +typedef Elf64_Half Elf64_Versym; + + +typedef struct +{ + Elf64_Word st_name; + byte st_info; + byte st_other; + Elf64_Section st_shndx; + Elf64_Addr st_value; + Elf64_Xword st_size; +} Elf64_Sym; + +typedef struct +{ + Elf64_Half vd_version; /* Version revision */ + Elf64_Half vd_flags; /* Version information */ + Elf64_Half vd_ndx; /* Version Index */ + Elf64_Half vd_cnt; /* Number of associated aux entries */ + Elf64_Word vd_hash; /* Version name hash value */ + Elf64_Word vd_aux; /* Offset in bytes to verdaux array */ + Elf64_Word vd_next; /* Offset in bytes to next verdef entry */ +} Elf64_Verdef; + +typedef struct +{ + byte e_ident[EI_NIDENT]; /* Magic number and other info */ + Elf64_Half e_type; /* Object file type */ + Elf64_Half e_machine; /* Architecture */ + Elf64_Word e_version; /* Object file version */ + Elf64_Addr e_entry; /* Entry point virtual address */ + Elf64_Off e_phoff; /* Program header table file offset */ + Elf64_Off e_shoff; /* Section header table file offset */ + Elf64_Word e_flags; /* Processor-specific flags */ + Elf64_Half e_ehsize; /* ELF header size in bytes */ + Elf64_Half e_phentsize; /* Program header table entry size */ + Elf64_Half e_phnum; /* Program header table entry count */ + Elf64_Half e_shentsize; /* Section header table entry size */ + Elf64_Half e_shnum; /* Section header table entry count */ + Elf64_Half e_shstrndx; /* Section header string table index */ +} Elf64_Ehdr; + +typedef struct +{ + Elf64_Word p_type; /* Segment type */ + Elf64_Word p_flags; /* Segment flags */ + Elf64_Off p_offset; /* Segment file offset */ + Elf64_Addr p_vaddr; /* Segment virtual address */ + Elf64_Addr p_paddr; /* Segment physical address */ + Elf64_Xword p_filesz; /* Segment size in file */ + Elf64_Xword p_memsz; /* Segment size in memory */ + Elf64_Xword p_align; /* Segment alignment */ +} Elf64_Phdr; + +typedef struct +{ + Elf64_Word sh_name; /* Section name (string tbl index) */ + Elf64_Word sh_type; /* Section type */ + Elf64_Xword sh_flags; /* Section flags */ + Elf64_Addr sh_addr; /* Section virtual addr at execution */ + Elf64_Off sh_offset; /* Section file offset */ + Elf64_Xword sh_size; /* Section size in bytes */ + Elf64_Word sh_link; /* Link to another section */ + Elf64_Word sh_info; /* Additional section information */ + Elf64_Xword sh_addralign; /* Section alignment */ + Elf64_Xword sh_entsize; /* Entry size if section holds table */ +} Elf64_Shdr; + +typedef struct +{ + Elf64_Sxword d_tag; /* Dynamic entry type */ + union + { + Elf64_Xword d_val; /* Integer value */ + Elf64_Addr d_ptr; /* Address value */ + } d_un; +} Elf64_Dyn; + +typedef struct +{ + Elf64_Word vda_name; /* Version or dependency names */ + Elf64_Word vda_next; /* Offset in bytes to next verdaux entry */ +} Elf64_Verdaux; + +typedef struct +{ + uint64 a_type; /* Entry type */ + union + { + uint64 a_val; /* Integer value */ + } a_un; +} Elf64_auxv_t; + + +typedef struct { + byte* name; + void** var_ptr; +} symbol_key; + +typedef struct { + byte* version; + int32 ver_hash; +} version_key; + +struct vdso_info { + bool valid; + + /* Load information */ + uintptr load_addr; + uintptr load_offset; /* load_addr - recorded vaddr */ + + /* 
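The two st_info macros above unpack one byte: ELF stores a symbol's binding in the high nibble and its type in the low nibble. The same helpers in Go:

	// ELF64_ST_BIND / ELF64_ST_TYPE, spelled out: binding (global, weak,
	// local) in the high nibble, symbol type (func, object, ...) in the low.
	func elfSTBind(info byte) byte { return info >> 4 }
	func elfSTType(info byte) byte { return info & 0xf }

A vDSO symbol is usable when elfSTType reports STT_FUNC (2) and elfSTBind reports STB_GLOBAL (1) or STB_WEAK (2), which is exactly the filter vdso_parse_symbols applies below.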
Symbol table */ + int32 num_sym; + Elf64_Sym *symtab; + const byte *symstrings; + + /* Version table */ + Elf64_Versym *versym; + Elf64_Verdef *verdef; +}; + +static version_key linux26 = { (byte*)"LINUX_2.6", 0x3ae75f6 }; + +// initialize with vsyscall fallbacks +void* runtime·__vdso_time_sym = (void*)0xffffffffff600400ULL; +void* runtime·__vdso_gettimeofday_sym = (void*)0xffffffffff600000ULL; +void* runtime·__vdso_clock_gettime_sym = (void*)0; + +#define SYM_KEYS_COUNT 3 +static symbol_key sym_keys[] = { + { (byte*)"__vdso_time", &runtime·__vdso_time_sym }, + { (byte*)"__vdso_gettimeofday", &runtime·__vdso_gettimeofday_sym }, + { (byte*)"__vdso_clock_gettime", &runtime·__vdso_clock_gettime_sym }, +}; + +static void +vdso_init_from_sysinfo_ehdr(struct vdso_info *vdso_info, Elf64_Ehdr* hdr) +{ + uint64 i; + bool found_vaddr = false; + + vdso_info->load_addr = (uintptr) hdr; + + Elf64_Phdr *pt = (Elf64_Phdr*)(vdso_info->load_addr + hdr->e_phoff); + Elf64_Shdr *sh = (Elf64_Shdr*)(vdso_info->load_addr + hdr->e_shoff); + Elf64_Dyn *dyn = 0; + + for(i=0; i<hdr->e_shnum; i++) { + if(sh[i].sh_type == SHT_DYNSYM) { + vdso_info->num_sym = sh[i].sh_size / sizeof(Elf64_Sym); + } + } + + // We need two things from the segment table: the load offset + // and the dynamic table. + for(i=0; i<hdr->e_phnum; i++) { + if(pt[i].p_type == PT_LOAD && found_vaddr == false) { + found_vaddr = true; + vdso_info->load_offset = (uintptr)hdr + + (uintptr)pt[i].p_offset + - (uintptr)pt[i].p_vaddr; + } else if(pt[i].p_type == PT_DYNAMIC) { + dyn = (Elf64_Dyn*)((uintptr)hdr + pt[i].p_offset); + } + } + + if(found_vaddr == false || dyn == nil) + return; // Failed + + // Fish out the useful bits of the dynamic table. + for(i=0; dyn[i].d_tag!=DT_NULL; i++) { + switch(dyn[i].d_tag) { + case DT_STRTAB: + vdso_info->symstrings = (const byte *) + ((uintptr)dyn[i].d_un.d_ptr + + vdso_info->load_offset); + break; + case DT_SYMTAB: + vdso_info->symtab = (Elf64_Sym *) + ((uintptr)dyn[i].d_un.d_ptr + + vdso_info->load_offset); + break; + case DT_VERSYM: + vdso_info->versym = (Elf64_Versym *) + ((uintptr)dyn[i].d_un.d_ptr + + vdso_info->load_offset); + break; + case DT_VERDEF: + vdso_info->verdef = (Elf64_Verdef *) + ((uintptr)dyn[i].d_un.d_ptr + + vdso_info->load_offset); + break; + } + } + if(vdso_info->symstrings == nil || vdso_info->symtab == nil) + return; // Failed + + if(vdso_info->verdef == nil) + vdso_info->versym = 0; + + // That's all we need. + vdso_info->valid = true; +} + +static int32 +vdso_find_version(struct vdso_info *vdso_info, version_key* ver) +{ + if(vdso_info->valid == false) { + return 0; + } + Elf64_Verdef *def = vdso_info->verdef; + while(true) { + if((def->vd_flags & VER_FLG_BASE) == 0) { + Elf64_Verdaux *aux = (Elf64_Verdaux*)((byte *)def + def->vd_aux); + if(def->vd_hash == ver->ver_hash && + runtime·strcmp(ver->version, vdso_info->symstrings + aux->vda_name) == 0) { + return def->vd_ndx & 0x7fff; + } + } + + if(def->vd_next == 0) { + break; + } + def = (Elf64_Verdef *)((byte *)def + def->vd_next); + } + return 0; +} + +static void +vdso_parse_symbols(struct vdso_info *vdso_info, int32 version) +{ + int32 i, j; + + if(vdso_info->valid == false) + return; + + for(i=0; i<vdso_info->num_sym; i++) { + Elf64_Sym *sym = &vdso_info->symtab[i]; + + // Check for a defined global or weak function w/ right name. 
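The 0x3ae75f6 constant paired with "LINUX_2.6" in the table above is not arbitrary: it is the System V ELF hash of that version string, precomputed so verdef entries can be rejected on a cheap integer compare before any string comparison. A sketch of that hash, the classic elf_hash from the SysV gABI:

	// elfHash implements the standard SysV ELF string hash.
	func elfHash(name string) uint32 {
		var h, g uint32
		for i := 0; i < len(name); i++ {
			h = h<<4 + uint32(name[i])
			if g = h & 0xf0000000; g != 0 {
				h ^= g >> 24
			}
			h &^= g
		}
		return h
	}

elfHash("LINUX_2.6") evaluates to 0x3ae75f6, matching the linux26 entry.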
+ if(ELF64_ST_TYPE(sym->st_info) != STT_FUNC)
+ continue;
+ if(ELF64_ST_BIND(sym->st_info) != STB_GLOBAL &&
+ ELF64_ST_BIND(sym->st_info) != STB_WEAK)
+ continue;
+ if(sym->st_shndx == SHN_UNDEF)
+ continue;
+
+ for(j=0; j<SYM_KEYS_COUNT; j++) {
+ if(runtime·strcmp(sym_keys[j].name, vdso_info->symstrings + sym->st_name) != 0)
+ continue;
+
+ // Check symbol version.
+ if(vdso_info->versym != nil && version != 0
+ && (vdso_info->versym[i] & 0x7fff) != version)
+ continue;
+
+ *sym_keys[j].var_ptr = (void *)(vdso_info->load_offset + sym->st_value);
+ }
+ }
+}
+
+static void
+runtime·linux_setup_vdso(int32 argc, uint8** argv)
+{
+ struct vdso_info vdso_info;
+
+ // skip argv
+ byte **p = argv;
+ p = &p[argc+1];
+
+ // skip envp to get to ELF auxiliary vector.
+ for(; *p!=0; p++) {}
+
+ // skip NULL separator
+ p++;
+
+ // now, p points to auxv
+ Elf64_auxv_t *elf_auxv = (Elf64_auxv_t*) p;
+
+ for(int32 i=0; elf_auxv[i].a_type!=AT_NULL; i++) {
+ if(elf_auxv[i].a_type == AT_SYSINFO_EHDR) {
+ if(elf_auxv[i].a_un.a_val == 0) {
+ // Something went wrong
+ return;
+ }
+ vdso_init_from_sysinfo_ehdr(&vdso_info, (Elf64_Ehdr*)elf_auxv[i].a_un.a_val);
+ vdso_parse_symbols(&vdso_info, vdso_find_version(&vdso_info, &linux26));
+ return;
+ }
+ }
+}
+
+void (*runtime·sysargs)(int32, uint8**) = runtime·linux_setup_vdso;
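runtime·linux_setup_vdso above locates the vDSO without a single system call: past argv and its NULL terminator, past the environment and its NULL terminator, the kernel places the ELF auxiliary vector, a list of (type, value) pairs ended by AT_NULL, and the AT_SYSINFO_EHDR entry holds the base address of the mapped vDSO image. The same walk over a flattened auxv, sketched in Go (findVDSO and its []uint64 input are stand-ins for the pointer arithmetic above):

	const (
		atNull        = 0  // AT_NULL: end of the vector
		atSysinfoEhdr = 33 // AT_SYSINFO_EHDR: base of the vDSO mapping
	)

	// findVDSO scans (type, value) pairs and returns the vDSO base, or 0.
	func findVDSO(auxv []uint64) uint64 {
		for i := 0; i+1 < len(auxv) && auxv[i] != atNull; i += 2 {
			if auxv[i] == atSysinfoEhdr {
				return auxv[i+1]
			}
		}
		return 0
	}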
diff --git a/src/pkg/runtime/vlop_arm.s b/src/pkg/runtime/vlop_arm.s
index fc679f0ee..0dedc316a 100644
--- a/src/pkg/runtime/vlop_arm.s
+++ b/src/pkg/runtime/vlop_arm.s
@@ -23,9 +23,6 @@
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
// THE SOFTWARE.
-#define UMULL(Rs,Rm,Rhi,Rlo,S) WORD $((14<<28)|(4<<21)|(S<<20)|(Rhi<<16)|(Rlo<<12)|(Rs<<8)|(9<<4)|Rm)
-#define UMLAL(Rs,Rm,Rhi,Rlo,S) WORD $((14<<28)|(5<<21)|(S<<20)|(Rhi<<16)|(Rlo<<12)|(Rs<<8)|(9<<4)|Rm)
-#define MUL(Rs,Rm,Rd,S) WORD $((14<<28)|(0<<21)|(S<<20)|(Rd<<16)|(Rs<<8)|(9<<4)|Rm)
arg=0
/* replaced use of R10 by R11 because the former can be the data segment base register */
@@ -36,136 +33,15 @@ TEXT _mulv(SB), $0
MOVW 8(FP), R11 /* h0 */
MOVW 12(FP), R4 /* l1 */
MOVW 16(FP), R5 /* h1 */
- UMULL(4, 2, 7, 6, 0)
- MUL(11, 4, 8, 0)
+ MULLU R4, R2, (R7,R6)
+ MUL R11, R4, R8
ADD R8, R7
- MUL(2, 5, 8, 0)
+ MUL R2, R5, R8
ADD R8, R7
MOVW R6, 0(R(arg))
MOVW R7, 4(R(arg))
RET
-
-Q = 0
-N = 1
-D = 2
-CC = 3
-TMP = 11
-
-TEXT save<>(SB), 7, $0
- MOVW R(Q), 0(FP)
- MOVW R(N), 4(FP)
- MOVW R(D), 8(FP)
- MOVW R(CC), 12(FP)
-
- MOVW R(TMP), R(Q) /* numerator */
- MOVW 20(FP), R(D) /* denominator */
- CMP $0, R(D)
- BNE s1
- BL runtime·panicdivide(SB)
-/* MOVW -1(R(D)), R(TMP) /* divide by zero fault */
-s1: RET
-
-TEXT rest<>(SB), 7, $0
- MOVW 0(FP), R(Q)
- MOVW 4(FP), R(N)
- MOVW 8(FP), R(D)
- MOVW 12(FP), R(CC)
-/*
- * return to caller
- * of rest<>
- */
- MOVW 0(R13), R14
- ADD $20, R13
- B (R14)
-
-TEXT div<>(SB), 7, $0
- MOVW $32, R(CC)
-/*
- * skip zeros 8-at-a-time
- */
-e1:
- AND.S $(0xff<<24),R(Q), R(N)
- BNE e2
- SLL $8, R(Q)
- SUB.S $8, R(CC)
- BNE e1
- RET
-e2:
- MOVW $0, R(N)
-
-loop:
-/*
- * shift R(N||Q) left one
- */
- SLL $1, R(N)
- CMP $0, R(Q)
- ORR.LT $1, R(N)
- SLL $1, R(Q)
-
-/*
- * compare numerator to denominator
- * if less, subtract and set quotient bit
- */
- CMP R(D), R(N)
- ORR.HS $1, R(Q)
- SUB.HS R(D), R(N)
- SUB.S $1, R(CC)
- BNE loop
- RET
-
-TEXT _div(SB), 7, $16
- BL save<>(SB)
- CMP $0, R(Q)
- BGE d1
- RSB $0, R(Q), R(Q)
- CMP $0, R(D)
- BGE d2
- RSB $0, R(D), R(D)
-d0:
- BL div<>(SB) /* none/both neg */
- MOVW R(Q), R(TMP)
- B out
-d1:
- CMP $0, R(D)
- BGE d0
- RSB $0, R(D), R(D)
-d2:
- BL div<>(SB) /* one neg */
- RSB $0, R(Q), R(TMP)
- B out
-
-TEXT _mod(SB), 7, $16
- BL save<>(SB)
- CMP $0, R(D)
- RSB.LT $0, R(D), R(D)
- CMP $0, R(Q)
- BGE m1
- RSB $0, R(Q), R(Q)
- BL div<>(SB) /* neg numerator */
- RSB $0, R(N), R(TMP)
- B out
-m1:
- BL div<>(SB) /* pos numerator */
- MOVW R(N), R(TMP)
- B out
-
-TEXT _divu(SB), 7, $16
- BL save<>(SB)
- BL div<>(SB)
- MOVW R(Q), R(TMP)
- B out
-
-TEXT _modu(SB), 7, $16
- BL save<>(SB)
- BL div<>(SB)
- MOVW R(N), R(TMP)
- B out
-
-out:
- BL rest<>(SB)
- B out
-
// trampoline for _sfloat2. passes LR as arg0 and
// saves registers R0-R13 and CPSR on the stack. R0-R12 and CPSR flags can
// be changed by _sfloat2.
@@ -186,5 +62,189 @@ TEXT _sfloat(SB), 7, $64 // 4 arg + 14*4 saved regs + cpsr
MOVM.IA.W (R0), [R1-R12]
MOVW 8(R13), R0
RET
-
+// func udiv(n, d uint32) (q, r uint32)
+// Reference:
+// Sloss, Andrew et al.; ARM System Developer's Guide: Designing and Optimizing System Software
+// Morgan Kaufmann; 1st edition (April 8, 2004), ISBN 978-1558608740
+q = 0 // input d, output q
+r = 1 // input n, output r
+s = 2 // three temporary variables
+m = 3
+a = 11
+// Please be careful when changing this, it is pretty fragile:
+// 1. don't use unconditional branch as the linker is free to reorder the blocks;
+// 2. if a == 11, beware that the linker will use R11 if you use certain instructions.
TEXT udiv<>(SB),7,$-4
+ CLZ R(q), R(s) // find normalizing shift
+ MOVW.S R(q)<<R(s), R(a)
+ ADD R(a)>>25, PC, R(a) // most significant 7 bits of divisor
+ MOVBU.NE (4*36-64)(R(a)), R(a) // 36 == number of inst. between fast_udiv_tab and begin
+
+begin:
+ SUB.S $7, R(s)
+ RSB $0, R(q), R(m) // m = -q
+ MOVW.PL R(a)<<R(s), R(q)
+
+ // 1st Newton iteration
+ MUL.PL R(m), R(q), R(a) // a = -q*d
+ BMI udiv_by_large_d
+ MULAWT R(a), R(q), R(q), R(q) // q approx q-(q*q*d>>32)
+ TEQ R(m)->1, R(m) // check for d=0 or d=1
+
+ // 2nd Newton iteration
+ MUL.NE R(m), R(q), R(a)
+ MOVW.NE $0, R(s)
+ MULAL.NE R(q), R(a), (R(q),R(s))
+ BEQ udiv_by_0_or_1
+
+ // q now accurate enough for a remainder r, 0<=r<3*d
+ MULLU R(q), R(r), (R(q),R(s)) // q = (r * q) >> 32
+ ADD R(m), R(r), R(r) // r = n - d
+ MULA R(m), R(q), R(r), R(r) // r = n - (q+1)*d
+
+ // since 0 <= n-q*d < 3*d; thus -d <= r < 2*d
+ CMN R(m), R(r) // t = r-d
+ SUB.CS R(m), R(r), R(r) // if (t<-d || t>=0) r=r+d
+ ADD.CC $1, R(q)
+ ADD.PL R(m)<<1, R(r)
+ ADD.PL $2, R(q)
+
+ // return, can't use RET here or fast_udiv_tab will be dropped during linking
+ MOVW R14, R15
+
+udiv_by_large_d:
+ // at this point we know d>=2^(31-6)=2^25
+ SUB $4, R(a), R(a)
+ RSB $0, R(s), R(s)
+ MOVW R(a)>>R(s), R(q)
+ MULLU R(q), R(r), (R(q),R(s))
+ MULA R(m), R(q), R(r), R(r)
+
+ // q now accurate enough for a remainder r, 0<=r<4*d
+ CMN R(r)>>1, R(m) // if(r/2 >= d)
+ ADD.CS R(m)<<1, R(r)
+ ADD.CS $2, R(q)
+ CMN R(r), R(m)
+ ADD.CS R(m), R(r)
+ ADD.CS $1, R(q)
+
+ // return, can't use RET here or fast_udiv_tab will be dropped during linking
+ MOVW R14, R15
+
+udiv_by_0_or_1:
+ // carry set if d==1, carry clear if d==0
+ MOVW.CS R(r), R(q)
+ MOVW.CS $0, R(r)
+ BL.CC runtime·panicdivide(SB) // no way back
+
+ // return, can't use RET here or fast_udiv_tab will be dropped during linking
+ MOVW R14, R15
+
+fast_udiv_tab:
+ // var tab [64]byte
+ // tab[0] = 255; for i := 1; i <= 63; i++ { tab[i] = (1<<14)/(64+i) }
+ // laid out here as little-endian uint32s
+ WORD $0xf4f8fcff
+ WORD $0xe6eaedf0
+ WORD $0xdadde0e3
+ WORD $0xcfd2d4d7
+ WORD $0xc5c7cacc
+ WORD $0xbcbec0c3
+ WORD $0xb4b6b8ba
+ WORD $0xacaeb0b2
+ WORD $0xa5a7a8aa
+
WORD $0x9fa0a2a3 + WORD $0x999a9c9d + WORD $0x93949697 + WORD $0x8e8f9092 + WORD $0x898a8c8d + WORD $0x85868788 + WORD $0x81828384 + +// The linker will pass numerator in R(TMP), and it also +// expects the result in R(TMP) +TMP = 11 + +TEXT _divu(SB), 7, $16 + MOVW R(q), 4(R13) + MOVW R(r), 8(R13) + MOVW R(s), 12(R13) + MOVW R(m), 16(R13) + + MOVW R(TMP), R(r) /* numerator */ + MOVW 0(FP), R(q) /* denominator */ + BL udiv<>(SB) + MOVW R(q), R(TMP) + MOVW 4(R13), R(q) + MOVW 8(R13), R(r) + MOVW 12(R13), R(s) + MOVW 16(R13), R(m) + RET + +TEXT _modu(SB), 7, $16 + MOVW R(q), 4(R13) + MOVW R(r), 8(R13) + MOVW R(s), 12(R13) + MOVW R(m), 16(R13) + + MOVW R(TMP), R(r) /* numerator */ + MOVW 0(FP), R(q) /* denominator */ + BL udiv<>(SB) + MOVW R(r), R(TMP) + MOVW 4(R13), R(q) + MOVW 8(R13), R(r) + MOVW 12(R13), R(s) + MOVW 16(R13), R(m) + RET + +TEXT _div(SB),7,$16 + MOVW R(q), 4(R13) + MOVW R(r), 8(R13) + MOVW R(s), 12(R13) + MOVW R(m), 16(R13) + MOVW R(TMP), R(r) /* numerator */ + MOVW 0(FP), R(q) /* denominator */ + CMP $0, R(r) + BGE d1 + RSB $0, R(r), R(r) + CMP $0, R(q) + BGE d2 + RSB $0, R(q), R(q) +d0: + BL udiv<>(SB) /* none/both neg */ + MOVW R(q), R(TMP) + B out +d1: + CMP $0, R(q) + BGE d0 + RSB $0, R(q), R(q) +d2: + BL udiv<>(SB) /* one neg */ + RSB $0, R(q), R(TMP) + B out + +TEXT _mod(SB),7,$16 + MOVW R(q), 4(R13) + MOVW R(r), 8(R13) + MOVW R(s), 12(R13) + MOVW R(m), 16(R13) + MOVW R(TMP), R(r) /* numerator */ + MOVW 0(FP), R(q) /* denominator */ + CMP $0, R(q) + RSB.LT $0, R(q), R(q) + CMP $0, R(r) + BGE m1 + RSB $0, R(r), R(r) + BL udiv<>(SB) /* neg numerator */ + RSB $0, R(r), R(TMP) + B out +m1: + BL udiv<>(SB) /* pos numerator */ + MOVW R(r), R(TMP) +out: + MOVW 4(R13), R(q) + MOVW 8(R13), R(r) + MOVW 12(R13), R(s) + MOVW 16(R13), R(m) + RET diff --git a/src/pkg/runtime/vlop_arm_test.go b/src/pkg/runtime/vlop_arm_test.go new file mode 100644 index 000000000..cd28419ad --- /dev/null +++ b/src/pkg/runtime/vlop_arm_test.go @@ -0,0 +1,70 @@ +// Copyright 2012 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. 
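The fast_udiv_tab block completed above is a 64-entry reciprocal-estimate table: udiv normalizes the divisor, indexes the table with its top seven bits, and then sharpens the rough 8-bit reciprocal with the two Newton iterations seen earlier. The generator is given in the source comment; spelled out as runnable Go that also reproduces the little-endian WORD packing:

	// fastUdivTab rebuilds the table: tab[0] = 255, tab[i] = (1<<14)/(64+i),
	// packed four bytes per WORD, little-endian. The first and last words
	// come out as 0xf4f8fcff and 0x81828384, matching the assembly.
	func fastUdivTab() [16]uint32 {
		var tab [64]byte
		tab[0] = 255
		for i := 1; i <= 63; i++ {
			tab[i] = byte((1 << 14) / (64 + i))
		}
		var words [16]uint32
		for i := range words {
			words[i] = uint32(tab[4*i]) | uint32(tab[4*i+1])<<8 |
				uint32(tab[4*i+2])<<16 | uint32(tab[4*i+3])<<24
		}
		return words
	}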
+
+package runtime_test
+
+import "testing"
+
+// arm soft division benchmarks adapted from
+// http://ridiculousfish.com/files/division_benchmarks.tar.gz
+
+const numeratorsSize = 1 << 21
+
+var numerators = randomNumerators()
+
+type randstate struct {
+ hi, lo uint32
+}
+
+func (r *randstate) rand() uint32 {
+ r.hi = r.hi<<16 + r.hi>>16
+ r.hi += r.lo
+ r.lo += r.hi
+ return r.hi
+}
+
+func randomNumerators() []uint32 {
+ numerators := make([]uint32, numeratorsSize)
+ random := &randstate{2147483563, 2147483563 ^ 0x49616E42}
+ for i := range numerators {
+ numerators[i] = random.rand()
+ }
+ return numerators
+}
+
+func bmUint32Div(divisor uint32, b *testing.B) {
+ var sum uint32
+ for i := 0; i < b.N; i++ {
+ sum += numerators[i&(numeratorsSize-1)] / divisor
+ }
+}
+
+func BenchmarkUint32Div7(b *testing.B) { bmUint32Div(7, b) }
+func BenchmarkUint32Div37(b *testing.B) { bmUint32Div(37, b) }
+func BenchmarkUint32Div123(b *testing.B) { bmUint32Div(123, b) }
+func BenchmarkUint32Div763(b *testing.B) { bmUint32Div(763, b) }
+func BenchmarkUint32Div1247(b *testing.B) { bmUint32Div(1247, b) }
+func BenchmarkUint32Div9305(b *testing.B) { bmUint32Div(9305, b) }
+func BenchmarkUint32Div13307(b *testing.B) { bmUint32Div(13307, b) }
+func BenchmarkUint32Div52513(b *testing.B) { bmUint32Div(52513, b) }
+func BenchmarkUint32Div60978747(b *testing.B) { bmUint32Div(60978747, b) }
+func BenchmarkUint32Div106956295(b *testing.B) { bmUint32Div(106956295, b) }
+
+func bmUint32Mod(divisor uint32, b *testing.B) {
+ var sum uint32
+ for i := 0; i < b.N; i++ {
+ sum += numerators[i&(numeratorsSize-1)] % divisor
+ }
+}
+
+func BenchmarkUint32Mod7(b *testing.B) { bmUint32Mod(7, b) }
+func BenchmarkUint32Mod37(b *testing.B) { bmUint32Mod(37, b) }
+func BenchmarkUint32Mod123(b *testing.B) { bmUint32Mod(123, b) }
+func BenchmarkUint32Mod763(b *testing.B) { bmUint32Mod(763, b) }
+func BenchmarkUint32Mod1247(b *testing.B) { bmUint32Mod(1247, b) }
+func BenchmarkUint32Mod9305(b *testing.B) { bmUint32Mod(9305, b) }
+func BenchmarkUint32Mod13307(b *testing.B) { bmUint32Mod(13307, b) }
+func BenchmarkUint32Mod52513(b *testing.B) { bmUint32Mod(52513, b) }
+func BenchmarkUint32Mod60978747(b *testing.B) { bmUint32Mod(60978747, b) }
+func BenchmarkUint32Mod106956295(b *testing.B) { bmUint32Mod(106956295, b) }
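The _div and _mod wrappers further up reduce signed division to udiv plus sign fixups: the quotient is negated when exactly one operand is negative, and the remainder always takes the numerator's sign (the divisor's sign is discarded). That is the same truncated division Go's / and % operators implement, so the fixup logic can be cross-checked in plain Go (divmod is a stand-in for the assembly pair and glosses over the INT32_MIN corner case):

	// divmod mimics _div/_mod: divide unsigned, then patch the signs.
	func divmod(n, d int32) (q, r int32) {
		un, ud := n, d
		if un < 0 {
			un = -un
		}
		if ud < 0 {
			ud = -ud
		}
		uq, ur := un/ud, un%ud // udiv's two results, quotient and remainder
		if (n < 0) != (d < 0) {
			uq = -uq // exactly one operand negative: negate the quotient
		}
		if n < 0 {
			ur = -ur // the remainder takes the numerator's sign
		}
		return uq, ur
	}

For any n and d != 0 this satisfies n == q*d + r with |r| < |d|, which is what the benchmarks above exercise through the compiler's calls to _div and _mod.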