Diffstat (limited to 'src/pkg/runtime')
55 files changed, 2141 insertions, 700 deletions
diff --git a/src/pkg/runtime/386/asm.s b/src/pkg/runtime/386/asm.s index e2cabef14..a14518839 100644 --- a/src/pkg/runtime/386/asm.s +++ b/src/pkg/runtime/386/asm.s @@ -28,15 +28,18 @@ TEXT _rt0_386(SB),7,$0 TESTL AX, AX JZ 4(PC) CALL AX + // skip runtime·ldt0setup(SB) and tls test after initcgo for non-windows CMPL runtime·iswindows(SB), $0 JEQ ok + // skip runtime·ldt0setup(SB) and tls test on Plan 9 in all cases + CMPL runtime·isplan9(SB), $1 + JEQ ok + // set up %gs CALL runtime·ldt0setup(SB) // store through it, to make sure it works - CMPL runtime·isplan9(SB), $1 - JEQ ok get_tls(BX) MOVL $0x123, g(BX) MOVL runtime·tls0(SB), AX @@ -318,6 +321,45 @@ TEXT runtime·casp(SB), 7, $0 MOVL $1, AX RET +// uint32 xadd(uint32 volatile *val, int32 delta) +// Atomically: +// *val += delta; +// return *val; +TEXT runtime·xadd(SB), 7, $0 + MOVL 4(SP), BX + MOVL 8(SP), AX + MOVL AX, CX + LOCK + XADDL AX, 0(BX) + ADDL CX, AX + RET + +TEXT runtime·xchg(SB), 7, $0 + MOVL 4(SP), BX + MOVL 8(SP), AX + XCHGL AX, 0(BX) + RET + +TEXT runtime·procyield(SB),7,$0 + MOVL 4(SP), AX +again: + PAUSE + SUBL $1, AX + JNZ again + RET + +TEXT runtime·atomicstorep(SB), 7, $0 + MOVL 4(SP), BX + MOVL 8(SP), AX + XCHGL AX, 0(BX) + RET + +TEXT runtime·atomicstore(SB), 7, $0 + MOVL 4(SP), BX + MOVL 8(SP), AX + XCHGL AX, 0(BX) + RET + // void jmpdefer(fn, sp); // called from deferreturn. // 1. pop the caller @@ -460,12 +502,16 @@ TEXT runtime·stackcheck(SB), 7, $0 TEXT runtime·memclr(SB),7,$0 MOVL 4(SP), DI // arg 1 addr MOVL 8(SP), CX // arg 2 count - ADDL $3, CX + MOVL CX, BX + ANDL $3, BX SHRL $2, CX MOVL $0, AX CLD REP STOSL + MOVL BX, CX + REP + STOSB RET TEXT runtime·getcallerpc(SB),7,$0 diff --git a/src/pkg/runtime/386/atomic.c b/src/pkg/runtime/386/atomic.c index c031cc4f6..a4f2a114f 100644 --- a/src/pkg/runtime/386/atomic.c +++ b/src/pkg/runtime/386/atomic.c @@ -10,3 +10,10 @@ runtime·atomicload(uint32 volatile* addr) { return *addr; } + +#pragma textflag 7 +void* +runtime·atomicloadp(void* volatile* addr) +{ + return *addr; +} diff --git a/src/pkg/runtime/386/closure.c b/src/pkg/runtime/386/closure.c index b0d4cc41a..b4d867711 100644 --- a/src/pkg/runtime/386/closure.c +++ b/src/pkg/runtime/386/closure.c @@ -45,7 +45,7 @@ runtime·closure(int32 siz, byte *fn, byte *arg0) q = p + n - siz; if(siz > 0) { - runtime·mcpy(q, (byte*)&arg0, siz); + runtime·memmove(q, (byte*)&arg0, siz); // SUBL $siz, SP *p++ = 0x81; diff --git a/src/pkg/runtime/386/memmove.s b/src/pkg/runtime/386/memmove.s index 471553ba2..203a8187c 100644 --- a/src/pkg/runtime/386/memmove.s +++ b/src/pkg/runtime/386/memmove.s @@ -27,9 +27,6 @@ TEXT runtime·memmove(SB), 7, $0 MOVL to+0(FP), DI MOVL fr+4(FP), SI MOVL n+8(FP), BX - CMPL BX, $0 - JLT fault - /* * check and set for backwards */ @@ -87,12 +84,3 @@ back: MOVL to+0(FP),AX RET -/* - * if called with negative count, - * treat as error rather than - * rotating all of memory - */ -fault: - MOVL $0,SI - MOVL 0(SI), AX - RET diff --git a/src/pkg/runtime/Makefile b/src/pkg/runtime/Makefile index 03f960cb8..64bd2b771 100644 --- a/src/pkg/runtime/Makefile +++ b/src/pkg/runtime/Makefile @@ -120,7 +120,7 @@ $(GOARCH)/asm.h: mkasmh.sh runtime.acid.$(GOARCH) mv -f $@.x $@ goc2c: goc2c.c - quietgcc -o $@ $< + quietgcc -o $@ -I "$(GOROOT)/include" $< "$(GOROOT)/lib/lib9.a" mkversion: mkversion.c quietgcc -o $@ -I "$(GOROOT)/include" $< "$(GOROOT)/lib/lib9.a" diff --git a/src/pkg/runtime/amd64/asm.s b/src/pkg/runtime/amd64/asm.s index 46d82e365..3e3818c10 100644 --- a/src/pkg/runtime/amd64/asm.s +++ 
b/src/pkg/runtime/amd64/asm.s @@ -18,7 +18,8 @@ TEXT _rt0_amd64(SB),7,$-8 TESTQ AX, AX JZ needtls CALL AX - JMP ok + CMPL runtime·iswindows(SB), $0 + JEQ ok needtls: LEAQ runtime·tls0(SB), DI @@ -364,6 +365,45 @@ TEXT runtime·casp(SB), 7, $0 MOVL $1, AX RET +// uint32 xadd(uint32 volatile *val, int32 delta) +// Atomically: +// *val += delta; +// return *val; +TEXT runtime·xadd(SB), 7, $0 + MOVQ 8(SP), BX + MOVL 16(SP), AX + MOVL AX, CX + LOCK + XADDL AX, 0(BX) + ADDL CX, AX + RET + +TEXT runtime·xchg(SB), 7, $0 + MOVQ 8(SP), BX + MOVL 16(SP), AX + XCHGL AX, 0(BX) + RET + +TEXT runtime·procyield(SB),7,$0 + MOVL 8(SP), AX +again: + PAUSE + SUBL $1, AX + JNZ again + RET + +TEXT runtime·atomicstorep(SB), 7, $0 + MOVQ 8(SP), BX + MOVQ 16(SP), AX + XCHGQ AX, 0(BX) + RET + +TEXT runtime·atomicstore(SB), 7, $0 + MOVQ 8(SP), BX + MOVL 16(SP), AX + XCHGL AX, 0(BX) + RET + // void jmpdefer(fn, sp); // called from deferreturn. // 1. pop the caller @@ -413,6 +453,7 @@ TEXT runtime·asmcgocall(SB),7,$0 MOVQ DI, 16(SP) // save g MOVQ DX, 8(SP) // save SP MOVQ BX, DI // DI = first argument in AMD64 ABI + MOVQ BX, CX // CX = first argument in Win64 CALL AX // Restore registers, g, stack pointer. @@ -506,12 +547,16 @@ TEXT runtime·stackcheck(SB), 7, $0 TEXT runtime·memclr(SB),7,$0 MOVQ 8(SP), DI // arg 1 addr MOVQ 16(SP), CX // arg 2 count - ADDQ $7, CX + MOVQ CX, BX + ANDQ $7, BX SHRQ $3, CX MOVQ $0, AX CLD REP STOSQ + MOVQ BX, CX + REP + STOSB RET TEXT runtime·getcallerpc(SB),7,$0 diff --git a/src/pkg/runtime/amd64/atomic.c b/src/pkg/runtime/amd64/atomic.c index c031cc4f6..a4f2a114f 100644 --- a/src/pkg/runtime/amd64/atomic.c +++ b/src/pkg/runtime/amd64/atomic.c @@ -10,3 +10,10 @@ runtime·atomicload(uint32 volatile* addr) { return *addr; } + +#pragma textflag 7 +void* +runtime·atomicloadp(void* volatile* addr) +{ + return *addr; +} diff --git a/src/pkg/runtime/amd64/closure.c b/src/pkg/runtime/amd64/closure.c index 5033468d2..481b4a888 100644 --- a/src/pkg/runtime/amd64/closure.c +++ b/src/pkg/runtime/amd64/closure.c @@ -45,7 +45,7 @@ runtime·closure(int32 siz, byte *fn, byte *arg0) q = p + n - siz; if(siz > 0) { - runtime·mcpy(q, (byte*)&arg0, siz); + runtime·memmove(q, (byte*)&arg0, siz); // SUBQ $siz, SP *p++ = 0x48; diff --git a/src/pkg/runtime/amd64/memmove.s b/src/pkg/runtime/amd64/memmove.s index fc9573f72..e78be8145 100644 --- a/src/pkg/runtime/amd64/memmove.s +++ b/src/pkg/runtime/amd64/memmove.s @@ -28,8 +28,6 @@ TEXT runtime·memmove(SB), 7, $0 MOVQ to+0(FP), DI MOVQ fr+8(FP), SI MOVLQSX n+16(FP), BX - CMPQ BX, $0 - JLT fault /* * check and set for backwards @@ -88,12 +86,3 @@ back: MOVQ to+0(FP),AX RET -/* - * if called with negative count, - * treat as error rather than - * rotating all of memory - */ -fault: - MOVQ $0,SI - MOVQ 0(SI), AX - RET diff --git a/src/pkg/runtime/amd64/traceback.c b/src/pkg/runtime/amd64/traceback.c index d422cb692..3e85d36bd 100644 --- a/src/pkg/runtime/amd64/traceback.c +++ b/src/pkg/runtime/amd64/traceback.c @@ -10,6 +10,7 @@ void runtime·deferproc(void); void runtime·newproc(void); void runtime·newstack(void); void runtime·morestack(void); +void runtime·sigpanic(void); // This code is also used for the 386 tracebacks. // Use uintptr for an appropriate word-sized integer. @@ -27,11 +28,13 @@ runtime·gentraceback(byte *pc0, byte *sp, byte *lr0, G *g, int32 skip, uintptr byte *fp; Stktop *stk; Func *f; + bool waspanic; USED(lr0); pc = (uintptr)pc0; lr = 0; fp = nil; + waspanic = false; // If the PC is goexit, the goroutine hasn't started yet. 
if(pc0 == g->sched.pc && sp == g->sched.sp && pc0 == (byte*)runtime·goexit) { @@ -127,7 +130,7 @@ runtime·gentraceback(byte *pc0, byte *sp, byte *lr0, G *g, int32 skip, uintptr if(pc > f->entry) runtime·printf("+%p", (uintptr)(pc - f->entry)); tracepc = pc; // back up to CALL instruction for funcline. - if(n > 0 && pc > f->entry) + if(n > 0 && pc > f->entry && !waspanic) tracepc--; runtime·printf(" %S:%d\n", f->src, runtime·funcline(f, tracepc)); runtime·printf("\t%S(", f->name); @@ -144,6 +147,8 @@ runtime·gentraceback(byte *pc0, byte *sp, byte *lr0, G *g, int32 skip, uintptr n++; } + waspanic = f->entry == (uintptr)runtime·sigpanic; + if(f->entry == (uintptr)runtime·deferproc || f->entry == (uintptr)runtime·newproc) fp += 2*sizeof(uintptr); diff --git a/src/pkg/runtime/append_test.go b/src/pkg/runtime/append_test.go index 75a635306..b8552224e 100644 --- a/src/pkg/runtime/append_test.go +++ b/src/pkg/runtime/append_test.go @@ -36,7 +36,7 @@ func BenchmarkAppendSpecialCase(b *testing.B) { } } -var x = make([]int, 0, 10) +var x []int func f() int { x[:1][0] = 3 @@ -44,6 +44,7 @@ func f() int { } func TestSideEffectOrder(t *testing.T) { + x = make([]int, 0, 10) x = append(x, 1, f()) if x[0] != 1 || x[1] != 2 { t.Error("append failed: ", x[0], x[1]) diff --git a/src/pkg/runtime/arm/atomic.c b/src/pkg/runtime/arm/atomic.c index 9fd47bae7..52e4059ae 100644 --- a/src/pkg/runtime/arm/atomic.c +++ b/src/pkg/runtime/arm/atomic.c @@ -4,9 +4,80 @@ #include "runtime.h" +// Atomic add and return new value. +#pragma textflag 7 +uint32 +runtime·xadd(uint32 volatile *val, int32 delta) +{ + uint32 oval, nval; + + for(;;){ + oval = *val; + nval = oval + delta; + if(runtime·cas(val, oval, nval)) + return nval; + } +} + +#pragma textflag 7 +uint32 +runtime·xchg(uint32 volatile* addr, uint32 v) +{ + uint32 old; + + for(;;) { + old = *addr; + if(runtime·cas(addr, old, v)) + return old; + } +} + +#pragma textflag 7 +void +runtime·procyield(uint32 cnt) +{ + uint32 volatile i; + + for(i = 0; i < cnt; i++) { + } +} + #pragma textflag 7 uint32 runtime·atomicload(uint32 volatile* addr) { return runtime·xadd(addr, 0); } + +#pragma textflag 7 +void* +runtime·atomicloadp(void* volatile* addr) +{ + return (void*)runtime·xadd((uint32 volatile*)addr, 0); +} + +#pragma textflag 7 +void +runtime·atomicstorep(void* volatile* addr, void* v) +{ + void *old; + + for(;;) { + old = *addr; + if(runtime·casp(addr, old, v)) + return; + } +} + +#pragma textflag 7 +void +runtime·atomicstore(uint32 volatile* addr, uint32 v) +{ + uint32 old; + + for(;;) { + old = *addr; + if(runtime·cas(addr, old, v)) + return; + } +}
\ No newline at end of file diff --git a/src/pkg/runtime/arm/closure.c b/src/pkg/runtime/arm/closure.c index 36a93bc53..119e91b61 100644 --- a/src/pkg/runtime/arm/closure.c +++ b/src/pkg/runtime/arm/closure.c @@ -83,7 +83,7 @@ runtime·closure(int32 siz, byte *fn, byte *arg0) *pc++ = 0xe52de000 | (siz + 4); if(siz > 0) { - runtime·mcpy(q, (byte*)&arg0, siz); + runtime·memmove(q, (byte*)&arg0, siz); // MOVW $vars(PC), R0 *pc = 0xe28f0000 | (int32)(q - (byte*)pc - 8); diff --git a/src/pkg/runtime/arm/traceback.c b/src/pkg/runtime/arm/traceback.c index c3934c37c..5628b8349 100644 --- a/src/pkg/runtime/arm/traceback.c +++ b/src/pkg/runtime/arm/traceback.c @@ -9,6 +9,7 @@ void runtime·deferproc(void); void runtime·newproc(void); void runtime·newstack(void); void runtime·morestack(void); +void runtime·sigpanic(void); void _div(void); void _mod(void); void _divu(void); @@ -20,12 +21,14 @@ runtime·gentraceback(byte *pc0, byte *sp, byte *lr0, G *g, int32 skip, uintptr int32 i, n, iter; uintptr pc, lr, tracepc, x; byte *fp, *p; + bool waspanic; Stktop *stk; Func *f; pc = (uintptr)pc0; lr = (uintptr)lr0; fp = nil; + waspanic = false; // If the PC is goexit, the goroutine hasn't started yet. if(pc == (uintptr)runtime·goexit) { @@ -121,7 +124,7 @@ runtime·gentraceback(byte *pc0, byte *sp, byte *lr0, G *g, int32 skip, uintptr if(pc > f->entry) runtime·printf("+%p", (uintptr)(pc - f->entry)); tracepc = pc; // back up to CALL instruction for funcline. - if(n > 0 && pc > f->entry) + if(n > 0 && pc > f->entry && !waspanic) tracepc -= sizeof(uintptr); runtime·printf(" %S:%d\n", f->src, runtime·funcline(f, tracepc)); runtime·printf("\t%S(", f->name); @@ -137,6 +140,8 @@ runtime·gentraceback(byte *pc0, byte *sp, byte *lr0, G *g, int32 skip, uintptr runtime·prints(")\n"); n++; } + + waspanic = f->entry == (uintptr)runtime·sigpanic; if(pcbuf == nil && f->entry == (uintptr)runtime·newstack && g == m->g0) { runtime·printf("----- newstack called from goroutine %d -----\n", m->curg->goid); diff --git a/src/pkg/runtime/cgo/windows_amd64.c b/src/pkg/runtime/cgo/windows_amd64.c index fd5b397ab..e8313e250 100755 --- a/src/pkg/runtime/cgo/windows_amd64.c +++ b/src/pkg/runtime/cgo/windows_amd64.c @@ -30,6 +30,7 @@ static void* threadentry(void *v) { ThreadStart ts; + void *tls0; ts = *(ThreadStart*)v; free(v); @@ -45,11 +46,13 @@ threadentry(void *v) /* * Set specific keys in thread local storage. */ + tls0 = (void*)LocalAlloc(LPTR, 64); asm volatile ( + "movq %0, %%gs:0x58\n" // MOVL tls0, 0x58(GS) "movq %%gs:0x58, %%rax\n" // MOVQ 0x58(GS), tmp - "movq %0, 0(%%rax)\n" // MOVQ g, 0(GS) - "movq %1, 8(%%rax)\n" // MOVQ m, 8(GS) - :: "r"(ts.g), "r"(ts.m) : "%rax" + "movq %1, 0(%%rax)\n" // MOVQ g, 0(GS) + "movq %2, 8(%%rax)\n" // MOVQ m, 8(GS) + :: "r"(tls0), "r"(ts.g), "r"(ts.m) : "%rax" ); crosscall_amd64(ts.fn); diff --git a/src/pkg/runtime/cgocall.c b/src/pkg/runtime/cgocall.c index 58f287e90..829448b02 100644 --- a/src/pkg/runtime/cgocall.c +++ b/src/pkg/runtime/cgocall.c @@ -83,7 +83,6 @@ // callee-save registers for gcc and returns to GoF, which returns to f. 
void *initcgo; /* filled in by dynamic linker when Cgo is available */ -int64 ncgocall; static void unlockm(void); static void unwindm(void); @@ -101,7 +100,7 @@ runtime·cgocall(void (*fn)(void*), void *arg) if(fn == 0) runtime·throw("cgocall nil"); - ncgocall++; + m->ncgocall++; /* * Lock g to m to ensure we stay on the same stack if we do a @@ -155,7 +154,11 @@ unlockm(void) void runtime·Cgocalls(int64 ret) { - ret = ncgocall; + M *m; + + ret = 0; + for(m=runtime·atomicloadp(&runtime·allm); m; m=m->alllink) + ret += m->ncgocall; FLUSH(&ret); } diff --git a/src/pkg/runtime/chan.c b/src/pkg/runtime/chan.c index f94c3ef40..b77e51b60 100644 --- a/src/pkg/runtime/chan.c +++ b/src/pkg/runtime/chan.c @@ -6,6 +6,7 @@ #include "type.h" #define MAXALIGN 7 +#define NOSELGEN 1 static int32 debug = 0; @@ -18,10 +19,8 @@ struct SudoG { G* g; // g and selgen constitute uint32 selgen; // a weak pointer to g - int16 offset; // offset of case number - int8 isfree; // offset of case number SudoG* link; - byte elem[8]; // synch data element (+ more) + byte* elem; // data element }; struct WaitQ @@ -38,11 +37,10 @@ struct Hchan bool closed; uint8 elemalign; Alg* elemalg; // interface for element type - uint32 sendx; // send index - uint32 recvx; // receive index + uint32 sendx; // send index + uint32 recvx; // receive index WaitQ recvq; // list of recv waiters WaitQ sendq; // list of send waiters - SudoG* free; // freelist Lock; }; @@ -60,34 +58,26 @@ enum struct Scase { + SudoG sg; // must be first member (cast to Scase) Hchan* chan; // chan byte* pc; // return pc uint16 kind; uint16 so; // vararg of selected bool - union { - byte elem[2*sizeof(void*)]; // element (send) - struct { - byte* elemp; // pointer to element (recv) - bool* receivedp; // pointer to received bool (recv2) - } recv; - } u; + bool* receivedp; // pointer to received bool (recv2) }; struct Select { uint16 tcase; // total count of scase[] uint16 ncase; // currently filled scase[] - Select* link; // for freelist - uint16* order; - Scase* scase[1]; // one per case + uint16* pollorder; // case poll order + Hchan** lockorder; // channel lock order + Scase scase[1]; // one per case (in order of appearance) }; -static void dequeueg(WaitQ*, Hchan*); -static SudoG* dequeue(WaitQ*, Hchan*); +static void dequeueg(WaitQ*); +static SudoG* dequeue(WaitQ*); static void enqueue(WaitQ*, SudoG*); -static SudoG* allocsg(Hchan*); -static void freesg(Hchan*, SudoG*); -static uint32 fastrandn(uint32); static void destroychan(Hchan*); Hchan* @@ -97,7 +87,7 @@ runtime·makechan_c(Type *elem, int64 hint) int32 n; byte *by; - if(hint < 0 || (int32)hint != hint || hint > ((uintptr)-1) / elem->size) + if(hint < 0 || (int32)hint != hint || (elem->size > 0 && hint > ((uintptr)-1) / elem->size)) runtime·panicstring("makechan: size out of range"); if(elem->alg >= nelem(runtime·algarray)) { @@ -170,6 +160,7 @@ void runtime·chansend(Hchan *c, byte *ep, bool *pres) { SudoG *sg; + SudoG mysg; G* gp; if(c == nil) @@ -185,21 +176,20 @@ runtime·chansend(Hchan *c, byte *ep, bool *pres) } runtime·lock(c); -loop: if(c->closed) goto closed; if(c->dataqsiz > 0) goto asynch; - sg = dequeue(&c->recvq, c); + sg = dequeue(&c->recvq); if(sg != nil) { - if(ep != nil) - c->elemalg->copy(c->elemsize, sg->elem, ep); - + runtime·unlock(c); + gp = sg->g; gp->param = sg; - runtime·unlock(c); + if(sg->elem != nil) + c->elemalg->copy(c->elemsize, sg->elem, ep); runtime·ready(gp); if(pres != nil) @@ -213,21 +203,22 @@ loop: return; } - sg = allocsg(c); - if(ep != nil) - 
c->elemalg->copy(c->elemsize, sg->elem, ep); + mysg.elem = ep; + mysg.g = g; + mysg.selgen = NOSELGEN; g->param = nil; g->status = Gwaiting; - enqueue(&c->sendq, sg); + enqueue(&c->sendq, &mysg); runtime·unlock(c); runtime·gosched(); - runtime·lock(c); - sg = g->param; - if(sg == nil) - goto loop; - freesg(c, sg); - runtime·unlock(c); + if(g->param == nil) { + runtime·lock(c); + if(!c->closed) + runtime·throw("chansend: spurious wakeup"); + goto closed; + } + return; asynch: @@ -240,25 +231,25 @@ asynch: *pres = false; return; } - sg = allocsg(c); + mysg.g = g; + mysg.elem = nil; + mysg.selgen = NOSELGEN; g->status = Gwaiting; - enqueue(&c->sendq, sg); + enqueue(&c->sendq, &mysg); runtime·unlock(c); runtime·gosched(); runtime·lock(c); goto asynch; } - if(ep != nil) - c->elemalg->copy(c->elemsize, chanbuf(c, c->sendx), ep); + c->elemalg->copy(c->elemsize, chanbuf(c, c->sendx), ep); if(++c->sendx == c->dataqsiz) c->sendx = 0; c->qcount++; - sg = dequeue(&c->recvq, c); + sg = dequeue(&c->recvq); if(sg != nil) { gp = sg->g; - freesg(c, sg); runtime·unlock(c); runtime·ready(gp); } else @@ -277,6 +268,7 @@ void runtime·chanrecv(Hchan* c, byte *ep, bool *selected, bool *received) { SudoG *sg; + SudoG mysg; G *gp; if(c == nil) @@ -289,23 +281,20 @@ runtime·chanrecv(Hchan* c, byte *ep, bool *selected, bool *received) runtime·printf("chanrecv: chan=%p\n", c); runtime·lock(c); - -loop: if(c->dataqsiz > 0) goto asynch; if(c->closed) goto closed; - sg = dequeue(&c->sendq, c); + sg = dequeue(&c->sendq); if(sg != nil) { + runtime·unlock(c); + if(ep != nil) c->elemalg->copy(c->elemsize, ep, sg->elem); - c->elemalg->copy(c->elemsize, sg->elem, nil); - gp = sg->g; gp->param = sg; - runtime·unlock(c); runtime·ready(gp); if(selected != nil) @@ -321,25 +310,24 @@ loop: return; } - sg = allocsg(c); + mysg.elem = ep; + mysg.g = g; + mysg.selgen = NOSELGEN; g->param = nil; g->status = Gwaiting; - enqueue(&c->recvq, sg); + enqueue(&c->recvq, &mysg); runtime·unlock(c); runtime·gosched(); - runtime·lock(c); - sg = g->param; - if(sg == nil) - goto loop; + if(g->param == nil) { + runtime·lock(c); + if(!c->closed) + runtime·throw("chanrecv: spurious wakeup"); + goto closed; + } - if(ep != nil) - c->elemalg->copy(c->elemsize, ep, sg->elem); - c->elemalg->copy(c->elemsize, sg->elem, nil); if(received != nil) *received = true; - freesg(c, sg); - runtime·unlock(c); return; asynch: @@ -354,9 +342,11 @@ asynch: *received = false; return; } - sg = allocsg(c); + mysg.g = g; + mysg.elem = nil; + mysg.selgen = NOSELGEN; g->status = Gwaiting; - enqueue(&c->recvq, sg); + enqueue(&c->recvq, &mysg); runtime·unlock(c); runtime·gosched(); @@ -369,10 +359,10 @@ asynch: if(++c->recvx == c->dataqsiz) c->recvx = 0; c->qcount--; - sg = dequeue(&c->sendq, c); + + sg = dequeue(&c->sendq); if(sg != nil) { gp = sg->g; - freesg(c, sg); runtime·unlock(c); runtime·ready(gp); } else @@ -437,7 +427,7 @@ runtime·chanrecv2(Hchan* c, ...) 
o = runtime·rnd(sizeof(c), Structrnd); ae = (byte*)&c + o; - o = runtime·rnd(o+c->elemsize, 1); + o += c->elemsize; ac = (byte*)&c + o; runtime·chanrecv(c, ae, nil, ac); @@ -619,57 +609,56 @@ newselect(int32 size, Select **selp) if(size > 1) n = size-1; - sel = runtime·mal(sizeof(*sel) + n*sizeof(sel->scase[0]) + size*sizeof(sel->order[0])); + sel = runtime·mal(sizeof(*sel) + + n*sizeof(sel->scase[0]) + + size*sizeof(sel->lockorder[0]) + + size*sizeof(sel->pollorder[0])); sel->tcase = size; sel->ncase = 0; - sel->order = (void*)(sel->scase + size); + sel->pollorder = (void*)(sel->scase + size); + sel->lockorder = (void*)(sel->pollorder + size); *selp = sel; + if(debug) runtime·printf("newselect s=%p size=%d\n", sel, size); } // cut in half to give stack a chance to split -static void selectsend(Select **selp, Hchan *c, void *pc); +static void selectsend(Select *sel, Hchan *c, void *pc, void *elem, int32 so); -// selectsend(sel *byte, hchan *chan any, elem any) (selected bool); +// selectsend(sel *byte, hchan *chan any, elem *any) (selected bool); #pragma textflag 7 void -runtime·selectsend(Select *sel, Hchan *c, ...) +runtime·selectsend(Select *sel, Hchan *c, void *elem, bool selected) { + selected = false; + FLUSH(&selected); + // nil cases do not compete if(c == nil) return; - selectsend(&sel, c, runtime·getcallerpc(&sel)); + selectsend(sel, c, runtime·getcallerpc(&sel), elem, (byte*)&selected - (byte*)&sel); } static void -selectsend(Select **selp, Hchan *c, void *pc) +selectsend(Select *sel, Hchan *c, void *pc, void *elem, int32 so) { - int32 i, eo; + int32 i; Scase *cas; - byte *ae; - Select *sel; - sel = *selp; i = sel->ncase; if(i >= sel->tcase) runtime·throw("selectsend: too many cases"); sel->ncase = i+1; - cas = runtime·mal(sizeof *cas + c->elemsize - sizeof(cas->u.elem)); - sel->scase[i] = cas; + cas = &sel->scase[i]; cas->pc = pc; cas->chan = c; - - eo = runtime·rnd(sizeof(sel), sizeof(c)); - eo = runtime·rnd(eo+sizeof(c), c->elemsize); - cas->so = runtime·rnd(eo+c->elemsize, Structrnd); + cas->so = so; cas->kind = CaseSend; - - ae = (byte*)selp + eo; - c->elemalg->copy(c->elemsize, cas->u.elem, ae); + cas->sg.elem = elem; if(debug) runtime·printf("selectsend s=%p pc=%p chan=%p so=%d\n", @@ -684,6 +673,9 @@ static void selectrecv(Select *sel, Hchan *c, void *pc, void *elem, bool*, int32 void runtime·selectrecv(Select *sel, Hchan *c, void *elem, bool selected) { + selected = false; + FLUSH(&selected); + // nil cases do not compete if(c == nil) return; @@ -696,6 +688,9 @@ runtime·selectrecv(Select *sel, Hchan *c, void *elem, bool selected) void runtime·selectrecv2(Select *sel, Hchan *c, void *elem, bool *received, bool selected) { + selected = false; + FLUSH(&selected); + // nil cases do not compete if(c == nil) return; @@ -713,16 +708,14 @@ selectrecv(Select *sel, Hchan *c, void *pc, void *elem, bool *received, int32 so if(i >= sel->tcase) runtime·throw("selectrecv: too many cases"); sel->ncase = i+1; - cas = runtime·mal(sizeof *cas); - sel->scase[i] = cas; + cas = &sel->scase[i]; cas->pc = pc; cas->chan = c; cas->so = so; cas->kind = CaseRecv; - cas->u.recv.elemp = elem; - cas->u.recv.receivedp = nil; - cas->u.recv.receivedp = received; + cas->sg.elem = elem; + cas->receivedp = received; if(debug) runtime·printf("selectrecv s=%p pc=%p chan=%p so=%d\n", @@ -737,6 +730,9 @@ static void selectdefault(Select*, void*, int32); void runtime·selectdefault(Select *sel, bool selected) { + selected = false; + FLUSH(&selected); + selectdefault(sel, runtime·getcallerpc(&sel), 
(byte*)&selected - (byte*)&sel); } @@ -750,8 +746,7 @@ selectdefault(Select *sel, void *callerpc, int32 so) if(i >= sel->tcase) runtime·throw("selectdefault: too many cases"); sel->ncase = i+1; - cas = runtime·mal(sizeof *cas); - sel->scase[i] = cas; + cas = &sel->scase[i]; cas->pc = callerpc; cas->chan = nil; @@ -764,25 +759,16 @@ selectdefault(Select *sel, void *callerpc, int32 so) } static void -freesel(Select *sel) -{ - uint32 i; - - for(i=0; i<sel->ncase; i++) - runtime·free(sel->scase[i]); - runtime·free(sel); -} - -static void sellock(Select *sel) { uint32 i; - Hchan *c; + Hchan *c, *c0; c = nil; for(i=0; i<sel->ncase; i++) { - if(sel->scase[i]->chan != c) { - c = sel->scase[i]->chan; + c0 = sel->lockorder[i]; + if(c0 && c0 != c) { + c = sel->lockorder[i]; runtime·lock(c); } } @@ -792,12 +778,13 @@ static void selunlock(Select *sel) { uint32 i; - Hchan *c; + Hchan *c, *c0; c = nil; - for(i=sel->ncase; i>0; i--) { - if(sel->scase[i-1]->chan && sel->scase[i-1]->chan != c) { - c = sel->scase[i-1]->chan; + for(i=sel->ncase; i-->0;) { + c0 = sel->lockorder[i]; + if(c0 && c0 != c) { + c = c0; runtime·unlock(c); } } @@ -852,20 +839,20 @@ selectgo(Select **selp) // generate permuted order for(i=0; i<sel->ncase; i++) - sel->order[i] = i; + sel->pollorder[i] = i; for(i=1; i<sel->ncase; i++) { - o = sel->order[i]; - j = fastrandn(i+1); - sel->order[i] = sel->order[j]; - sel->order[j] = o; + o = sel->pollorder[i]; + j = runtime·fastrand1()%(i+1); + sel->pollorder[i] = sel->pollorder[j]; + sel->pollorder[j] = o; } // sort the cases by Hchan address to get the locking order. - for(i=1; i<sel->ncase; i++) { - cas = sel->scase[i]; - for(j=i; j>0 && sel->scase[j-1]->chan >= cas->chan; j--) - sel->scase[j] = sel->scase[j-1]; - sel->scase[j] = cas; + for(i=0; i<sel->ncase; i++) { + c = sel->scase[i].chan; + for(j=i; j>0 && sel->lockorder[j-1] >= c; j--) + sel->lockorder[j] = sel->lockorder[j-1]; + sel->lockorder[j] = c; } sellock(sel); @@ -873,8 +860,8 @@ loop: // pass 1 - look for something already waiting dfl = nil; for(i=0; i<sel->ncase; i++) { - o = sel->order[i]; - cas = sel->scase[o]; + o = sel->pollorder[i]; + cas = &sel->scase[o]; c = cas->chan; switch(cas->kind) { @@ -883,7 +870,7 @@ loop: if(c->qcount > 0) goto asyncrecv; } else { - sg = dequeue(&c->sendq, c); + sg = dequeue(&c->sendq); if(sg != nil) goto syncrecv; } @@ -898,7 +885,7 @@ loop: if(c->qcount < c->dataqsiz) goto asyncsend; } else { - sg = dequeue(&c->recvq, c); + sg = dequeue(&c->recvq); if(sg != nil) goto syncsend; } @@ -911,6 +898,7 @@ loop: } if(dfl != nil) { + selunlock(sel); cas = dfl; goto retc; } @@ -918,11 +906,11 @@ loop: // pass 2 - enqueue on all chans for(i=0; i<sel->ncase; i++) { - o = sel->order[i]; - cas = sel->scase[o]; + cas = &sel->scase[i]; c = cas->chan; - sg = allocsg(c); - sg->offset = o; + sg = &cas->sg; + sg->g = g; + sg->selgen = g->selgen; switch(cas->kind) { case CaseRecv: @@ -930,8 +918,6 @@ loop: break; case CaseSend: - if(c->dataqsiz == 0) - c->elemalg->copy(c->elemsize, sg->elem, cas->u.elem); enqueue(&c->sendq, sg); break; } @@ -948,85 +934,82 @@ loop: // pass 3 - dequeue from unsuccessful chans // otherwise they stack up on quiet channels for(i=0; i<sel->ncase; i++) { - if(sg == nil || i != sg->offset) { - cas = sel->scase[i]; + cas = &sel->scase[i]; + if(cas != (Scase*)sg) { c = cas->chan; if(cas->kind == CaseSend) - dequeueg(&c->sendq, c); + dequeueg(&c->sendq); else - dequeueg(&c->recvq, c); + dequeueg(&c->recvq); } } if(sg == nil) goto loop; - o = sg->offset; - cas = sel->scase[o]; + cas = 
(Scase*)sg; c = cas->chan; - if(c->dataqsiz > 0) { -// prints("shouldnt happen\n"); - goto loop; - } + if(c->dataqsiz > 0) + runtime·throw("selectgo: shouldnt happen"); if(debug) - runtime·printf("wait-return: sel=%p c=%p cas=%p kind=%d o=%d\n", - sel, c, cas, cas->kind, o); + runtime·printf("wait-return: sel=%p c=%p cas=%p kind=%d\n", + sel, c, cas, cas->kind); if(cas->kind == CaseRecv) { - if(cas->u.recv.receivedp != nil) - *cas->u.recv.receivedp = true; - if(cas->u.recv.elemp != nil) - c->elemalg->copy(c->elemsize, cas->u.recv.elemp, sg->elem); - c->elemalg->copy(c->elemsize, sg->elem, nil); + if(cas->receivedp != nil) + *cas->receivedp = true; } - freesg(c, sg); + selunlock(sel); goto retc; asyncrecv: // can receive from buffer - if(cas->u.recv.receivedp != nil) - *cas->u.recv.receivedp = true; - if(cas->u.recv.elemp != nil) - c->elemalg->copy(c->elemsize, cas->u.recv.elemp, chanbuf(c, c->recvx)); + if(cas->receivedp != nil) + *cas->receivedp = true; + if(cas->sg.elem != nil) + c->elemalg->copy(c->elemsize, cas->sg.elem, chanbuf(c, c->recvx)); c->elemalg->copy(c->elemsize, chanbuf(c, c->recvx), nil); if(++c->recvx == c->dataqsiz) c->recvx = 0; c->qcount--; - sg = dequeue(&c->sendq, c); + sg = dequeue(&c->sendq); if(sg != nil) { gp = sg->g; - freesg(c, sg); + selunlock(sel); runtime·ready(gp); + } else { + selunlock(sel); } goto retc; asyncsend: // can send to buffer - if(cas->u.elem != nil) - c->elemalg->copy(c->elemsize, chanbuf(c, c->sendx), cas->u.elem); + c->elemalg->copy(c->elemsize, chanbuf(c, c->sendx), cas->sg.elem); if(++c->sendx == c->dataqsiz) c->sendx = 0; c->qcount++; - sg = dequeue(&c->recvq, c); + sg = dequeue(&c->recvq); if(sg != nil) { gp = sg->g; - freesg(c, sg); + selunlock(sel); runtime·ready(gp); + } else { + selunlock(sel); } goto retc; syncrecv: // can receive from sleeping sender (sg) + selunlock(sel); if(debug) runtime·printf("syncrecv: sel=%p c=%p o=%d\n", sel, c, o); - if(cas->u.recv.receivedp != nil) - *cas->u.recv.receivedp = true; - if(cas->u.recv.elemp != nil) - c->elemalg->copy(c->elemsize, cas->u.recv.elemp, sg->elem); - c->elemalg->copy(c->elemsize, sg->elem, nil); + if(cas->receivedp != nil) + *cas->receivedp = true; + if(cas->sg.elem != nil) + c->elemalg->copy(c->elemsize, cas->sg.elem, sg->elem); gp = sg->g; gp->param = sg; runtime·ready(gp); @@ -1034,30 +1017,28 @@ syncrecv: rclose: // read at end of closed channel - if(cas->u.recv.receivedp != nil) - *cas->u.recv.receivedp = false; - if(cas->u.recv.elemp != nil) - c->elemalg->copy(c->elemsize, cas->u.recv.elemp, nil); + selunlock(sel); + if(cas->receivedp != nil) + *cas->receivedp = false; + if(cas->sg.elem != nil) + c->elemalg->copy(c->elemsize, cas->sg.elem, nil); goto retc; syncsend: // can send to sleeping receiver (sg) + selunlock(sel); if(debug) runtime·printf("syncsend: sel=%p c=%p o=%d\n", sel, c, o); - if(c->closed) - goto sclose; - c->elemalg->copy(c->elemsize, sg->elem, cas->u.elem); + c->elemalg->copy(c->elemsize, sg->elem, cas->sg.elem); gp = sg->g; gp->param = sg; runtime·ready(gp); retc: - selunlock(sel); - // return to pc corresponding to chosen case pc = cas->pc; as = (byte*)selp + cas->so; - freesel(sel); + runtime·free(sel); *as = true; return pc; @@ -1088,23 +1069,21 @@ runtime·closechan(Hchan *c) // release all readers for(;;) { - sg = dequeue(&c->recvq, c); + sg = dequeue(&c->recvq); if(sg == nil) break; gp = sg->g; gp->param = nil; - freesg(c, sg); runtime·ready(gp); } // release all writers for(;;) { - sg = dequeue(&c->sendq, c); + sg = dequeue(&c->sendq); if(sg == nil) 
break; gp = sg->g; gp->param = nil; - freesg(c, sg); runtime·ready(gp); } @@ -1144,7 +1123,7 @@ reflect·chancap(Hchan *c, int32 cap) } static SudoG* -dequeue(WaitQ *q, Hchan *c) +dequeue(WaitQ *q) { SudoG *sgp; @@ -1155,9 +1134,10 @@ loop: q->first = sgp->link; // if sgp is stale, ignore it - if(!runtime·cas(&sgp->g->selgen, sgp->selgen, sgp->selgen + 1)) { + if(sgp->selgen != NOSELGEN && + (sgp->selgen != sgp->g->selgen || + !runtime·cas(&sgp->g->selgen, sgp->selgen, sgp->selgen + 2))) { //prints("INVALID PSEUDOG POINTER\n"); - freesg(c, sgp); goto loop; } @@ -1165,14 +1145,16 @@ loop: } static void -dequeueg(WaitQ *q, Hchan *c) +dequeueg(WaitQ *q) { - SudoG **l, *sgp; - - for(l=&q->first; (sgp=*l) != nil; l=&sgp->link) { + SudoG **l, *sgp, *prevsgp; + + prevsgp = nil; + for(l=&q->first; (sgp=*l) != nil; l=&sgp->link, prevsgp=sgp) { if(sgp->g == g) { *l = sgp->link; - freesg(c, sgp); + if(q->last == sgp) + q->last = prevsgp; break; } } @@ -1190,62 +1172,3 @@ enqueue(WaitQ *q, SudoG *sgp) q->last->link = sgp; q->last = sgp; } - -static SudoG* -allocsg(Hchan *c) -{ - SudoG* sg; - - sg = c->free; - if(sg != nil) { - c->free = sg->link; - } else - sg = runtime·mal(sizeof(*sg) + c->elemsize - sizeof(sg->elem)); - sg->selgen = g->selgen; - sg->g = g; - sg->offset = 0; - sg->isfree = 0; - - return sg; -} - -static void -freesg(Hchan *c, SudoG *sg) -{ - if(sg != nil) { - if(sg->isfree) - runtime·throw("chan.freesg: already free"); - sg->isfree = 1; - sg->link = c->free; - c->free = sg; - } -} - -static uint32 -fastrand1(void) -{ - static uint32 x = 0x49f6428aUL; - - x += x; - if(x & 0x80000000L) - x ^= 0x88888eefUL; - return x; -} - -static uint32 -fastrandn(uint32 n) -{ - uint32 max, r; - - if(n <= 1) - return 0; - - r = fastrand1(); - if(r < (1ULL<<31)-n) // avoid computing max in common case - return r%n; - - max = (1ULL<<31)/n * n; - while(r >= max) - r = fastrand1(); - return r%n; -} diff --git a/src/pkg/runtime/chan_test.go b/src/pkg/runtime/chan_test.go new file mode 100644 index 000000000..c5ffe93ac --- /dev/null +++ b/src/pkg/runtime/chan_test.go @@ -0,0 +1,267 @@ +// Copyright 2009 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. 
+ +package runtime_test + +import ( + "runtime" + "sync/atomic" + "testing" +) + +func TestChanSendInterface(t *testing.T) { + type mt struct{} + m := &mt{} + c := make(chan interface{}, 1) + c <- m + select { + case c <- m: + default: + } + select { + case c <- m: + case c <- &mt{}: + default: + } +} + +func BenchmarkSelectUncontended(b *testing.B) { + const CallsPerSched = 1000 + procs := runtime.GOMAXPROCS(-1) + N := int32(b.N / CallsPerSched) + c := make(chan bool, procs) + for p := 0; p < procs; p++ { + go func() { + myc1 := make(chan int, 1) + myc2 := make(chan int, 1) + myc1 <- 0 + for atomic.AddInt32(&N, -1) >= 0 { + for g := 0; g < CallsPerSched; g++ { + select { + case <-myc1: + myc2 <- 0 + case <-myc2: + myc1 <- 0 + } + } + } + c <- true + }() + } + for p := 0; p < procs; p++ { + <-c + } +} + +func BenchmarkSelectContended(b *testing.B) { + const CallsPerSched = 1000 + procs := runtime.GOMAXPROCS(-1) + N := int32(b.N / CallsPerSched) + c := make(chan bool, procs) + myc1 := make(chan int, procs) + myc2 := make(chan int, procs) + for p := 0; p < procs; p++ { + myc1 <- 0 + go func() { + for atomic.AddInt32(&N, -1) >= 0 { + for g := 0; g < CallsPerSched; g++ { + select { + case <-myc1: + myc2 <- 0 + case <-myc2: + myc1 <- 0 + } + } + } + c <- true + }() + } + for p := 0; p < procs; p++ { + <-c + } +} + +func BenchmarkSelectNonblock(b *testing.B) { + const CallsPerSched = 1000 + procs := runtime.GOMAXPROCS(-1) + N := int32(b.N / CallsPerSched) + c := make(chan bool, procs) + for p := 0; p < procs; p++ { + go func() { + myc1 := make(chan int) + myc2 := make(chan int) + myc3 := make(chan int, 1) + myc4 := make(chan int, 1) + for atomic.AddInt32(&N, -1) >= 0 { + for g := 0; g < CallsPerSched; g++ { + select { + case <-myc1: + default: + } + select { + case myc2 <- 0: + default: + } + select { + case <-myc3: + default: + } + select { + case myc4 <- 0: + default: + } + } + } + c <- true + }() + } + for p := 0; p < procs; p++ { + <-c + } +} + +func BenchmarkChanUncontended(b *testing.B) { + const CallsPerSched = 1000 + procs := runtime.GOMAXPROCS(-1) + N := int32(b.N / CallsPerSched) + c := make(chan bool, procs) + for p := 0; p < procs; p++ { + go func() { + myc := make(chan int, CallsPerSched) + for atomic.AddInt32(&N, -1) >= 0 { + for g := 0; g < CallsPerSched; g++ { + myc <- 0 + } + for g := 0; g < CallsPerSched; g++ { + <-myc + } + } + c <- true + }() + } + for p := 0; p < procs; p++ { + <-c + } +} + +func BenchmarkChanContended(b *testing.B) { + const CallsPerSched = 1000 + procs := runtime.GOMAXPROCS(-1) + N := int32(b.N / CallsPerSched) + c := make(chan bool, procs) + myc := make(chan int, procs*CallsPerSched) + for p := 0; p < procs; p++ { + go func() { + for atomic.AddInt32(&N, -1) >= 0 { + for g := 0; g < CallsPerSched; g++ { + myc <- 0 + } + for g := 0; g < CallsPerSched; g++ { + <-myc + } + } + c <- true + }() + } + for p := 0; p < procs; p++ { + <-c + } +} + +func BenchmarkChanSync(b *testing.B) { + const CallsPerSched = 1000 + procs := 2 + N := int32(b.N / CallsPerSched / procs * procs) + c := make(chan bool, procs) + myc := make(chan int) + for p := 0; p < procs; p++ { + go func() { + for { + i := atomic.AddInt32(&N, -1) + if i < 0 { + break + } + for g := 0; g < CallsPerSched; g++ { + if i%2 == 0 { + <-myc + myc <- 0 + } else { + myc <- 0 + <-myc + } + } + } + c <- true + }() + } + for p := 0; p < procs; p++ { + <-c + } +} + +func benchmarkChanProdCons(b *testing.B, chanSize, localWork int) { + const CallsPerSched = 1000 + procs := runtime.GOMAXPROCS(-1) + N := int32(b.N / 
CallsPerSched) + c := make(chan bool, 2*procs) + myc := make(chan int, chanSize) + for p := 0; p < procs; p++ { + go func() { + foo := 0 + for atomic.AddInt32(&N, -1) >= 0 { + for g := 0; g < CallsPerSched; g++ { + for i := 0; i < localWork; i++ { + foo *= 2 + foo /= 2 + } + myc <- 1 + } + } + myc <- 0 + c <- foo == 42 + }() + go func() { + foo := 0 + for { + v := <-myc + if v == 0 { + break + } + for i := 0; i < localWork; i++ { + foo *= 2 + foo /= 2 + } + } + c <- foo == 42 + }() + } + for p := 0; p < procs; p++ { + <-c + <-c + } +} + +func BenchmarkChanProdCons0(b *testing.B) { + benchmarkChanProdCons(b, 0, 0) +} + +func BenchmarkChanProdCons10(b *testing.B) { + benchmarkChanProdCons(b, 10, 0) +} + +func BenchmarkChanProdCons100(b *testing.B) { + benchmarkChanProdCons(b, 100, 0) +} + +func BenchmarkChanProdConsWork0(b *testing.B) { + benchmarkChanProdCons(b, 0, 100) +} + +func BenchmarkChanProdConsWork10(b *testing.B) { + benchmarkChanProdCons(b, 10, 100) +} + +func BenchmarkChanProdConsWork100(b *testing.B) { + benchmarkChanProdCons(b, 100, 100) +} diff --git a/src/pkg/runtime/cpuprof.c b/src/pkg/runtime/cpuprof.c index 6233bcb45..74b795b7e 100644 --- a/src/pkg/runtime/cpuprof.c +++ b/src/pkg/runtime/cpuprof.c @@ -121,6 +121,10 @@ runtime·SetCPUProfileRate(int32 hz) { uintptr *p; uintptr n; + + // Call findfunc now so that it won't have to + // build tables during the signal handler. + runtime·findfunc(0); // Clamp hz to something reasonable. if(hz < 0) diff --git a/src/pkg/runtime/debug/stack_test.go b/src/pkg/runtime/debug/stack_test.go index 4aeea13ff..94293bb93 100644 --- a/src/pkg/runtime/debug/stack_test.go +++ b/src/pkg/runtime/debug/stack_test.go @@ -23,7 +23,7 @@ func (t T) method() []byte { Don't worry much about the base levels, but check the ones in our own package. /Users/r/go/src/pkg/runtime/debug/stack_test.go:15 (0x13878) - *T.ptrmethod: return Stack() + (*T).ptrmethod: return Stack() /Users/r/go/src/pkg/runtime/debug/stack_test.go:18 (0x138dd) T.method: return t.ptrmethod() /Users/r/go/src/pkg/runtime/debug/stack_test.go:23 (0x13920) @@ -40,7 +40,7 @@ func TestStack(t *testing.T) { t.Fatal("too few lines") } check(t, lines[0], "src/pkg/runtime/debug/stack_test.go") - check(t, lines[1], "\t*T.ptrmethod: return Stack()") + check(t, lines[1], "\t(*T).ptrmethod: return Stack()") check(t, lines[2], "src/pkg/runtime/debug/stack_test.go") check(t, lines[3], "\tT.method: return t.ptrmethod()") check(t, lines[4], "src/pkg/runtime/debug/stack_test.go") diff --git a/src/pkg/runtime/export_test.go b/src/pkg/runtime/export_test.go index 58631c7b4..53c5fcba4 100644 --- a/src/pkg/runtime/export_test.go +++ b/src/pkg/runtime/export_test.go @@ -15,3 +15,9 @@ var F32to64 = f32to64 var Fcmp64 = fcmp64 var Fintto64 = fintto64 var F64toint = f64toint + +func entersyscall() +func exitsyscall() + +var Entersyscall = entersyscall +var Exitsyscall = exitsyscall diff --git a/src/pkg/runtime/freebsd/386/signal.c b/src/pkg/runtime/freebsd/386/signal.c index 3600f0762..2fe7ecd70 100644 --- a/src/pkg/runtime/freebsd/386/signal.c +++ b/src/pkg/runtime/freebsd/386/signal.c @@ -111,6 +111,8 @@ runtime·sighandler(int32 sig, Siginfo *info, void *context, G *gp) runtime·exit(2); } +// Called from kernel on signal stack, so no stack split. 
+#pragma textflag 7 void runtime·sigignore(void) { diff --git a/src/pkg/runtime/freebsd/amd64/signal.c b/src/pkg/runtime/freebsd/amd64/signal.c index 85cb1d855..8015e366e 100644 --- a/src/pkg/runtime/freebsd/amd64/signal.c +++ b/src/pkg/runtime/freebsd/amd64/signal.c @@ -119,6 +119,8 @@ runtime·sighandler(int32 sig, Siginfo *info, void *context, G *gp) runtime·exit(2); } +// Called from kernel on signal stack, so no stack split. +#pragma textflag 7 void runtime·sigignore(void) { diff --git a/src/pkg/runtime/goc2c.c b/src/pkg/runtime/goc2c.c index 826ceff3a..61236e226 100644 --- a/src/pkg/runtime/goc2c.c +++ b/src/pkg/runtime/goc2c.c @@ -2,26 +2,27 @@ // Use of this source code is governed by a BSD-style // license that can be found in the LICENSE file. -/* Translate a .goc file into a .c file. A .goc file is a combination - of a limited form of Go with C. */ +/* + * Translate a .goc file into a .c file. A .goc file is a combination + * of a limited form of Go with C. + */ /* - package PACKAGENAME - {# line} - func NAME([NAME TYPE { , NAME TYPE }]) [(NAME TYPE { , NAME TYPE })] \{ - C code with proper brace nesting - \} + package PACKAGENAME + {# line} + func NAME([NAME TYPE { , NAME TYPE }]) [(NAME TYPE { , NAME TYPE })] \{ + C code with proper brace nesting + \} */ -/* We generate C code which implements the function such that it can - be called from Go and executes the C code. */ +/* + * We generate C code which implements the function such that it can + * be called from Go and executes the C code. + */ -#include <assert.h> -#include <ctype.h> +#include <u.h> #include <stdio.h> -#include <stdlib.h> -#include <string.h> -#include <errno.h> +#include <libc.h> /* Whether we're emitting for gcc */ static int gcc; @@ -88,16 +89,14 @@ int structround = 4; static void bad_eof(void) { - fprintf(stderr, "%s:%u: unexpected EOF\n", file, lineno); - exit(1); + sysfatal("%s:%ud: unexpected EOF\n", file, lineno); } /* Out of memory. */ static void bad_mem(void) { - fprintf(stderr, "%s:%u: out of memory\n", file, lineno); - exit(1); + sysfatal("%s:%ud: out of memory\n", file, lineno); } /* Allocate memory without fail. */ @@ -196,8 +195,10 @@ getchar_skipping_comments(void) } } -/* Read and return a token. Tokens are delimited by whitespace or by - [(),{}]. The latter are all returned as single characters. */ +/* + * Read and return a token. Tokens are delimited by whitespace or by + * [(),{}]. The latter are all returned as single characters. + */ static char * read_token(void) { @@ -259,11 +260,11 @@ read_package(void) char *token; token = read_token_no_eof(); + if (token == nil) + sysfatal("%s:%ud: no token\n", file, lineno); if (strcmp(token, "package") != 0) { - fprintf(stderr, - "%s:%u: expected \"package\", got \"%s\"\n", + sysfatal("%s:%ud: expected \"package\", got \"%s\"\n", file, lineno, token); - exit(1); } return read_token_no_eof(); } @@ -290,8 +291,10 @@ read_preprocessor_lines(void) } } -/* Read a type in Go syntax and return a type in C syntax. We only - permit basic types and pointers. */ +/* + * Read a type in Go syntax and return a type in C syntax. We only + * permit basic types and pointers. + */ static char * read_type(void) { @@ -333,13 +336,14 @@ type_size(char *p) for(i=0; type_table[i].name; i++) if(strcmp(type_table[i].name, p) == 0) return type_table[i].size; - fprintf(stderr, "%s:%u: unknown type %s\n", file, lineno, p); - exit(1); + sysfatal("%s:%ud: unknown type %s\n", file, lineno, p); return 0; } -/* Read a list of parameters. Each parameter is a name and a type. 
- The list ends with a ')'. We have already read the '('. */ +/* + * Read a list of parameters. Each parameter is a name and a type. + * The list ends with a ')'. We have already read the '('. + */ static struct params * read_params(int *poffset) { @@ -375,17 +379,18 @@ read_params(int *poffset) } } if (strcmp(token, ")") != 0) { - fprintf(stderr, "%s:%u: expected '('\n", + sysfatal("%s:%ud: expected '('\n", file, lineno); - exit(1); } if (poffset != NULL) *poffset = offset; return ret; } -/* Read a function header. This reads up to and including the initial - '{' character. Returns 1 if it read a header, 0 at EOF. */ +/* + * Read a function header. This reads up to and including the initial + * '{' character. Returns 1 if it read a header, 0 at EOF. + */ static int read_func_header(char **name, struct params **params, int *paramwid, struct params **rets) { @@ -416,9 +421,8 @@ read_func_header(char **name, struct params **params, int *paramwid, struct para token = read_token(); if (token == NULL || strcmp(token, "(") != 0) { - fprintf(stderr, "%s:%u: expected \"(\"\n", + sysfatal("%s:%ud: expected \"(\"\n", file, lineno); - exit(1); } *params = read_params(paramwid); @@ -430,9 +434,8 @@ read_func_header(char **name, struct params **params, int *paramwid, struct para token = read_token(); } if (token == NULL || strcmp(token, "{") != 0) { - fprintf(stderr, "%s:%u: expected \"{\"\n", + sysfatal("%s:%ud: expected \"{\"\n", file, lineno); - exit(1); } return 1; } @@ -581,8 +584,10 @@ write_func_trailer(char *package, char *name, write_6g_func_trailer(rets); } -/* Read and write the body of the function, ending in an unnested } - (which is read but not written). */ +/* + * Read and write the body of the function, ending in an unnested } + * (which is read but not written). 
+ */ static void copy_body(void) { @@ -669,15 +674,15 @@ process_file(void) static void usage(void) { - fprintf(stderr, "Usage: goc2c [--6g | --gc] [file]\n"); - exit(1); + sysfatal("Usage: goc2c [--6g | --gc] [file]\n"); } -int +void main(int argc, char **argv) { char *goarch; + argv0 = argv[0]; while(argc > 1 && argv[1][0] == '-') { if(strcmp(argv[1], "-") == 0) break; @@ -694,7 +699,7 @@ main(int argc, char **argv) if(argc <= 1 || strcmp(argv[1], "-") == 0) { file = "<stdin>"; process_file(); - return 0; + exits(0); } if(argc > 2) @@ -702,8 +707,7 @@ main(int argc, char **argv) file = argv[1]; if(freopen(file, "r", stdin) == 0) { - fprintf(stderr, "open %s: %s\n", file, strerror(errno)); - exit(1); + sysfatal("open %s: %r\n", file); } if(!gcc) { @@ -719,5 +723,5 @@ main(int argc, char **argv) } process_file(); - return 0; + exits(0); } diff --git a/src/pkg/runtime/hashmap.c b/src/pkg/runtime/hashmap.c index 5ba1eb20a..179a56375 100644 --- a/src/pkg/runtime/hashmap.c +++ b/src/pkg/runtime/hashmap.c @@ -753,12 +753,12 @@ runtime·makemap_c(Type *key, Type *val, int64 hint) // func(key) (val[, pres]) h->ko1 = runtime·rnd(sizeof(h), key->align); h->vo1 = runtime·rnd(h->ko1+keysize, Structrnd); - h->po1 = runtime·rnd(h->vo1+valsize, 1); + h->po1 = h->vo1 + valsize; // func(key, val[, pres]) h->ko2 = runtime·rnd(sizeof(h), key->align); h->vo2 = runtime·rnd(h->ko2+keysize, val->align); - h->po2 = runtime·rnd(h->vo2+valsize, 1); + h->po2 = h->vo2 + valsize; if(debug) { runtime·printf("makemap: map=%p; keysize=%d; valsize=%d; keyalg=%d; valalg=%d; offsets=%d,%d; %d,%d,%d; %d,%d,%d\n", diff --git a/src/pkg/runtime/hashmap.h b/src/pkg/runtime/hashmap.h index d0fd3527f..19ff41697 100644 --- a/src/pkg/runtime/hashmap.h +++ b/src/pkg/runtime/hashmap.h @@ -65,7 +65,7 @@ #define malloc runtime·mal #define memset(a,b,c) runtime·memclr((byte*)(a), (uint32)(c)) -#define memcpy(a,b,c) runtime·mcpy((byte*)(a),(byte*)(b),(uint32)(c)) +#define memcpy(a,b,c) runtime·memmove((byte*)(a),(byte*)(b),(uint32)(c)) #define assert(a) if(!(a)) runtime·throw("assert") #define free(x) runtime·free(x) #define memmove(a,b,c) runtime·memmove(a, b, c) diff --git a/src/pkg/runtime/iface.c b/src/pkg/runtime/iface.c index b1015f695..000f834cf 100644 --- a/src/pkg/runtime/iface.c +++ b/src/pkg/runtime/iface.c @@ -81,7 +81,7 @@ itab(InterfaceType *inter, Type *type, int32 canfail) for(locked=0; locked<2; locked++) { if(locked) runtime·lock(&ifacelock); - for(m=hash[h]; m!=nil; m=m->link) { + for(m=runtime·atomicloadp(&hash[h]); m!=nil; m=m->link) { if(m->inter == inter && m->type == type) { if(m->bad) { m = nil; @@ -145,10 +145,11 @@ search: } out: + if(!locked) + runtime·panicstring("invalid itab locking"); m->link = hash[h]; - hash[h] = m; - if(locked) - runtime·unlock(&ifacelock); + runtime·atomicstorep(&hash[h], m); + runtime·unlock(&ifacelock); if(m->bad) return nil; return m; @@ -264,7 +265,7 @@ runtime·assertI2T2(Type *t, Iface i, ...) ret = (byte*)(&i+1); wid = t->size; - ok = (bool*)(ret+runtime·rnd(wid, 1)); + ok = (bool*)(ret + wid); if(i.tab == nil || i.tab->type != t) { *ok = false; @@ -326,7 +327,7 @@ runtime·assertE2T2(Type *t, Eface e, ...) 
runtime·throw("invalid interface value"); ret = (byte*)(&e+1); wid = t->size; - ok = (bool*)(ret+runtime·rnd(wid, 1)); + ok = (bool*)(ret + wid); if(t != e.type) { *ok = false; diff --git a/src/pkg/runtime/linux/386/defs.h b/src/pkg/runtime/linux/386/defs.h index 6ae1c4e13..73fe23ef9 100644 --- a/src/pkg/runtime/linux/386/defs.h +++ b/src/pkg/runtime/linux/386/defs.h @@ -61,6 +61,8 @@ enum { ITIMER_REAL = 0, ITIMER_VIRTUAL = 0x1, ITIMER_PROF = 0x2, + O_RDONLY = 0, + O_CLOEXEC = 02000000, }; // Types diff --git a/src/pkg/runtime/linux/386/sys.s b/src/pkg/runtime/linux/386/sys.s index e8b423324..0b4a34986 100644 --- a/src/pkg/runtime/linux/386/sys.s +++ b/src/pkg/runtime/linux/386/sys.s @@ -22,9 +22,31 @@ TEXT runtime·exit1(SB),7,$0 INT $3 // not reached RET +TEXT runtime·open(SB),7,$0 + MOVL $5, AX // syscall - open + MOVL 4(SP), BX + MOVL 8(SP), CX + MOVL 12(SP), DX + INT $0x80 + RET + +TEXT runtime·close(SB),7,$0 + MOVL $6, AX // syscall - close + MOVL 4(SP), BX + INT $0x80 + RET + TEXT runtime·write(SB),7,$0 MOVL $4, AX // syscall - write - MOVL 4(SP), BX + MOVL 4(SP), BX + MOVL 8(SP), CX + MOVL 12(SP), DX + INT $0x80 + RET + +TEXT runtime·read(SB),7,$0 + MOVL $3, AX // syscall - read + MOVL 4(SP), BX MOVL 8(SP), CX MOVL 12(SP), DX INT $0x80 @@ -315,3 +337,8 @@ TEXT runtime·setldt(SB),7,$32 MOVW AX, GS RET + +TEXT runtime·osyield(SB),7,$0 + MOVL $158, AX + INT $0x80 + RET diff --git a/src/pkg/runtime/linux/amd64/defs.h b/src/pkg/runtime/linux/amd64/defs.h index 70d63145c..8053dd16f 100644 --- a/src/pkg/runtime/linux/amd64/defs.h +++ b/src/pkg/runtime/linux/amd64/defs.h @@ -61,6 +61,8 @@ enum { ITIMER_REAL = 0, ITIMER_VIRTUAL = 0x1, ITIMER_PROF = 0x2, + O_RDONLY = 0, + O_CLOEXEC = 02000000, }; // Types diff --git a/src/pkg/runtime/linux/amd64/sys.s b/src/pkg/runtime/linux/amd64/sys.s index 66fdab208..8b4dcd921 100644 --- a/src/pkg/runtime/linux/amd64/sys.s +++ b/src/pkg/runtime/linux/amd64/sys.s @@ -28,6 +28,12 @@ TEXT runtime·open(SB),7,$0-16 SYSCALL RET +TEXT runtime·close(SB),7,$0-16 + MOVL 8(SP), DI + MOVL $3, AX // syscall entry + SYSCALL + RET + TEXT runtime·write(SB),7,$0-24 MOVL 8(SP), DI MOVQ 16(SP), SI @@ -36,6 +42,14 @@ TEXT runtime·write(SB),7,$0-24 SYSCALL RET +TEXT runtime·read(SB),7,$0-24 + MOVL 8(SP), DI + MOVQ 16(SP), SI + MOVL 24(SP), DX + MOVL $0, AX // syscall entry + SYSCALL + RET + TEXT runtime·raisesigpipe(SB),7,$12 MOVL $186, AX // syscall - gettid SYSCALL @@ -232,3 +246,7 @@ TEXT runtime·settls(SB),7,$32 CALL runtime·notok(SB) RET +TEXT runtime·osyield(SB),7,$0 + MOVL $24, AX + SYSCALL + RET diff --git a/src/pkg/runtime/linux/arm/defs.h b/src/pkg/runtime/linux/arm/defs.h index 6b2f22c66..09b558ed0 100644 --- a/src/pkg/runtime/linux/arm/defs.h +++ b/src/pkg/runtime/linux/arm/defs.h @@ -61,6 +61,8 @@ enum { ITIMER_REAL = 0, ITIMER_PROF = 0x2, ITIMER_VIRTUAL = 0x1, + O_RDONLY = 0, + O_CLOEXEC = 02000000, }; // Types diff --git a/src/pkg/runtime/linux/arm/sys.s b/src/pkg/runtime/linux/arm/sys.s index ab5349822..8619f0945 100644 --- a/src/pkg/runtime/linux/arm/sys.s +++ b/src/pkg/runtime/linux/arm/sys.s @@ -15,7 +15,10 @@ #define SYS_BASE 0x0 #define SYS_exit (SYS_BASE + 1) +#define SYS_read (SYS_BASE + 3) #define SYS_write (SYS_BASE + 4) +#define SYS_open (SYS_BASE + 5) +#define SYS_close (SYS_BASE + 6) #define SYS_gettimeofday (SYS_BASE + 78) #define SYS_clone (SYS_BASE + 120) #define SYS_rt_sigreturn (SYS_BASE + 173) @@ -29,10 +32,25 @@ #define SYS_mincore (SYS_BASE + 219) #define SYS_gettid (SYS_BASE + 224) #define SYS_tkill (SYS_BASE + 238) +#define 
SYS_sched_yield (SYS_BASE + 158) #define ARM_BASE (SYS_BASE + 0x0f0000) #define SYS_ARM_cacheflush (ARM_BASE + 2) +TEXT runtime·open(SB),7,$0 + MOVW 0(FP), R0 + MOVW 4(FP), R1 + MOVW 8(FP), R2 + MOVW $SYS_open, R7 + SWI $0 + RET + +TEXT runtime·close(SB),7,$0 + MOVW 0(FP), R0 + MOVW $SYS_close, R7 + SWI $0 + RET + TEXT runtime·write(SB),7,$0 MOVW 0(FP), R0 MOVW 4(FP), R1 @@ -41,6 +59,14 @@ TEXT runtime·write(SB),7,$0 SWI $0 RET +TEXT runtime·read(SB),7,$0 + MOVW 0(FP), R0 + MOVW 4(FP), R1 + MOVW 8(FP), R2 + MOVW $SYS_read, R7 + SWI $0 + RET + TEXT runtime·exit(SB),7,$-4 MOVW 0(FP), R0 MOVW $SYS_exit_group, R7 @@ -287,3 +313,7 @@ cascheck: TEXT runtime·casp(SB),7,$0 B runtime·cas(SB) +TEXT runtime·osyield(SB),7,$0 + MOVW $SYS_sched_yield, R7 + SWI $0 + RET diff --git a/src/pkg/runtime/linux/thread.c b/src/pkg/runtime/linux/thread.c index 7c7ca7b4e..8efba2b98 100644 --- a/src/pkg/runtime/linux/thread.c +++ b/src/pkg/runtime/linux/thread.c @@ -8,6 +8,11 @@ #include "stack.h" extern SigTab runtime·sigtab[]; +static int32 proccount; + +int32 runtime·open(uint8*, int32, int32); +int32 runtime·close(int32); +int32 runtime·read(int32, void*, int32); // Linux futex. // @@ -15,11 +20,19 @@ extern SigTab runtime·sigtab[]; // futexwakeup(uint32 *addr) // // Futexsleep atomically checks if *addr == val and if so, sleeps on addr. -// Futexwakeup wakes up one thread sleeping on addr. +// Futexwakeup wakes up threads sleeping on addr. // Futexsleep is allowed to wake up spuriously. enum { + MUTEX_UNLOCKED = 0, + MUTEX_LOCKED = 1, + MUTEX_SLEEPING = 2, + + ACTIVE_SPIN = 4, + ACTIVE_SPIN_CNT = 30, + PASSIVE_SPIN = 1, + FUTEX_WAIT = 0, FUTEX_WAKE = 1, @@ -52,13 +65,13 @@ futexsleep(uint32 *addr, uint32 val) runtime·futex(addr, FUTEX_WAIT, val, &longtime, nil, 0); } -// If any procs are sleeping on addr, wake up at least one. +// If any procs are sleeping on addr, wake up at most cnt. static void -futexwakeup(uint32 *addr) +futexwakeup(uint32 *addr, uint32 cnt) { int64 ret; - ret = runtime·futex(addr, FUTEX_WAKE, 1, nil, nil, 0); + ret = runtime·futex(addr, FUTEX_WAKE, cnt, nil, nil, 0); if(ret >= 0) return; @@ -66,70 +79,96 @@ futexwakeup(uint32 *addr) // I don't know that futex wakeup can return // EAGAIN or EINTR, but if it does, it would be // safe to loop and call futex again. - - runtime·prints("futexwakeup addr="); - runtime·printpointer(addr); - runtime·prints(" returned "); - runtime·printint(ret); - runtime·prints("\n"); + runtime·printf("futexwakeup addr=%p returned %D\n", addr, ret); *(int32*)0x1006 = 0x1006; } +static int32 +getproccount(void) +{ + int32 fd, rd, cnt, cpustrlen; + byte *cpustr, *pos, *bufpos; + byte buf[256]; + + fd = runtime·open((byte*)"/proc/stat", O_RDONLY|O_CLOEXEC, 0); + if(fd == -1) + return 1; + cnt = 0; + bufpos = buf; + cpustr = (byte*)"\ncpu"; + cpustrlen = runtime·findnull(cpustr); + for(;;) { + rd = runtime·read(fd, bufpos, sizeof(buf)-cpustrlen); + if(rd == -1) + break; + bufpos[rd] = 0; + for(pos=buf; pos=runtime·strstr(pos, cpustr); cnt++, pos++) { + } + if(rd < cpustrlen) + break; + runtime·memmove(buf, bufpos+rd-cpustrlen+1, cpustrlen-1); + bufpos = buf+cpustrlen-1; + } + runtime·close(fd); + return cnt ? cnt : 1; +} -// Lock and unlock. -// -// The lock state is a single 32-bit word that holds -// a 31-bit count of threads waiting for the lock -// and a single bit (the low bit) saying whether the lock is held. -// The uncontended case runs entirely in user space. -// When contention is detected, we defer to the kernel (futex). 
-// -// A reminder: compare-and-swap runtime·cas(addr, old, new) does -// if(*addr == old) { *addr = new; return 1; } -// else return 0; -// but atomically. - +// Possible lock states are MUTEX_UNLOCKED, MUTEX_LOCKED and MUTEX_SLEEPING. +// MUTEX_SLEEPING means that there is presumably at least one sleeping thread. +// Note that there can be spinning threads during all states - they do not +// affect mutex's state. static void futexlock(Lock *l) { - uint32 v; + uint32 i, v, wait, spin; -again: - v = l->key; - if((v&1) == 0){ - if(runtime·cas(&l->key, v, v|1)){ - // Lock wasn't held; we grabbed it. - return; + // Speculative grab for lock. + v = runtime·xchg(&l->key, MUTEX_LOCKED); + if(v == MUTEX_UNLOCKED) + return; + + // wait is either MUTEX_LOCKED or MUTEX_SLEEPING + // depending on whether there is a thread sleeping + // on this mutex. If we ever change l->key from + // MUTEX_SLEEPING to some other value, we must be + // careful to change it back to MUTEX_SLEEPING before + // returning, to ensure that the sleeping thread gets + // its wakeup call. + wait = v; + + if(proccount == 0) + proccount = getproccount(); + + // On uniprocessor's, no point spinning. + // On multiprocessors, spin for ACTIVE_SPIN attempts. + spin = 0; + if(proccount > 1) + spin = ACTIVE_SPIN; + + for(;;) { + // Try for lock, spinning. + for(i = 0; i < spin; i++) { + while(l->key == MUTEX_UNLOCKED) + if(runtime·cas(&l->key, MUTEX_UNLOCKED, wait)) + return; + runtime·procyield(ACTIVE_SPIN_CNT); } - goto again; - } - // Lock was held; try to add ourselves to the waiter count. - if(!runtime·cas(&l->key, v, v+2)) - goto again; - - // We're accounted for, now sleep in the kernel. - // - // We avoid the obvious lock/unlock race because - // the kernel won't put us to sleep if l->key has - // changed underfoot and is no longer v+2. - // - // We only really care that (v&1) == 1 (the lock is held), - // and in fact there is a futex variant that could - // accommodate that check, but let's not get carried away.) - futexsleep(&l->key, v+2); - - // We're awake: remove ourselves from the count. - for(;;){ - v = l->key; - if(v < 2) - runtime·throw("bad lock key"); - if(runtime·cas(&l->key, v, v-2)) - break; - } + // Try for lock, rescheduling. + for(i=0; i < PASSIVE_SPIN; i++) { + while(l->key == MUTEX_UNLOCKED) + if(runtime·cas(&l->key, MUTEX_UNLOCKED, wait)) + return; + runtime·osyield(); + } - // Try for the lock again. - goto again; + // Sleep. + v = runtime·xchg(&l->key, MUTEX_SLEEPING); + if(v == MUTEX_UNLOCKED) + return; + wait = MUTEX_SLEEPING; + futexsleep(&l->key, MUTEX_SLEEPING); + } } static void @@ -137,34 +176,26 @@ futexunlock(Lock *l) { uint32 v; - // Atomically get value and clear lock bit. -again: - v = l->key; - if((v&1) == 0) + v = runtime·xchg(&l->key, MUTEX_UNLOCKED); + if(v == MUTEX_UNLOCKED) runtime·throw("unlock of unlocked lock"); - if(!runtime·cas(&l->key, v, v&~1)) - goto again; - - // If there were waiters, wake one. - if(v & ~1) - futexwakeup(&l->key); + if(v == MUTEX_SLEEPING) + futexwakeup(&l->key, 1); } void runtime·lock(Lock *l) { - if(m->locks < 0) - runtime·throw("lock count"); - m->locks++; + if(m->locks++ < 0) + runtime·throw("runtime·lock: lock count"); futexlock(l); } void runtime·unlock(Lock *l) { - m->locks--; - if(m->locks < 0) - runtime·throw("lock count"); + if(--m->locks < 0) + runtime·throw("runtime·unlock: lock count"); futexunlock(l); } @@ -175,35 +206,24 @@ runtime·destroylock(Lock*) // One-time notifications. 
-// -// Since the lock/unlock implementation already -// takes care of sleeping in the kernel, we just reuse it. -// (But it's a weird use, so it gets its own interface.) -// -// We use a lock to represent the event: -// unlocked == event has happened. -// Thus the lock starts out locked, and to wait for the -// event you try to lock the lock. To signal the event, -// you unlock the lock. - void runtime·noteclear(Note *n) { - n->lock.key = 0; // memset(n, 0, sizeof *n) - futexlock(&n->lock); + n->state = 0; } void runtime·notewakeup(Note *n) { - futexunlock(&n->lock); + runtime·xchg(&n->state, 1); + futexwakeup(&n->state, 1<<30); } void runtime·notesleep(Note *n) { - futexlock(&n->lock); - futexunlock(&n->lock); // Let other sleepers find out too. + while(runtime·atomicload(&n->state) == 0) + futexsleep(&n->state, 0); } diff --git a/src/pkg/runtime/malloc.goc b/src/pkg/runtime/malloc.goc index 49ab24df8..b9fe36db6 100644 --- a/src/pkg/runtime/malloc.goc +++ b/src/pkg/runtime/malloc.goc @@ -18,21 +18,6 @@ extern MStats mstats; // defined in extern.go extern volatile int32 runtime·MemProfileRate; -// Same algorithm from chan.c, but a different -// instance of the static uint32 x. -// Not protected by a lock - let the threads use -// the same random number if they like. -static uint32 -fastrand1(void) -{ - static uint32 x = 0x49f6428aUL; - - x += x; - if(x & 0x80000000L) - x ^= 0x88888eefUL; - return x; -} - // Allocate an object of at least size bytes. // Small objects are allocated from the per-thread cache's free lists. // Large objects (> 32 kB) are allocated straight from the heap. @@ -53,18 +38,18 @@ runtime·mallocgc(uintptr size, uint32 flag, int32 dogc, int32 zeroed) if(size == 0) size = 1; - mstats.nmalloc++; + c = m->mcache; + c->local_nmalloc++; if(size <= MaxSmallSize) { // Allocate from mcache free lists. sizeclass = runtime·SizeToClass(size); size = runtime·class_to_size[sizeclass]; - c = m->mcache; v = runtime·MCache_Alloc(c, sizeclass, size, zeroed); if(v == nil) runtime·throw("out of memory"); - mstats.alloc += size; - mstats.total_alloc += size; - mstats.by_size[sizeclass].nmalloc++; + c->local_alloc += size; + c->local_total_alloc += size; + c->local_by_size[sizeclass].nmalloc++; } else { // TODO(rsc): Report tracebacks for very large allocations. @@ -76,8 +61,8 @@ runtime·mallocgc(uintptr size, uint32 flag, int32 dogc, int32 zeroed) if(s == nil) runtime·throw("out of memory"); size = npages<<PageShift; - mstats.alloc += size; - mstats.total_alloc += size; + c->local_alloc += size; + c->local_total_alloc += size; v = (void*)(s->start << PageShift); // setup for mark sweep @@ -97,7 +82,7 @@ runtime·mallocgc(uintptr size, uint32 flag, int32 dogc, int32 zeroed) // pick next profile time if(rate > 0x3fffffff) // make 2*rate not overflow rate = 0x3fffffff; - m->mcache->next_sample = fastrand1() % (2*rate); + m->mcache->next_sample = runtime·fastrand1() % (2*rate); profile: runtime·setblockspecial(v); runtime·MProf_Malloc(v, size); @@ -143,6 +128,7 @@ runtime·free(void *v) // Find size class for v. sizeclass = s->sizeclass; + c = m->mcache; if(sizeclass == 0) { // Large object. size = s->npages<<PageShift; @@ -154,7 +140,6 @@ runtime·free(void *v) runtime·MHeap_Free(&runtime·mheap, s, 1); } else { // Small object. 
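Editor's note: the rewritten Note above also sits directly on the futex: the state word starts at 0, notewakeup publishes 1 and wakes every sleeper, and notesleep blocks only while the word is still 0. A stand-alone sketch under the same assumptions as the lock sketch (Linux, GCC atomics; names are illustrative):

/* futexnote.c -- sketch of a futex-backed one-time notification. */
#include <linux/futex.h>
#include <sys/syscall.h>
#include <unistd.h>
#include <stdint.h>
#include <stddef.h>

static long
futex(uint32_t *addr, int op, uint32_t val)
{
    return syscall(SYS_futex, addr, op, val, NULL, NULL, 0);
}

void
note_clear(uint32_t *n)
{
    *n = 0;
}

void
note_wakeup(uint32_t *n)
{
    __atomic_store_n(n, 1, __ATOMIC_SEQ_CST);
    futex(n, FUTEX_WAKE, 1<<30);    /* wake every waiter */
}

void
note_sleep(uint32_t *n)
{
    /* FUTEX_WAIT returns immediately once *n is no longer 0,
     * so a wakeup between the load and the sleep is not lost. */
    while(__atomic_load_n(n, __ATOMIC_SEQ_CST) == 0)
        futex(n, FUTEX_WAIT, 0);
}

Compared with the old scheme of reusing a Lock for the event, the state word needs no initial lock acquisition and all sleepers are woken at once.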
- c = m->mcache; size = runtime·class_to_size[sizeclass]; if(size > sizeof(uintptr)) ((uintptr*)v)[1] = 1; // mark as "needs to be zeroed" @@ -162,10 +147,10 @@ runtime·free(void *v) // it might coalesce v and other blocks into a bigger span // and change the bitmap further. runtime·markfreed(v, size); - mstats.by_size[sizeclass].nfree++; + c->local_by_size[sizeclass].nfree++; runtime·MCache_Free(c, v, sizeclass, size); } - mstats.alloc -= size; + c->local_alloc -= size; if(prof) runtime·MProf_Free(v, size); m->mallocing = 0; @@ -178,7 +163,7 @@ runtime·mlookup(void *v, byte **base, uintptr *size, MSpan **sp) byte *p; MSpan *s; - mstats.nlookup++; + m->mcache->local_nlookup++; s = runtime·MHeap_LookupMaybe(&runtime·mheap, v); if(sp) *sp = s; @@ -207,9 +192,10 @@ runtime·mlookup(void *v, byte **base, uintptr *size, MSpan **sp) } n = runtime·class_to_size[s->sizeclass]; - i = ((byte*)v - p)/n; - if(base) + if(base) { + i = ((byte*)v - p)/n; *base = p + i*n; + } if(size) *size = n; @@ -229,6 +215,29 @@ runtime·allocmcache(void) return c; } +void +runtime·purgecachedstats(M* m) +{ + MCache *c; + + // Protected by either heap or GC lock. + c = m->mcache; + mstats.heap_alloc += c->local_cachealloc; + c->local_cachealloc = 0; + mstats.heap_objects += c->local_objects; + c->local_objects = 0; + mstats.nmalloc += c->local_nmalloc; + c->local_nmalloc = 0; + mstats.nfree += c->local_nfree; + c->local_nfree = 0; + mstats.nlookup += c->local_nlookup; + c->local_nlookup = 0; + mstats.alloc += c->local_alloc; + c->local_alloc= 0; + mstats.total_alloc += c->local_total_alloc; + c->local_total_alloc= 0; +} + uintptr runtime·sizeof_C_MStats = sizeof(MStats); #define MaxArena32 (2U<<30) @@ -373,46 +382,28 @@ func new(n uint32) (ret *uint8) { ret = runtime·mal(n); } -// Stack allocator uses malloc/free most of the time, -// but if we're in the middle of malloc and need stack, -// we have to do something else to avoid deadlock. -// In that case, we fall back on a fixed-size free-list -// allocator, assuming that inside malloc all the stack -// frames are small, so that all the stack allocations -// will be a single size, the minimum (right now, 5k). -static struct { - Lock; - FixAlloc; -} stacks; - -enum { - FixedStack = StackMin, -}; - void* runtime·stackalloc(uint32 n) { - void *v; - // Stackalloc must be called on scheduler stack, so that we // never try to grow the stack during the code that stackalloc runs. // Doing so would cause a deadlock (issue 1547). if(g != m->g0) runtime·throw("stackalloc not on scheduler stack"); + // Stack allocator uses malloc/free most of the time, + // but if we're in the middle of malloc and need stack, + // we have to do something else to avoid deadlock. + // In that case, we fall back on a fixed-size free-list + // allocator, assuming that inside malloc all the stack + // frames are small, so that all the stack allocations + // will be a single size, the minimum (right now, 5k). 
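Editor's note: the fallback allocator the comment above describes now lives per thread as m->stackalloc, which removes the old global stacks lock. The FixAlloc shape it relies on is just a fixed-size free list; a minimal sketch of that shape follows, with malloc standing in for the runtime's SysAlloc chunking and all names illustrative.

/* fixlist.c -- minimal fixed-size free-list allocator, FixAlloc style. */
#include <stddef.h>
#include <stdlib.h>

typedef struct FixBlock FixBlock;
struct FixBlock {
    FixBlock *next;
};

typedef struct Fix Fix;
struct Fix {
    size_t size;       /* fixed object size, >= sizeof(FixBlock) */
    FixBlock *free;    /* objects returned by fix_free, ready for reuse */
    size_t inuse;      /* bytes handed out and not yet freed */
};

void
fix_init(Fix *f, size_t size)
{
    f->size = size;
    f->free = NULL;
    f->inuse = 0;
}

void*
fix_alloc(Fix *f)
{
    void *v;

    if(f->free != NULL) {        /* reuse a freed object if we have one */
        v = f->free;
        f->free = f->free->next;
    } else
        v = malloc(f->size);     /* otherwise grab a fresh one */
    f->inuse += f->size;
    return v;
}

void
fix_free(Fix *f, void *p)
{
    FixBlock *b;

    b = p;
    b->next = f->free;           /* push back onto the free list */
    f->free = b;
    f->inuse -= f->size;
}

Because every M owns its own instance, stackalloc and stackfree no longer take any lock; cachestats later folds each M's inuse and sys counters back into mstats.stacks_inuse and mstats.stacks_sys.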
if(m->mallocing || m->gcing || n == FixedStack) { - runtime·lock(&stacks); - if(stacks.size == 0) - runtime·FixAlloc_Init(&stacks, n, runtime·SysAlloc, nil, nil); - if(stacks.size != n) { - runtime·printf("stackalloc: in malloc, size=%D want %d", (uint64)stacks.size, n); + if(n != FixedStack) { + runtime·printf("stackalloc: in malloc, size=%d want %d", FixedStack, n); runtime·throw("stackalloc"); } - v = runtime·FixAlloc_Alloc(&stacks); - mstats.stacks_inuse = stacks.inuse; - mstats.stacks_sys = stacks.sys; - runtime·unlock(&stacks); - return v; + return runtime·FixAlloc_Alloc(m->stackalloc); } return runtime·mallocgc(n, FlagNoProfiling|FlagNoGC, 0, 0); } @@ -421,11 +412,7 @@ void runtime·stackfree(void *v, uintptr n) { if(m->mallocing || m->gcing || n == FixedStack) { - runtime·lock(&stacks); - runtime·FixAlloc_Free(&stacks, v); - mstats.stacks_inuse = stacks.inuse; - mstats.stacks_sys = stacks.sys; - runtime·unlock(&stacks); + runtime·FixAlloc_Free(m->stackalloc, v); return; } runtime·free(v); diff --git a/src/pkg/runtime/malloc.h b/src/pkg/runtime/malloc.h index 4e2794570..5bc80f4df 100644 --- a/src/pkg/runtime/malloc.h +++ b/src/pkg/runtime/malloc.h @@ -80,7 +80,6 @@ // This C code was written with an eye toward translating to Go // in the future. Methods have the form Type_Method(Type *t, ...). -typedef struct FixAlloc FixAlloc; typedef struct MCentral MCentral; typedef struct MHeap MHeap; typedef struct MSpan MSpan; @@ -186,10 +185,10 @@ void runtime·FixAlloc_Free(FixAlloc *f, void *p); // Shared with Go: if you edit this structure, also edit extern.go. struct MStats { - // General statistics. No locking; approximate. + // General statistics. uint64 alloc; // bytes allocated and still in use uint64 total_alloc; // bytes allocated (even if freed) - uint64 sys; // bytes obtained from system (should be sum of xxx_sys below) + uint64 sys; // bytes obtained from system (should be sum of xxx_sys below, no locking, approximate) uint64 nlookup; // number of pointer lookups uint64 nmalloc; // number of mallocs uint64 nfree; // number of frees @@ -222,7 +221,6 @@ struct MStats bool debuggc; // Statistics about allocation size classes. - // No locking; approximate. 
struct { uint32 size; uint64 nmalloc; @@ -268,9 +266,20 @@ struct MCache { MCacheList list[NumSizeClasses]; uint64 size; + int64 local_cachealloc; // bytes allocated (or freed) from cache since last lock of heap + int64 local_objects; // objects allocated (or freed) from cache since last lock of heap int64 local_alloc; // bytes allocated (or freed) since last lock of heap - int64 local_objects; // objects allocated (or freed) since last lock of heap + int64 local_total_alloc; // bytes allocated (even if freed) since last lock of heap + int64 local_nmalloc; // number of mallocs since last lock of heap + int64 local_nfree; // number of frees since last lock of heap + int64 local_nlookup; // number of pointer lookups since last lock of heap int32 next_sample; // trigger heap sample after allocating this many bytes + // Statistics about allocation size classes since last lock of heap + struct { + int64 nmalloc; + int64 nfree; + } local_by_size[NumSizeClasses]; + }; void* runtime·MCache_Alloc(MCache *c, int32 sizeclass, uintptr size, int32 zeroed); @@ -379,6 +388,7 @@ void runtime·markspan(void *v, uintptr size, uintptr n, bool leftover); void runtime·unmarkspan(void *v, uintptr size); bool runtime·blockspecial(void*); void runtime·setblockspecial(void*); +void runtime·purgecachedstats(M*); enum { diff --git a/src/pkg/runtime/mcache.c b/src/pkg/runtime/mcache.c index e40621186..711e938fc 100644 --- a/src/pkg/runtime/mcache.c +++ b/src/pkg/runtime/mcache.c @@ -48,7 +48,7 @@ runtime·MCache_Alloc(MCache *c, int32 sizeclass, uintptr size, int32 zeroed) v->next = nil; } } - c->local_alloc += size; + c->local_cachealloc += size; c->local_objects++; return v; } @@ -90,7 +90,7 @@ runtime·MCache_Free(MCache *c, void *v, int32 sizeclass, uintptr size) l->list = p; l->nlist++; c->size += size; - c->local_alloc -= size; + c->local_cachealloc -= size; c->local_objects--; if(l->nlist >= MaxMCacheListLen) { diff --git a/src/pkg/runtime/mem.go b/src/pkg/runtime/mem.go index c3316d44c..93d155a7f 100644 --- a/src/pkg/runtime/mem.go +++ b/src/pkg/runtime/mem.go @@ -62,8 +62,13 @@ func init() { } // MemStats holds statistics about the memory system. -// The statistics are only approximate, as they are not interlocked on update. +// The statistics may be out of date, as the information is +// updated lazily from per-thread caches. +// Use UpdateMemStats to bring the statistics up to date. var MemStats MemStatsType +// UpdateMemStats brings MemStats up to date. +func UpdateMemStats() + // GC runs a garbage collection. func GC() diff --git a/src/pkg/runtime/mgc0.c b/src/pkg/runtime/mgc0.c index ac6a1fa40..6325aadc6 100644 --- a/src/pkg/runtime/mgc0.c +++ b/src/pkg/runtime/mgc0.c @@ -484,6 +484,7 @@ sweep(void) // Mark freed; restore block boundary bit. *bitp = (*bitp & ~(bitMask<<shift)) | (bitBlockBoundary<<shift); + c = m->mcache; if(s->sizeclass == 0) { // Free large span. runtime·unmarkspan(p, 1<<PageShift); @@ -491,14 +492,13 @@ sweep(void) runtime·MHeap_Free(&runtime·mheap, s, 1); } else { // Free small object. 
- c = m->mcache; if(size > sizeof(uintptr)) ((uintptr*)p)[1] = 1; // mark as "needs to be zeroed" - mstats.by_size[s->sizeclass].nfree++; + c->local_by_size[s->sizeclass].nfree++; runtime·MCache_Free(c, p, s->sizeclass, size); } - mstats.alloc -= size; - mstats.nfree++; + c->local_alloc -= size; + c->local_nfree++; } } } @@ -533,14 +533,26 @@ cachestats(void) { M *m; MCache *c; + int32 i; + uint64 stacks_inuse; + uint64 stacks_sys; + stacks_inuse = 0; + stacks_sys = 0; for(m=runtime·allm; m; m=m->alllink) { + runtime·purgecachedstats(m); + stacks_inuse += m->stackalloc->inuse; + stacks_sys += m->stackalloc->sys; c = m->mcache; - mstats.heap_alloc += c->local_alloc; - c->local_alloc = 0; - mstats.heap_objects += c->local_objects; - c->local_objects = 0; + for(i=0; i<nelem(c->local_by_size); i++) { + mstats.by_size[i].nmalloc += c->local_by_size[i].nmalloc; + c->local_by_size[i].nmalloc = 0; + mstats.by_size[i].nfree += c->local_by_size[i].nfree; + c->local_by_size[i].nfree = 0; + } } + mstats.stacks_inuse = stacks_inuse; + mstats.stacks_sys = stacks_sys; } void @@ -603,6 +615,7 @@ runtime·gc(int32 force) sweep(); t2 = runtime·nanotime(); stealcache(); + cachestats(); mstats.next_gc = mstats.heap_alloc+mstats.heap_alloc*gcpercent/100; m->gcing = 0; @@ -650,6 +663,22 @@ runtime·gc(int32 force) runtime·gc(1); } +void +runtime·UpdateMemStats(void) +{ + // Have to acquire gcsema to stop the world, + // because stoptheworld can only be used by + // one goroutine at a time, and there might be + // a pending garbage collection already calling it. + runtime·semacquire(&gcsema); + m->gcing = 1; + runtime·stoptheworld(); + cachestats(); + m->gcing = 0; + runtime·semrelease(&gcsema); + runtime·starttheworld(); +} + static void runfinq(void) { diff --git a/src/pkg/runtime/mheap.c b/src/pkg/runtime/mheap.c index dde31ce34..37d505681 100644 --- a/src/pkg/runtime/mheap.c +++ b/src/pkg/runtime/mheap.c @@ -57,10 +57,7 @@ runtime·MHeap_Alloc(MHeap *h, uintptr npage, int32 sizeclass, int32 acct) MSpan *s; runtime·lock(h); - mstats.heap_alloc += m->mcache->local_alloc; - m->mcache->local_alloc = 0; - mstats.heap_objects += m->mcache->local_objects; - m->mcache->local_objects = 0; + runtime·purgecachedstats(m); s = MHeap_AllocLocked(h, npage, sizeclass); if(s != nil) { mstats.heap_inuse += npage<<PageShift; @@ -258,10 +255,7 @@ void runtime·MHeap_Free(MHeap *h, MSpan *s, int32 acct) { runtime·lock(h); - mstats.heap_alloc += m->mcache->local_alloc; - m->mcache->local_alloc = 0; - mstats.heap_objects += m->mcache->local_objects; - m->mcache->local_objects = 0; + runtime·purgecachedstats(m); mstats.heap_inuse -= s->npages<<PageShift; if(acct) { mstats.heap_alloc -= s->npages<<PageShift; diff --git a/src/pkg/runtime/plan9/mem.c b/src/pkg/runtime/plan9/mem.c index 9dfdf2cc3..f795b2c01 100644 --- a/src/pkg/runtime/plan9/mem.c +++ b/src/pkg/runtime/plan9/mem.c @@ -8,6 +8,7 @@ extern byte end[]; static byte *bloc = { end }; +static Lock memlock; enum { @@ -19,23 +20,31 @@ runtime·SysAlloc(uintptr nbytes) { uintptr bl; + runtime·lock(&memlock); + mstats.sys += nbytes; // Plan 9 sbrk from /sys/src/libc/9sys/sbrk.c bl = ((uintptr)bloc + Round) & ~Round; - if(runtime·brk_((void*)(bl + nbytes)) < 0) + if(runtime·brk_((void*)(bl + nbytes)) < 0) { + runtime·unlock(&memlock); return (void*)-1; + } bloc = (byte*)bl + nbytes; + runtime·unlock(&memlock); return (void*)bl; } void runtime·SysFree(void *v, uintptr nbytes) { + runtime·lock(&memlock); + mstats.sys -= nbytes; // from tiny/mem.c // Push pointer back if this is a free // 
of the most recent SysAlloc. nbytes += (nbytes + Round) & ~Round; if(bloc == (byte*)v+nbytes) bloc -= nbytes; + runtime·unlock(&memlock); } void diff --git a/src/pkg/runtime/plan9/thread.c b/src/pkg/runtime/plan9/thread.c index ef9a23e8e..b091c5978 100644 --- a/src/pkg/runtime/plan9/thread.c +++ b/src/pkg/runtime/plan9/thread.c @@ -47,11 +47,11 @@ runtime·exit(int32) pid = pid/10; } p = buf; - runtime·mcpy((void*)p, (void*)"/proc/", 6); + runtime·memmove((void*)p, (void*)"/proc/", 6); p += 6; for(q--; q >= tmp;) *p++ = *q--; - runtime·mcpy((void*)p, (void*)"/notepg", 7); + runtime·memmove((void*)p, (void*)"/notepg", 7); /* post interrupt note */ fd = runtime·open(buf, OWRITE); @@ -167,3 +167,14 @@ os·sigpipe(void) { runtime·throw("too many writes on closed pipe"); } + +/* + * placeholder - once notes are implemented, + * a signal generating a panic must appear as + * a call to this function for correct handling by + * traceback. + */ +void +runtime·sigpanic(void) +{ +} diff --git a/src/pkg/runtime/print.c b/src/pkg/runtime/print.c index b8069aa39..3ce779495 100644 --- a/src/pkg/runtime/print.c +++ b/src/pkg/runtime/print.c @@ -320,7 +320,7 @@ runtime·printpointer(void *p) void runtime·printstring(String v) { - extern int32 runtime·maxstring; + extern uint32 runtime·maxstring; if(v.len > runtime·maxstring) { runtime·write(2, "[invalid string]", 16); diff --git a/src/pkg/runtime/proc.c b/src/pkg/runtime/proc.c index a8f3a796a..6d8f6990b 100644 --- a/src/pkg/runtime/proc.c +++ b/src/pkg/runtime/proc.c @@ -28,10 +28,10 @@ int32 runtime·gcwaiting; // Go scheduler // // The go scheduler's job is to match ready-to-run goroutines (`g's) -// with waiting-for-work schedulers (`m's). If there are ready gs -// and no waiting ms, ready() will start a new m running in a new -// OS thread, so that all ready gs can run simultaneously, up to a limit. -// For now, ms never go away. +// with waiting-for-work schedulers (`m's). If there are ready g's +// and no waiting m's, ready() will start a new m running in a new +// OS thread, so that all ready g's can run simultaneously, up to a limit. +// For now, m's never go away. // // By default, Go keeps only one kernel thread (m) running user code // at a single time; other threads may be blocked in the operating system. @@ -41,10 +41,10 @@ int32 runtime·gcwaiting; // approximation of the maximum number of cores to use. // // Even a program that can run without deadlock in a single process -// might use more ms if given the chance. For example, the prime -// sieve will use as many ms as there are primes (up to runtime·sched.mmax), +// might use more m's if given the chance. For example, the prime +// sieve will use as many m's as there are primes (up to runtime·sched.mmax), // allowing different stages of the pipeline to execute in parallel. -// We could revisit this choice, only kicking off new ms for blocking +// We could revisit this choice, only kicking off new m's for blocking // system calls, but that would limit the amount of parallel computation // that go would try to do. 
// @@ -55,27 +55,75 @@ int32 runtime·gcwaiting; struct Sched { Lock; - G *gfree; // available gs (status == Gdead) + G *gfree; // available g's (status == Gdead) + int32 goidgen; - G *ghead; // gs waiting to run + G *ghead; // g's waiting to run G *gtail; - int32 gwait; // number of gs waiting to run - int32 gcount; // number of gs that are alive + int32 gwait; // number of g's waiting to run + int32 gcount; // number of g's that are alive + int32 grunning; // number of g's running on cpu or in syscall - M *mhead; // ms waiting for work - int32 mwait; // number of ms waiting for work - int32 mcount; // number of ms that have been created - int32 mcpu; // number of ms executing on cpu - int32 mcpumax; // max number of ms allowed on cpu - int32 msyscall; // number of ms in system calls + M *mhead; // m's waiting for work + int32 mwait; // number of m's waiting for work + int32 mcount; // number of m's that have been created - int32 predawn; // running initialization, don't run new gs. + volatile uint32 atomic; // atomic scheduling word (see below) + + int32 predawn; // running initialization, don't run new g's. int32 profilehz; // cpu profiling rate - Note stopped; // one g can wait here for ms to stop - int32 waitstop; // after setting this flag + Note stopped; // one g can set waitstop and wait here for m's to stop +}; + +// The atomic word in sched is an atomic uint32 that +// holds these fields. +// +// [15 bits] mcpu number of m's executing on cpu +// [15 bits] mcpumax max number of m's allowed on cpu +// [1 bit] waitstop some g is waiting on stopped +// [1 bit] gwaiting gwait != 0 +// +// These fields are the information needed by entersyscall +// and exitsyscall to decide whether to coordinate with the +// scheduler. Packing them into a single machine word lets +// them use a fast path with a single atomic read/write and +// no lock/unlock. This greatly reduces contention in +// syscall- or cgo-heavy multithreaded programs. +// +// Except for entersyscall and exitsyscall, the manipulations +// to these fields only happen while holding the schedlock, +// so the routines holding schedlock only need to worry about +// what entersyscall and exitsyscall do, not the other routines +// (which also use the schedlock). +// +// In particular, entersyscall and exitsyscall only read mcpumax, +// waitstop, and gwaiting. They never write them. Thus, writes to those +// fields can be done (holding schedlock) without fear of write conflicts. +// There may still be logic conflicts: for example, the set of waitstop must +// be conditioned on mcpu >= mcpumax or else the wait may be a +// spurious sleep. The Promela model in proc.p verifies these accesses. +enum { + mcpuWidth = 15, + mcpuMask = (1<<mcpuWidth) - 1, + mcpuShift = 0, + mcpumaxShift = mcpuShift + mcpuWidth, + waitstopShift = mcpumaxShift + mcpuWidth, + gwaitingShift = waitstopShift+1, + + // The max value of GOMAXPROCS is constrained + // by the max value we can store in the bit fields + // of the atomic word. Reserve a few high values + // so that we can detect accidental decrement + // beyond zero. 
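Editor's note: the packed scheduling word is the heart of this change, so a compilable restatement may help. The sketch below packs the two 15-bit counters and the two flag bits exactly as described above and shows the two update shapes used throughout the patch: a CAS loop that claims an mcpu slot (the shape of canaddmcpu) and one that rewrites only the mcpumax field (the shape of setmcpumax). GCC atomics stand in for runtime·cas; the function names are illustrative.

/* schedword.c -- restatement of the packed atomic scheduling word. */
#include <stdint.h>
#include <stdbool.h>

enum {
    mcpuWidth = 15,
    mcpuMask = (1<<mcpuWidth) - 1,
    mcpuShift = 0,
    mcpumaxShift = mcpuShift + mcpuWidth,
    waitstopShift = mcpumaxShift + mcpuWidth,
    gwaitingShift = waitstopShift + 1,
};

#define MCPU(v)      (((v)>>mcpuShift)&mcpuMask)
#define MCPUMAX(v)   (((v)>>mcpumaxShift)&mcpuMask)
#define WAITSTOP(v)  (((v)>>waitstopShift)&1)
#define GWAITING(v)  (((v)>>gwaitingShift)&1)

/* Claim a cpu slot if one is free. */
bool
try_add_mcpu(uint32_t *atomic)
{
    uint32_t v;

    for(;;) {
        v = __atomic_load_n(atomic, __ATOMIC_SEQ_CST);
        if(MCPU(v) >= MCPUMAX(v))
            return false;
        if(__atomic_compare_exchange_n(atomic, &v, v + (1<<mcpuShift), 0,
                __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST))
            return true;
    }
}

/* Rewrite only the mcpumax field, leaving the others alone. */
void
set_mcpumax(uint32_t *atomic, uint32_t n)
{
    uint32_t v, w;

    for(;;) {
        v = __atomic_load_n(atomic, __ATOMIC_SEQ_CST);
        w = v;
        w &= ~(uint32_t)(mcpuMask<<mcpumaxShift);
        w |= n<<mcpumaxShift;
        if(__atomic_compare_exchange_n(atomic, &v, w, 0,
                __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST))
            return;
    }
}

Reserving the top values of the counter range (maxgomaxprocs = mcpuMask - 10) means an accidental decrement below zero wraps into the reserved range, which the "negative mcpu" throws can then catch.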
+ maxgomaxprocs = mcpuMask - 10, }; +#define atomic_mcpu(v) (((v)>>mcpuShift)&mcpuMask) +#define atomic_mcpumax(v) (((v)>>mcpumaxShift)&mcpuMask) +#define atomic_waitstop(v) (((v)>>waitstopShift)&1) +#define atomic_gwaiting(v) (((v)>>gwaitingShift)&1) + Sched runtime·sched; int32 runtime·gomaxprocs; @@ -93,9 +141,25 @@ static void mput(M*); // put/get on mhead static M* mget(G*); static void gfput(G*); // put/get on gfree static G* gfget(void); -static void matchmg(void); // match ms to gs +static void matchmg(void); // match m's to g's static void readylocked(G*); // ready, but sched is locked static void mnextg(M*, G*); +static void mcommoninit(M*); + +void +setmcpumax(uint32 n) +{ + uint32 v, w; + + for(;;) { + v = runtime·sched.atomic; + w = v; + w &= ~(mcpuMask<<mcpumaxShift); + w |= n<<mcpumaxShift; + if(runtime·cas(&runtime·sched.atomic, v, w)) + break; + } +} // The bootstrap sequence is: // @@ -115,10 +179,10 @@ runtime·schedinit(void) int32 n; byte *p; - runtime·allm = m; m->nomemprof++; - runtime·mallocinit(); + mcommoninit(m); + runtime·goargs(); runtime·goenvs(); @@ -129,10 +193,12 @@ runtime·schedinit(void) runtime·gomaxprocs = 1; p = runtime·getenv("GOMAXPROCS"); - if(p != nil && (n = runtime·atoi(p)) != 0) + if(p != nil && (n = runtime·atoi(p)) != 0) { + if(n > maxgomaxprocs) + n = maxgomaxprocs; runtime·gomaxprocs = n; - runtime·sched.mcpumax = runtime·gomaxprocs; - runtime·sched.mcount = 1; + } + setmcpumax(runtime·gomaxprocs); runtime·sched.predawn = 1; m->nomemprof--; @@ -167,7 +233,7 @@ runtime·initdone(void) mstats.enablegc = 1; // If main·init_function started other goroutines, - // kick off new ms to handle them, like ready + // kick off new m's to handle them, like ready // would have, had it not been pre-dawn. schedlock(); matchmg(); @@ -206,6 +272,37 @@ runtime·idlegoroutine(void) g->idlem = m; } +static void +mcommoninit(M *m) +{ + // Add to runtime·allm so garbage collector doesn't free m + // when it is just in a register or thread-local storage. + m->alllink = runtime·allm; + // runtime·Cgocalls() iterates over allm w/o schedlock, + // so we need to publish it safely. + runtime·atomicstorep(&runtime·allm, m); + + m->id = runtime·sched.mcount++; + m->fastrand = 0x49f6428aUL + m->id; + m->stackalloc = runtime·malloc(sizeof(*m->stackalloc)); + runtime·FixAlloc_Init(m->stackalloc, FixedStack, runtime·SysAlloc, nil, nil); +} + +// Try to increment mcpu. Report whether succeeded. +static bool +canaddmcpu(void) +{ + uint32 v; + + for(;;) { + v = runtime·sched.atomic; + if(atomic_mcpu(v) >= atomic_mcpumax(v)) + return 0; + if(runtime·cas(&runtime·sched.atomic, v, v+(1<<mcpuShift))) + return 1; + } +} + // Put on `g' queue. Sched must be locked. static void gput(G *g) @@ -213,11 +310,11 @@ gput(G *g) M *m; // If g is wired, hand it off directly. - if(runtime·sched.mcpu < runtime·sched.mcpumax && (m = g->lockedm) != nil) { + if((m = g->lockedm) != nil && canaddmcpu()) { mnextg(m, g); return; } - + // If g is the idle goroutine for an m, hand it off. if(g->idlem != nil) { if(g->idlem->idleg != nil) { @@ -236,7 +333,18 @@ gput(G *g) else runtime·sched.gtail->schedlink = g; runtime·sched.gtail = g; - runtime·sched.gwait++; + + // increment gwait. + // if it transitions to nonzero, set atomic gwaiting bit. + if(runtime·sched.gwait++ == 0) + runtime·xadd(&runtime·sched.atomic, 1<<gwaitingShift); +} + +// Report whether gget would return something. +static bool +haveg(void) +{ + return runtime·sched.ghead != nil || m->idleg != nil; } // Get from `g' queue. 
Sched must be locked. @@ -250,7 +358,10 @@ gget(void) runtime·sched.ghead = g->schedlink; if(runtime·sched.ghead == nil) runtime·sched.gtail = nil; - runtime·sched.gwait--; + // decrement gwait. + // if it transitions to zero, clear atomic gwaiting bit. + if(--runtime·sched.gwait == 0) + runtime·xadd(&runtime·sched.atomic, -1<<gwaitingShift); } else if(m->idleg != nil) { g = m->idleg; m->idleg = nil; @@ -335,10 +446,11 @@ newprocreadylocked(G *g) } // Pass g to m for running. +// Caller has already incremented mcpu. static void mnextg(M *m, G *g) { - runtime·sched.mcpu++; + runtime·sched.grunning++; m->nextg = g; if(m->waitnextg) { m->waitnextg = 0; @@ -350,18 +462,19 @@ mnextg(M *m, G *g) // Get the next goroutine that m should run. // Sched must be locked on entry, is unlocked on exit. -// Makes sure that at most $GOMAXPROCS gs are +// Makes sure that at most $GOMAXPROCS g's are // running on cpus (not in system calls) at any given time. static G* nextgandunlock(void) { G *gp; + uint32 v; - if(runtime·sched.mcpu < 0) - runtime·throw("negative runtime·sched.mcpu"); + if(atomic_mcpu(runtime·sched.atomic) >= maxgomaxprocs) + runtime·throw("negative mcpu"); - // If there is a g waiting as m->nextg, - // mnextg took care of the runtime·sched.mcpu++. + // If there is a g waiting as m->nextg, the mcpu++ + // happened before it was passed to mnextg. if(m->nextg != nil) { gp = m->nextg; m->nextg = nil; @@ -373,29 +486,62 @@ nextgandunlock(void) // We can only run one g, and it's not available. // Make sure some other cpu is running to handle // the ordinary run queue. - if(runtime·sched.gwait != 0) + if(runtime·sched.gwait != 0) { matchmg(); + // m->lockedg might have been on the queue. + if(m->nextg != nil) { + gp = m->nextg; + m->nextg = nil; + schedunlock(); + return gp; + } + } } else { // Look for work on global queue. - while(runtime·sched.mcpu < runtime·sched.mcpumax && (gp=gget()) != nil) { + while(haveg() && canaddmcpu()) { + gp = gget(); + if(gp == nil) + runtime·throw("gget inconsistency"); + if(gp->lockedm) { mnextg(gp->lockedm, gp); continue; } - runtime·sched.mcpu++; // this m will run gp + runtime·sched.grunning++; schedunlock(); return gp; } - // Otherwise, wait on global m queue. + + // The while loop ended either because the g queue is empty + // or because we have maxed out our m procs running go + // code (mcpu >= mcpumax). We need to check that + // concurrent actions by entersyscall/exitsyscall cannot + // invalidate the decision to end the loop. + // + // We hold the sched lock, so no one else is manipulating the + // g queue or changing mcpumax. Entersyscall can decrement + // mcpu, but if does so when there is something on the g queue, + // the gwait bit will be set, so entersyscall will take the slow path + // and use the sched lock. So it cannot invalidate our decision. + // + // Wait on global m queue. mput(m); } - if(runtime·sched.mcpu == 0 && runtime·sched.msyscall == 0) + + v = runtime·atomicload(&runtime·sched.atomic); + if(runtime·sched.grunning == 0) runtime·throw("all goroutines are asleep - deadlock!"); m->nextg = nil; m->waitnextg = 1; runtime·noteclear(&m->havenextg); - if(runtime·sched.waitstop && runtime·sched.mcpu <= runtime·sched.mcpumax) { - runtime·sched.waitstop = 0; + + // Stoptheworld is waiting for all but its cpu to go to stop. + // Entersyscall might have decremented mcpu too, but if so + // it will see the waitstop and take the slow path. + // Exitsyscall never increments mcpu beyond mcpumax. 
+ if(atomic_waitstop(v) && atomic_mcpu(v) <= atomic_mcpumax(v)) { + // set waitstop = 0 (known to be 1) + runtime·xadd(&runtime·sched.atomic, -1<<waitstopShift); runtime·notewakeup(&runtime·sched.stopped); } schedunlock(); @@ -407,21 +553,34 @@ nextgandunlock(void) return gp; } -// TODO(rsc): Remove. This is only temporary, -// for the mark and sweep collector. void runtime·stoptheworld(void) { + uint32 v; + schedlock(); runtime·gcwaiting = 1; - runtime·sched.mcpumax = 1; - while(runtime·sched.mcpu > 1) { + + setmcpumax(1); + + // while mcpu > 1 + for(;;) { + v = runtime·sched.atomic; + if(atomic_mcpu(v) <= 1) + break; + // It would be unsafe for multiple threads to be using // the stopped note at once, but there is only - // ever one thread doing garbage collection, - // so this is okay. + // ever one thread doing garbage collection. runtime·noteclear(&runtime·sched.stopped); - runtime·sched.waitstop = 1; + if(atomic_waitstop(v)) + runtime·throw("invalid waitstop"); + + // atomic { waitstop = 1 }, predicated on mcpu <= 1 check above + // still being true. + if(!runtime·cas(&runtime·sched.atomic, v, v+(1<<waitstopShift))) + continue; + schedunlock(); runtime·notesleep(&runtime·sched.stopped); schedlock(); @@ -436,7 +595,7 @@ runtime·starttheworld(void) { schedlock(); runtime·gcwaiting = 0; - runtime·sched.mcpumax = runtime·gomaxprocs; + setmcpumax(runtime·gomaxprocs); matchmg(); schedunlock(); } @@ -473,7 +632,7 @@ struct CgoThreadStart void (*fn)(void); }; -// Kick off new ms as needed (up to mcpumax). +// Kick off new m's as needed (up to mcpumax). // There are already `other' other cpus that will // start looking for goroutines shortly. // Sched is locked. @@ -484,17 +643,17 @@ matchmg(void) if(m->mallocing || m->gcing) return; - while(runtime·sched.mcpu < runtime·sched.mcpumax && (g = gget()) != nil){ - M *m; + + while(haveg() && canaddmcpu()) { + g = gget(); + if(g == nil) + runtime·throw("gget inconsistency"); // Find the m that will run g. + M *m; if((m = mget(g)) == nil){ m = runtime·malloc(sizeof(M)); - // Add to runtime·allm so garbage collector doesn't free m - // when it is just in a register or thread-local storage. - m->alllink = runtime·allm; - runtime·allm = m; - m->id = runtime·sched.mcount++; + mcommoninit(m); if(runtime·iscgo) { CgoThreadStart ts; @@ -528,6 +687,7 @@ static void schedule(G *gp) { int32 hz; + uint32 v; schedlock(); if(gp != nil) { @@ -536,10 +696,13 @@ schedule(G *gp) // Just finished running gp. gp->m = nil; - runtime·sched.mcpu--; + runtime·sched.grunning--; + + // atomic { mcpu-- } + v = runtime·xadd(&runtime·sched.atomic, -1<<mcpuShift); + if(atomic_mcpu(v) > maxgomaxprocs) + runtime·throw("negative mcpu in scheduler"); - if(runtime·sched.mcpu < 0) - runtime·throw("runtime·sched.mcpu < 0 in scheduler"); switch(gp->status){ case Grunnable: case Gdead: @@ -574,7 +737,7 @@ schedule(G *gp) gp->status = Grunning; m->curg = gp; gp->m = m; - + // Check whether the profiler needs to be turned on or off. hz = runtime·sched.profilehz; if(m->profilehz != hz) @@ -618,31 +781,50 @@ runtime·gosched(void) void runtime·entersyscall(void) { + uint32 v; + if(runtime·sched.predawn) return; - schedlock(); - g->status = Gsyscall; - runtime·sched.mcpu--; - runtime·sched.msyscall++; - if(runtime·sched.gwait != 0) - matchmg(); - - if(runtime·sched.waitstop && runtime·sched.mcpu <= runtime·sched.mcpumax) { - runtime·sched.waitstop = 0; - runtime·notewakeup(&runtime·sched.stopped); - } // Leave SP around for gc and traceback. 
- // Do before schedunlock so that gc - // never sees Gsyscall with wrong stack. runtime·gosave(&g->sched); g->gcsp = g->sched.sp; g->gcstack = g->stackbase; g->gcguard = g->stackguard; + g->status = Gsyscall; if(g->gcsp < g->gcguard-StackGuard || g->gcstack < g->gcsp) { - runtime·printf("entersyscall inconsistent %p [%p,%p]\n", g->gcsp, g->gcguard-StackGuard, g->gcstack); + // runtime·printf("entersyscall inconsistent %p [%p,%p]\n", + // g->gcsp, g->gcguard-StackGuard, g->gcstack); runtime·throw("entersyscall"); } + + // Fast path. + // The slow path inside the schedlock/schedunlock will get + // through without stopping if it does: + // mcpu-- + // gwait not true + // waitstop && mcpu <= mcpumax not true + // If we can do the same with a single atomic add, + // then we can skip the locks. + v = runtime·xadd(&runtime·sched.atomic, -1<<mcpuShift); + if(!atomic_gwaiting(v) && (!atomic_waitstop(v) || atomic_mcpu(v) > atomic_mcpumax(v))) + return; + + schedlock(); + v = runtime·atomicload(&runtime·sched.atomic); + if(atomic_gwaiting(v)) { + matchmg(); + v = runtime·atomicload(&runtime·sched.atomic); + } + if(atomic_waitstop(v) && atomic_mcpu(v) <= atomic_mcpumax(v)) { + runtime·xadd(&runtime·sched.atomic, -1<<waitstopShift); + runtime·notewakeup(&runtime·sched.stopped); + } + + // Re-save sched in case one of the calls + // (notewakeup, matchmg) triggered something using it. + runtime·gosave(&g->sched); + schedunlock(); } @@ -653,22 +835,28 @@ runtime·entersyscall(void) void runtime·exitsyscall(void) { + uint32 v; + if(runtime·sched.predawn) return; - schedlock(); - runtime·sched.msyscall--; - runtime·sched.mcpu++; - // Fast path - if there's room for this m, we're done. - if(m->profilehz == runtime·sched.profilehz && runtime·sched.mcpu <= runtime·sched.mcpumax) { + // Fast path. + // If we can do the mcpu++ bookkeeping and + // find that we still have mcpu <= mcpumax, then we can + // start executing Go code immediately, without having to + // schedlock/schedunlock. + v = runtime·xadd(&runtime·sched.atomic, (1<<mcpuShift)); + if(m->profilehz == runtime·sched.profilehz && atomic_mcpu(v) <= atomic_mcpumax(v)) { // There's a cpu for us, so we can run. g->status = Grunning; // Garbage collector isn't running (since we are), // so okay to clear gcstack. g->gcstack = nil; - schedunlock(); return; } + + schedlock(); + // Tell scheduler to put g back on the run queue: // mostly equivalent to g->status = Grunning, // but keeps the garbage collector from thinking @@ -676,12 +864,12 @@ runtime·exitsyscall(void) g->readyonstop = 1; schedunlock(); - // Slow path - all the cpus are taken. + // All the cpus are taken. // The scheduler will ready g and put this m to sleep. // When the scheduler takes g away from m, // it will undo the runtime·sched.mcpu++ above. runtime·gosched(); - + // Gosched returned, so we're allowed to run now. // Delete the gcstack information that we left for // the garbage collector during the system call. 
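Editor's note: the fast paths above are the payoff of the packed word: entersyscall and exitsyscall normally do their scheduler bookkeeping with one atomic add and no lock. The sketch below restates that bookkeeping with GCC atomics; the field layout is repeated from the earlier sketch, exitsyscall's profiling-rate check is omitted (as it is in the Promela model further down), and the function names are illustrative.

/* syscallfast.c -- sketch of the lock-free syscall entry/exit accounting. */
#include <stdint.h>
#include <stdbool.h>

enum { mcpuWidth = 15, mcpuMask = (1<<mcpuWidth) - 1, mcpuShift = 0,
    mcpumaxShift = 15, waitstopShift = 30, gwaitingShift = 31 };
#define MCPU(v)      (((v)>>mcpuShift)&mcpuMask)
#define MCPUMAX(v)   (((v)>>mcpumaxShift)&mcpuMask)
#define WAITSTOP(v)  (((v)>>waitstopShift)&1)
#define GWAITING(v)  (((v)>>gwaitingShift)&1)

/* add-and-fetch, like runtime·xadd */
static uint32_t
xadd(uint32_t *p, int32_t delta)
{
    return __atomic_add_fetch(p, (uint32_t)delta, __ATOMIC_SEQ_CST);
}

/* Returns true if the caller may skip schedlock entirely. */
bool
enter_syscall_fast(uint32_t *atomic)
{
    uint32_t v;

    /* mcpu--: safe as a plain add because mcpu > 0 here, so the
     * subtraction cannot borrow into the mcpumax field. */
    v = xadd(atomic, -(1<<mcpuShift));
    return !GWAITING(v) && (!WAITSTOP(v) || MCPU(v) > MCPUMAX(v));
}

/* Returns true if a cpu slot is free and Go code can resume at once. */
bool
exit_syscall_fast(uint32_t *atomic)
{
    uint32_t v;

    v = xadd(atomic, 1<<mcpuShift);    /* mcpu++ */
    return MCPU(v) <= MCPUMAX(v);
}

If the fast-path test fails, both routines fall back to schedlock and redo the checks there, so matchmg and the waitstop wakeup are never missed.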
@@ -698,7 +886,7 @@ runtime·oldstack(void) uint32 argsize; byte *sp; G *g1; - static int32 goid; + int32 goid; //printf("oldstack m->cret=%p\n", m->cret); @@ -709,9 +897,10 @@ runtime·oldstack(void) argsize = old.argsize; if(argsize > 0) { sp -= argsize; - runtime·mcpy(top->argp, sp, argsize); + runtime·memmove(top->argp, sp, argsize); } goid = old.gobuf.g->goid; // fault if g is bad, before gogo + USED(goid); if(old.free != 0) runtime·stackfree(g1->stackguard - StackGuard, old.free); @@ -790,7 +979,7 @@ runtime·newstack(void) sp = (byte*)top; if(argsize > 0) { sp -= argsize; - runtime·mcpy(sp, m->moreargp, argsize); + runtime·memmove(sp, m->moreargp, argsize); } if(thechar == '5') { // caller would have saved its LR below args. @@ -855,7 +1044,7 @@ void runtime·newproc(int32 siz, byte* fn, ...) { byte *argp; - + if(thechar == '5') argp = (byte*)(&fn+2); // skip caller's saved LR else @@ -873,8 +1062,13 @@ runtime·newproc1(byte *fn, byte *argp, int32 narg, int32 nret, void *callerpc) //printf("newproc1 %p %p narg=%d nret=%d\n", fn, argp, narg, nret); siz = narg + nret; siz = (siz+7) & ~7; - if(siz > 1024) - runtime·throw("runtime.newproc: too many args"); + + // We could instead create a secondary stack frame + // and make it look like goexit was on the original but + // the call to the actual goroutine function was split. + // Not worth it: this is almost always an error. + if(siz > StackMin - 1024) + runtime·throw("runtime.newproc: function arguments too large for new goroutine"); schedlock(); @@ -891,7 +1085,7 @@ runtime·newproc1(byte *fn, byte *argp, int32 narg, int32 nret, void *callerpc) sp = newg->stackbase; sp -= siz; - runtime·mcpy(sp, argp, narg); + runtime·memmove(sp, argp, narg); if(thechar == '5') { // caller's LR sp -= sizeof(void*); @@ -905,8 +1099,8 @@ runtime·newproc1(byte *fn, byte *argp, int32 narg, int32 nret, void *callerpc) newg->gopc = (uintptr)callerpc; runtime·sched.gcount++; - runtime·goidgen++; - newg->goid = runtime·goidgen; + runtime·sched.goidgen++; + newg->goid = runtime·sched.goidgen; newprocreadylocked(newg); schedunlock(); @@ -929,11 +1123,11 @@ runtime·deferproc(int32 siz, byte* fn, ...) d->argp = (byte*)(&fn+2); // skip caller's saved link register else d->argp = (byte*)(&fn+1); - runtime·mcpy(d->args, d->argp, d->siz); + runtime·memmove(d->args, d->argp, d->siz); d->link = g->defer; g->defer = d; - + // deferproc returns 0 normally. // a deferred func that stops a panic // makes the deferproc return 1. @@ -956,7 +1150,7 @@ runtime·deferreturn(uintptr arg0) argp = (byte*)&arg0; if(d->argp != argp) return; - runtime·mcpy(argp, d->args, d->siz); + runtime·memmove(argp, d->args, d->siz); g->defer = d->link; fn = d->fn; runtime·free(d); @@ -965,9 +1159,9 @@ runtime·deferreturn(uintptr arg0) static void rundefer(void) -{ +{ Defer *d; - + while((d = g->defer) != nil) { g->defer = d->link; reflect·call(d->fn, d->args, d->siz); @@ -982,7 +1176,7 @@ unwindstack(G *gp, byte *sp) { Stktop *top; byte *stk; - + // Must be called from a different goroutine, usually m->g0. if(g == gp) runtime·throw("unwindstack on self"); @@ -1018,7 +1212,7 @@ printpanics(Panic *p) } static void recovery(G*); - + void runtime·panic(Eface e) { @@ -1068,7 +1262,7 @@ recovery(G *gp) // Rewind gp's stack; we're running on m->g0's stack. d = gp->defer; gp->defer = d->link; - + // Unwind to the stack frame with d's arguments in it. 
unwindstack(gp, d->argp); @@ -1216,25 +1410,29 @@ int32 runtime·gomaxprocsfunc(int32 n) { int32 ret; + uint32 v; schedlock(); ret = runtime·gomaxprocs; - if (n <= 0) + if(n <= 0) n = ret; + if(n > maxgomaxprocs) + n = maxgomaxprocs; runtime·gomaxprocs = n; - if (runtime·gcwaiting != 0) { - if (runtime·sched.mcpumax != 1) - runtime·throw("invalid runtime·sched.mcpumax during gc"); + if(runtime·gcwaiting != 0) { + if(atomic_mcpumax(runtime·sched.atomic) != 1) + runtime·throw("invalid mcpumax during gc"); schedunlock(); return ret; } - runtime·sched.mcpumax = n; - // handle fewer procs? - if(runtime·sched.mcpu > runtime·sched.mcpumax) { + + setmcpumax(n); + + // If there are now fewer allowed procs + // than procs running, stop. + v = runtime·atomicload(&runtime·sched.atomic); + if(atomic_mcpu(v) > n) { schedunlock(); - // just give up the cpu. - // we'll only get rescheduled once the - // number has come down. runtime·gosched(); return ret; } @@ -1301,10 +1499,10 @@ void runtime·sigprof(uint8 *pc, uint8 *sp, uint8 *lr, G *gp) { int32 n; - + if(prof.fn == nil || prof.hz == 0) return; - + runtime·lock(&prof); if(prof.fn == nil) { runtime·unlock(&prof); @@ -1339,7 +1537,7 @@ runtime·setcpuprofilerate(void (*fn)(uintptr*, int32), int32 hz) runtime·lock(&runtime·sched); runtime·sched.profilehz = hz; runtime·unlock(&runtime·sched); - + if(hz != 0) runtime·resetcpuprofiler(hz); } @@ -1355,11 +1553,11 @@ os·setenv_c(String k, String v) return; arg[0] = runtime·malloc(k.len + 1); - runtime·mcpy(arg[0], k.str, k.len); + runtime·memmove(arg[0], k.str, k.len); arg[0][k.len] = 0; arg[1] = runtime·malloc(v.len + 1); - runtime·mcpy(arg[1], v.str, v.len); + runtime·memmove(arg[1], v.str, v.len); arg[1][v.len] = 0; runtime·asmcgocall(libcgo_setenv, arg); diff --git a/src/pkg/runtime/proc.p b/src/pkg/runtime/proc.p new file mode 100644 index 000000000..f0b46de61 --- /dev/null +++ b/src/pkg/runtime/proc.p @@ -0,0 +1,526 @@ +// Copyright 2011 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +/* +model for proc.c as of 2011/07/22. + +takes 4900 seconds to explore 1189070 states +with G=3, var_gomaxprocs=1 +on a Core i7 L640 2.13 GHz Lenovo X201s. + +rm -f proc.p.trail pan.* pan +spin -a proc.p +gcc -DSAFETY -DREACH -DMEMLIM'='4000 -o pan pan.c +pan -w28 -n -i -m500000 +test -f proc.p.trail && pan -r proc.p.trail +*/ + +/* + * scheduling parameters + */ + +/* + * the number of goroutines G doubles as the maximum + * number of OS threads; the max is reachable when all + * the goroutines are blocked in system calls. + */ +#define G 3 + +/* + * whether to allow gomaxprocs to vary during execution. + * enabling this checks the scheduler even when code is + * calling GOMAXPROCS, but it also slows down the verification + * by about 10x. + */ +#define var_gomaxprocs 1 /* allow gomaxprocs to vary */ + +/* gomaxprocs */ +#if var_gomaxprocs +byte gomaxprocs = 3; +#else +#define gomaxprocs 3 +#endif + +/* queue of waiting M's: sched_mhead[:mwait] */ +byte mwait; +byte sched_mhead[G]; + +/* garbage collector state */ +bit gc_lock, gcwaiting; + +/* goroutines sleeping, waiting to run */ +byte gsleep, gwait; + +/* scheduler state */ +bit sched_lock; +bit sched_stopped; +bit atomic_gwaiting, atomic_waitstop; +byte atomic_mcpu, atomic_mcpumax; + +/* M struct fields - state for handing off g to m. 
*/ +bit m_waitnextg[G]; +bit m_havenextg[G]; +bit m_nextg[G]; + +/* + * opt_atomic/opt_dstep mark atomic/deterministics + * sequences that are marked only for reasons of + * optimization, not for correctness of the algorithms. + * + * in general any code that runs while holding the + * schedlock and does not refer to or modify the atomic_* + * fields can be marked atomic/dstep without affecting + * the usefulness of the model. since we trust the lock + * implementation, what we really want to test is the + * interleaving of the atomic fast paths with entersyscall + * and exitsyscall. + */ +#define opt_atomic atomic +#define opt_dstep d_step + +/* locks */ +inline lock(x) { + d_step { x == 0; x = 1 } +} + +inline unlock(x) { + d_step { assert x == 1; x = 0 } +} + +/* notes */ +inline noteclear(x) { + x = 0 +} + +inline notesleep(x) { + x == 1 +} + +inline notewakeup(x) { + opt_dstep { assert x == 0; x = 1 } +} + +/* + * scheduler + */ +inline schedlock() { + lock(sched_lock) +} + +inline schedunlock() { + unlock(sched_lock) +} + +/* + * canaddmcpu is like the C function but takes + * an extra argument to include in the test, to model + * "cannget() && canaddmcpu()" as "canaddmcpu(cangget())" + */ +inline canaddmcpu(g) { + d_step { + g && atomic_mcpu < atomic_mcpumax; + atomic_mcpu++; + } +} + +/* + * gput is like the C function. + * instead of tracking goroutines explicitly we + * maintain only the count of the number of + * waiting goroutines. + */ +inline gput() { + /* omitted: lockedm, idlem concerns */ + opt_dstep { + gwait++; + if + :: gwait == 1 -> + atomic_gwaiting = 1 + :: else + fi + } +} + +/* + * cangget is a macro so it can be passed to + * canaddmcpu (see above). + */ +#define cangget() (gwait>0) + +/* + * gget is like the C function. + */ +inline gget() { + opt_dstep { + assert gwait > 0; + gwait--; + if + :: gwait == 0 -> + atomic_gwaiting = 0 + :: else + fi + } +} + +/* + * mput is like the C function. + * here we do keep an explicit list of waiting M's, + * so that we know which ones can be awakened. + * we use _pid-1 because the monitor is proc 0. + */ +inline mput() { + opt_dstep { + sched_mhead[mwait] = _pid - 1; + mwait++ + } +} + +/* + * mnextg is like the C function mnextg(m, g). + * it passes an unspecified goroutine to m to start running. + */ +inline mnextg(m) { + opt_dstep { + m_nextg[m] = 1; + if + :: m_waitnextg[m] -> + m_waitnextg[m] = 0; + notewakeup(m_havenextg[m]) + :: else + fi + } +} + +/* + * mgetnextg handles the main m handoff in matchmg. + * it is like mget() || new M followed by mnextg(m, g), + * but combined to avoid a local variable. + * unlike the C code, a new M simply assumes it is + * running a g instead of using the mnextg coordination + * to obtain one. + */ +inline mgetnextg() { + opt_atomic { + if + :: mwait > 0 -> + mwait--; + mnextg(sched_mhead[mwait]); + sched_mhead[mwait] = 0; + :: else -> + run mstart(); + fi + } +} + +/* + * nextgandunlock is like the C function. + * it pulls a g off the queue or else waits for one. 
+ */ +inline nextgandunlock() { + assert atomic_mcpu <= G; + + if + :: m_nextg[_pid-1] -> + m_nextg[_pid-1] = 0; + schedunlock(); + :: canaddmcpu(!m_nextg[_pid-1] && cangget()) -> + gget(); + schedunlock(); + :: else -> + opt_dstep { + mput(); + m_nextg[_pid-1] = 0; + m_waitnextg[_pid-1] = 1; + noteclear(m_havenextg[_pid-1]); + } + if + :: atomic_waitstop && atomic_mcpu <= atomic_mcpumax -> + atomic_waitstop = 0; + notewakeup(sched_stopped) + :: else + fi; + schedunlock(); + opt_dstep { + notesleep(m_havenextg[_pid-1]); + assert m_nextg[_pid-1]; + m_nextg[_pid-1] = 0; + } + fi +} + +/* + * stoptheworld is like the C function. + */ +inline stoptheworld() { + schedlock(); + gcwaiting = 1; + atomic_mcpumax = 1; + do + :: d_step { atomic_mcpu > 1 -> + noteclear(sched_stopped); + assert !atomic_waitstop; + atomic_waitstop = 1 } + schedunlock(); + notesleep(sched_stopped); + schedlock(); + :: else -> + break + od; + schedunlock(); +} + +/* + * starttheworld is like the C function. + */ +inline starttheworld() { + schedlock(); + gcwaiting = 0; + atomic_mcpumax = gomaxprocs; + matchmg(); + schedunlock(); +} + +/* + * matchmg is like the C function. + */ +inline matchmg() { + do + :: canaddmcpu(cangget()) -> + gget(); + mgetnextg(); + :: else -> break + od +} + +/* + * ready is like the C function. + * it puts a g on the run queue. + */ +inline ready() { + schedlock(); + gput() + matchmg() + schedunlock() +} + +/* + * schedule simulates the C scheduler. + * it assumes that there is always a goroutine + * running already, and the goroutine has entered + * the scheduler for an unspecified reason, + * either to yield or to block. + */ +inline schedule() { + schedlock(); + + mustsched = 0; + atomic_mcpu--; + assert atomic_mcpu <= G; + if + :: skip -> + // goroutine yields, still runnable + gput(); + :: gsleep+1 < G -> + // goroutine goes to sleep (but there is another that can wake it) + gsleep++ + fi; + + // Find goroutine to run. + nextgandunlock() +} + +/* + * schedpend is > 0 if a goroutine is about to committed to + * entering the scheduler but has not yet done so. + * Just as we don't test for the undesirable conditions when a + * goroutine is in the scheduler, we don't test for them when + * a goroutine will be in the scheduler shortly. + * Modeling this state lets us replace mcpu cas loops with + * simpler mcpu atomic adds. + */ +byte schedpend; + +/* + * entersyscall is like the C function. + */ +inline entersyscall() { + bit willsched; + + /* + * Fast path. Check all the conditions tested during schedlock/schedunlock + * below, and if we can get through the whole thing without stopping, run it + * in one atomic cas-based step. + */ + atomic { + atomic_mcpu--; + if + :: atomic_gwaiting -> + skip + :: atomic_waitstop && atomic_mcpu <= atomic_mcpumax -> + skip + :: else -> + goto Lreturn_entersyscall; + fi; + willsched = 1; + schedpend++; + } + + /* + * Normal path. + */ + schedlock() + opt_dstep { + if + :: willsched -> + schedpend--; + willsched = 0 + :: else + fi + } + if + :: atomic_gwaiting -> + matchmg() + :: else + fi; + if + :: atomic_waitstop && atomic_mcpu <= atomic_mcpumax -> + atomic_waitstop = 0; + notewakeup(sched_stopped) + :: else + fi; + schedunlock(); +Lreturn_entersyscall: + skip +} + +/* + * exitsyscall is like the C function. + */ +inline exitsyscall() { + /* + * Fast path. If there's a cpu available, use it. 
+ */ + atomic { + // omitted profilehz check + atomic_mcpu++; + if + :: atomic_mcpu >= atomic_mcpumax -> + skip + :: else -> + goto Lreturn_exitsyscall + fi + } + + /* + * Normal path. + */ + schedlock(); + d_step { + if + :: atomic_mcpu <= atomic_mcpumax -> + skip + :: else -> + mustsched = 1 + fi + } + schedunlock() +Lreturn_exitsyscall: + skip +} + +#if var_gomaxprocs +inline gomaxprocsfunc() { + schedlock(); + opt_atomic { + if + :: gomaxprocs != 1 -> gomaxprocs = 1 + :: gomaxprocs != 2 -> gomaxprocs = 2 + :: gomaxprocs != 3 -> gomaxprocs = 3 + fi; + } + if + :: gcwaiting != 0 -> + assert atomic_mcpumax == 1 + :: else -> + atomic_mcpumax = gomaxprocs; + if + :: atomic_mcpu > gomaxprocs -> + mustsched = 1 + :: else -> + matchmg() + fi + fi; + schedunlock(); +} +#endif + +/* + * mstart is the entry point for a new M. + * our model of an M is always running some + * unspecified goroutine. + */ +proctype mstart() { + /* + * mustsched is true if the goroutine must enter the + * scheduler instead of continuing to execute. + */ + bit mustsched; + + do + :: skip -> + // goroutine reschedules. + schedule() + :: !mustsched -> + // goroutine does something. + if + :: skip -> + // goroutine executes system call + entersyscall(); + exitsyscall() + :: atomic { gsleep > 0; gsleep-- } -> + // goroutine wakes another goroutine + ready() + :: lock(gc_lock) -> + // goroutine runs a garbage collection + stoptheworld(); + starttheworld(); + unlock(gc_lock) +#if var_gomaxprocs + :: skip -> + // goroutine picks a new gomaxprocs + gomaxprocsfunc() +#endif + fi + od; + + assert 0; +} + +/* + * monitor initializes the scheduler state + * and then watches for impossible conditions. + */ +active proctype monitor() { + opt_dstep { + byte i = 1; + do + :: i < G -> + gput(); + i++ + :: else -> break + od; + atomic_mcpu = 1; + atomic_mcpumax = 1; + } + run mstart(); + + do + // Should never have goroutines waiting with procs available. + :: !sched_lock && schedpend==0 && gwait > 0 && atomic_mcpu < atomic_mcpumax -> + assert 0 + // Should never have gc waiting for stop if things have already stopped. 
+ :: !sched_lock && schedpend==0 && atomic_waitstop && atomic_mcpu <= atomic_mcpumax -> + assert 0 + od +} diff --git a/src/pkg/runtime/proc_test.go b/src/pkg/runtime/proc_test.go index cac4f9eea..32111080a 100644 --- a/src/pkg/runtime/proc_test.go +++ b/src/pkg/runtime/proc_test.go @@ -6,6 +6,7 @@ package runtime_test import ( "runtime" + "sync/atomic" "testing" ) @@ -44,3 +45,81 @@ func TestStopTheWorldDeadlock(t *testing.T) { stop <- true runtime.GOMAXPROCS(maxprocs) } + +func stackGrowthRecursive(i int) { + var pad [128]uint64 + if i != 0 && pad[0] == 0 { + stackGrowthRecursive(i - 1) + } +} + +func BenchmarkStackGrowth(b *testing.B) { + const CallsPerSched = 1000 + procs := runtime.GOMAXPROCS(-1) + N := int32(b.N / CallsPerSched) + c := make(chan bool, procs) + for p := 0; p < procs; p++ { + go func() { + for atomic.AddInt32(&N, -1) >= 0 { + runtime.Gosched() + for g := 0; g < CallsPerSched; g++ { + stackGrowthRecursive(10) + } + } + c <- true + }() + } + for p := 0; p < procs; p++ { + <-c + } +} + +func BenchmarkSyscall(b *testing.B) { + const CallsPerSched = 1000 + procs := runtime.GOMAXPROCS(-1) + N := int32(b.N / CallsPerSched) + c := make(chan bool, procs) + for p := 0; p < procs; p++ { + go func() { + for atomic.AddInt32(&N, -1) >= 0 { + runtime.Gosched() + for g := 0; g < CallsPerSched; g++ { + runtime.Entersyscall() + runtime.Exitsyscall() + } + } + c <- true + }() + } + for p := 0; p < procs; p++ { + <-c + } +} + +func BenchmarkSyscallWork(b *testing.B) { + const CallsPerSched = 1000 + const LocalWork = 100 + procs := runtime.GOMAXPROCS(-1) + N := int32(b.N / CallsPerSched) + c := make(chan bool, procs) + for p := 0; p < procs; p++ { + go func() { + foo := 42 + for atomic.AddInt32(&N, -1) >= 0 { + runtime.Gosched() + for g := 0; g < CallsPerSched; g++ { + runtime.Entersyscall() + for i := 0; i < LocalWork; i++ { + foo *= 2 + foo /= 2 + } + runtime.Exitsyscall() + } + } + c <- foo == 42 + }() + } + for p := 0; p < procs; p++ { + <-c + } +} diff --git a/src/pkg/runtime/runtime.c b/src/pkg/runtime/runtime.c index 1a3653f10..c572897d2 100644 --- a/src/pkg/runtime/runtime.c +++ b/src/pkg/runtime/runtime.c @@ -11,6 +11,14 @@ enum { uint32 runtime·panicking; +/* + * We assume that all architectures turn faults and the like + * into apparent calls to runtime.sigpanic. If we see a "call" + * to runtime.sigpanic, we do not back up the PC to find the + * line number of the CALL instruction, because there is no CALL. + */ +void runtime·sigpanic(void); + int32 runtime·gotraceback(void) { @@ -116,17 +124,6 @@ runtime·panicstring(int8 *s) runtime·panic(err); } -void -runtime·mcpy(byte *t, byte *f, uint32 n) -{ - while(n > 0) { - *t = *f; - t++; - f++; - n--; - } -} - int32 runtime·mcmp(byte *s1, byte *s2, uint32 n) { @@ -218,20 +215,6 @@ runtime·goenvs_unix(void) os·Envs.cap = n; } -// Atomic add and return new value. 
-uint32 -runtime·xadd(uint32 volatile *val, int32 delta) -{ - uint32 oval, nval; - - for(;;){ - oval = *val; - nval = oval + delta; - if(runtime·cas(val, oval, nval)) - return nval; - } -} - byte* runtime·getenv(int8 *s) { @@ -406,18 +389,11 @@ memprint(uint32 s, void *a) static void memcopy(uint32 s, void *a, void *b) { - byte *ba, *bb; - uint32 i; - - ba = a; - bb = b; - if(bb == nil) { - for(i=0; i<s; i++) - ba[i] = 0; + if(b == nil) { + runtime·memclr(a,s); return; } - for(i=0; i<s; i++) - ba[i] = bb[i]; + runtime·memmove(a,b,s); } static uint32 @@ -551,25 +527,35 @@ runtime·nanotime(void) void runtime·Caller(int32 skip, uintptr retpc, String retfile, int32 retline, bool retbool) { - Func *f; + Func *f, *g; uintptr pc; - - if(runtime·callers(1+skip, &retpc, 1) == 0) { + uintptr rpc[2]; + + /* + * Ask for two PCs: the one we were asked for + * and what it called, so that we can see if it + * "called" sigpanic. + */ + retpc = 0; + if(runtime·callers(1+skip-1, rpc, 2) < 2) { retfile = runtime·emptystring; retline = 0; retbool = false; - } else if((f = runtime·findfunc(retpc)) == nil) { + } else if((f = runtime·findfunc(rpc[1])) == nil) { retfile = runtime·emptystring; retline = 0; retbool = true; // have retpc at least } else { + retpc = rpc[1]; retfile = f->src; pc = retpc; - if(pc > f->entry) + g = runtime·findfunc(rpc[0]); + if(pc > f->entry && (g == nil || g->entry != (uintptr)runtime·sigpanic)) pc--; retline = runtime·funcline(f, pc); retbool = true; } + FLUSH(&retpc); FLUSH(&retfile); FLUSH(&retline); FLUSH(&retbool); @@ -588,3 +574,16 @@ runtime·FuncForPC(uintptr pc, void *retf) retf = runtime·findfunc(pc); FLUSH(&retf); } + +uint32 +runtime·fastrand1(void) +{ + uint32 x; + + x = m->fastrand; + x += x; + if(x & 0x80000000L) + x ^= 0x88888eefUL; + m->fastrand = x; + return x; +} diff --git a/src/pkg/runtime/runtime.h b/src/pkg/runtime/runtime.h index ad5da0a96..44511da83 100644 --- a/src/pkg/runtime/runtime.h +++ b/src/pkg/runtime/runtime.h @@ -57,6 +57,7 @@ typedef struct String String; typedef struct Usema Usema; typedef struct SigTab SigTab; typedef struct MCache MCache; +typedef struct FixAlloc FixAlloc; typedef struct Iface Iface; typedef struct Itab Itab; typedef struct Eface Eface; @@ -130,7 +131,10 @@ struct Usema union Note { struct { // Linux - Lock lock; + uint32 state; + }; + struct { // Windows + Lock lock; }; struct { // OS X int32 wakeup; @@ -229,12 +233,15 @@ struct M int32 waitnextg; int32 dying; int32 profilehz; + uint32 fastrand; + uint64 ncgocall; Note havenextg; G* nextg; M* alllink; // on allm M* schedlink; uint32 machport; // Return address for Mach IPC (OS X) MCache *mcache; + FixAlloc *stackalloc; G* lockedg; G* idleg; uint32 freglo[16]; // D[i] lsb and F[i] @@ -368,7 +375,6 @@ extern Alg runtime·algarray[Amax]; extern String runtime·emptystring; G* runtime·allg; M* runtime·allm; -int32 runtime·goidgen; extern int32 runtime·gomaxprocs; extern uint32 runtime·panicking; extern int32 runtime·gcwaiting; // gc is waiting to run @@ -379,6 +385,7 @@ extern bool runtime·iscgo; * common functions and data */ int32 runtime·strcmp(byte*, byte*); +byte* runtime·strstr(byte*, byte*); int32 runtime·findnull(byte*); int32 runtime·findnullw(uint16*); void runtime·dump(byte*, int32); @@ -404,13 +411,13 @@ uint32 runtime·rnd(uint32, uint32); void runtime·prints(int8*); void runtime·printf(int8*, ...); byte* runtime·mchr(byte*, byte, byte*); -void runtime·mcpy(byte*, byte*, uint32); int32 runtime·mcmp(byte*, byte*, uint32); void runtime·memmove(void*, void*, uint32); void* 
runtime·mal(uintptr); String runtime·catstring(String, String); String runtime·gostring(byte*); String runtime·gostringn(byte*, int32); +Slice runtime·gobytes(byte*, int32); String runtime·gostringnocopy(byte*); String runtime·gostringw(uint16*); void runtime·initsig(int32); @@ -424,7 +431,11 @@ bool runtime·casp(void**, void*, void*); // Don't confuse with XADD x86 instruction, // this one is actually 'addx', that is, add-and-fetch. uint32 runtime·xadd(uint32 volatile*, int32); -uint32 runtime·atomicload(uint32 volatile*); +uint32 runtime·xchg(uint32 volatile*, uint32); +uint32 runtime·atomicload(uint32 volatile*); +void runtime·atomicstore(uint32 volatile*, uint32); +void* runtime·atomicloadp(void* volatile*); +void runtime·atomicstorep(void* volatile*, void*); void runtime·jmpdefer(byte*, void*); void runtime·exit1(int32); void runtime·ready(G*); @@ -454,6 +465,7 @@ void runtime·runpanic(Panic*); void* runtime·getcallersp(void*); int32 runtime·mcount(void); void runtime·mcall(void(*)(G*)); +uint32 runtime·fastrand1(void); void runtime·exit(int32); void runtime·breakpoint(void); @@ -590,6 +602,8 @@ void runtime·semacquire(uint32*); void runtime·semrelease(uint32*); String runtime·signame(int32 sig); int32 runtime·gomaxprocsfunc(int32 n); +void runtime·procyield(uint32); +void runtime·osyield(void); void runtime·mapassign(Hmap*, byte*, byte*); void runtime·mapaccess(Hmap*, byte*, byte*, bool*); diff --git a/src/pkg/runtime/slice.c b/src/pkg/runtime/slice.c index 9146c177f..70534279b 100644 --- a/src/pkg/runtime/slice.c +++ b/src/pkg/runtime/slice.c @@ -20,7 +20,7 @@ runtime·makeslice(SliceType *t, int64 len, int64 cap, Slice ret) { if(len < 0 || (int32)len != len) runtime·panicstring("makeslice: len out of range"); - if(cap < len || (int32)cap != cap || cap > ((uintptr)-1) / t->elem->size) + if(cap < len || (int32)cap != cap || t->elem->size > 0 && cap > ((uintptr)-1) / t->elem->size) runtime·panicstring("makeslice: cap out of range"); makeslice1(t, len, cap, &ret); diff --git a/src/pkg/runtime/stack.h b/src/pkg/runtime/stack.h index 2b6b0e387..44d5533f4 100644 --- a/src/pkg/runtime/stack.h +++ b/src/pkg/runtime/stack.h @@ -71,6 +71,7 @@ enum { // If the amount needed for the splitting frame + StackExtra // is less than this number, the stack will have this size instead. StackMin = 4096, + FixedStack = StackMin + StackSystem, // Functions that need frames bigger than this call morestack // unconditionally. 
diff --git a/src/pkg/runtime/string.goc b/src/pkg/runtime/string.goc
index b72aa937c..48bf3183b 100644
--- a/src/pkg/runtime/string.goc
+++ b/src/pkg/runtime/string.goc
@@ -32,19 +32,23 @@ runtime·findnullw(uint16 *s)
 	return l;
 }
 
-int32 runtime·maxstring = 256;
+uint32 runtime·maxstring = 256;
 
 String
 runtime·gostringsize(int32 l)
 {
 	String s;
+	uint32 ms;
 
 	if(l == 0)
 		return runtime·emptystring;
 	s.str = runtime·mal(l+1);	// leave room for NUL for C runtime (e.g., callers of getenv)
 	s.len = l;
-	if(l > runtime·maxstring)
-		runtime·maxstring = l;
+	for(;;) {
+		ms = runtime·maxstring;
+		if((uint32)l <= ms || runtime·cas(&runtime·maxstring, ms, (uint32)l))
+			break;
+	}
 	return s;
 }
 
@@ -56,7 +60,7 @@ runtime·gostring(byte *str)
 
 	l = runtime·findnull(str);
 	s = runtime·gostringsize(l);
-	runtime·mcpy(s.str, str, l);
+	runtime·memmove(s.str, str, l);
 	return s;
 }
 
@@ -66,10 +70,20 @@ runtime·gostringn(byte *str, int32 l)
 	String s;
 
 	s = runtime·gostringsize(l);
-	runtime·mcpy(s.str, str, l);
+	runtime·memmove(s.str, str, l);
 	return s;
 }
 
+Slice
+runtime·gobytes(byte *p, int32 n)
+{
+	Slice sl;
+
+	sl.array = runtime·mallocgc(n, FlagNoPointers, 1, 0);
+	runtime·memmove(sl.array, p, n);
+	return sl;
+}
+
 String
 runtime·gostringnocopy(byte *str)
 {
@@ -109,8 +123,8 @@ runtime·catstring(String s1, String s2)
 		return s1;
 
 	s3 = runtime·gostringsize(s1.len + s2.len);
-	runtime·mcpy(s3.str, s1.str, s1.len);
-	runtime·mcpy(s3.str+s1.len, s2.str, s2.len);
+	runtime·memmove(s3.str, s1.str, s1.len);
+	runtime·memmove(s3.str+s1.len, s2.str, s2.len);
 	return s3;
 }
 
@@ -130,7 +144,7 @@ concatstring(int32 n, String *s)
 	out = runtime·gostringsize(l);
 	l = 0;
 	for(i=0; i<n; i++) {
-		runtime·mcpy(out.str+l, s[i].str, s[i].len);
+		runtime·memmove(out.str+l, s[i].str, s[i].len);
 		l += s[i].len;
 	}
 	return out;
@@ -189,6 +203,28 @@ runtime·strcmp(byte *s1, byte *s2)
 	}
 }
 
+byte*
+runtime·strstr(byte *s1, byte *s2)
+{
+	byte *sp1, *sp2;
+
+	if(*s2 == 0)
+		return s1;
+	for(; *s1; s1++) {
+		if(*s1 != *s2)
+			continue;
+		sp1 = s1;
+		sp2 = s2;
+		for(;;) {
+			if(*sp2 == 0)
+				return s1;
+			if(*sp1++ != *sp2++)
+				break;
+		}
+	}
+	return nil;
+}
+
 func slicestring(si String, lindex int32, hindex int32) (so String) {
 	int32 l;
 
@@ -221,14 +257,14 @@ func intstring(v int64) (s String) {
 
 func slicebytetostring(b Slice) (s String) {
 	s = runtime·gostringsize(b.len);
-	runtime·mcpy(s.str, b.array, s.len);
+	runtime·memmove(s.str, b.array, s.len);
 }
 
 func stringtoslicebyte(s String) (b Slice) {
 	b.array = runtime·mallocgc(s.len, FlagNoPointers, 1, 1);
 	b.len = s.len;
 	b.cap = s.len;
-	runtime·mcpy(b.array, s.str, s.len);
+	runtime·memmove(b.array, s.str, s.len);
 }
 
 func sliceinttostring(b Slice) (s String) {
diff --git a/src/pkg/runtime/symtab.c b/src/pkg/runtime/symtab.c
index da4579734..d2ebf9b40 100644
--- a/src/pkg/runtime/symtab.c
+++ b/src/pkg/runtime/symtab.c
@@ -78,6 +78,7 @@ static int32 nfunc;
 
 static byte **fname;
 static int32 nfname;
+static uint32 funcinit;
 static Lock funclock;
 
 static void
@@ -159,7 +160,7 @@ makepath(byte *buf, int32 nbuf, byte *path)
 			break;
 		if(p > buf && p[-1] != '/')
 			*p++ = '/';
-		runtime·mcpy(p, q, len+1);
+		runtime·memmove(p, q, len+1);
 		p += len;
 	}
 }
@@ -420,10 +421,21 @@ runtime·findfunc(uintptr addr)
 	Func *f;
 	int32 nf, n;
 
-	runtime·lock(&funclock);
-	if(func == nil)
-		buildfuncs();
-	runtime·unlock(&funclock);
+	// Use atomic double-checked locking,
+	// because when called from pprof signal
+	// handler, findfunc must run without
+	// grabbing any locks.
+	// (Before enabling the signal handler,
+	// SetCPUProfileRate calls findfunc to trigger
+	// the initialization outside the handler.)
+	if(runtime·atomicload(&funcinit) == 0) {
+		runtime·lock(&funclock);
+		if(funcinit == 0) {
+			buildfuncs();
+			runtime·atomicstore(&funcinit, 1);
+		}
+		runtime·unlock(&funclock);
+	}
 
 	if(nfunc == 0)
 		return nil;
diff --git a/src/pkg/runtime/symtab_test.go b/src/pkg/runtime/symtab_test.go
new file mode 100644
index 000000000..bd9fe18c4
--- /dev/null
+++ b/src/pkg/runtime/symtab_test.go
@@ -0,0 +1,47 @@
+// Copyright 2009 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package runtime_test
+
+import (
+	"runtime"
+	"strings"
+	"testing"
+)
+
+func TestCaller(t *testing.T) {
+	procs := runtime.GOMAXPROCS(-1)
+	c := make(chan bool, procs)
+	for p := 0; p < procs; p++ {
+		go func() {
+			for i := 0; i < 1000; i++ {
+				testCallerFoo(t)
+			}
+			c <- true
+		}()
+		defer func() {
+			<-c
+		}()
+	}
+}
+
+func testCallerFoo(t *testing.T) {
+	testCallerBar(t)
+}
+
+func testCallerBar(t *testing.T) {
+	for i := 0; i < 2; i++ {
+		pc, file, line, ok := runtime.Caller(i)
+		f := runtime.FuncForPC(pc)
+		if !ok ||
+			!strings.HasSuffix(file, "symtab_test.go") ||
+			(i == 0 && !strings.HasSuffix(f.Name(), "testCallerBar")) ||
+			(i == 1 && !strings.HasSuffix(f.Name(), "testCallerFoo")) ||
+			line < 5 || line > 1000 ||
+			f.Entry() >= pc {
+			t.Errorf("incorrect symbol info %d: %t %d %d %s %s %d",
+				i, ok, f.Entry(), pc, f.Name(), file, line)
+		}
+	}
+}
diff --git a/src/pkg/runtime/windows/amd64/rt0.s b/src/pkg/runtime/windows/amd64/rt0.s
index e54e7edeb..35978bc74 100644
--- a/src/pkg/runtime/windows/amd64/rt0.s
+++ b/src/pkg/runtime/windows/amd64/rt0.s
@@ -8,3 +8,6 @@ TEXT _rt0_amd64_windows(SB),7,$-8
 	MOVQ	$_rt0_amd64(SB), AX
 	MOVQ	SP, DI
 	JMP	AX
+
+DATA runtime·iswindows(SB)/4, $1
+GLOBL runtime·iswindows(SB), $4
diff --git a/src/pkg/runtime/windows/amd64/sys.s b/src/pkg/runtime/windows/amd64/sys.s
index b1eacfc82..2009d164e 100644
--- a/src/pkg/runtime/windows/amd64/sys.s
+++ b/src/pkg/runtime/windows/amd64/sys.s
@@ -20,6 +20,7 @@ TEXT runtime·stdcall_raw(SB),7,$8
 	CMPQ	g(DI), SI
 	JEQ	3(PC)
 	MOVQ	(g_sched+gobuf_sp)(SI), SP
+	ANDQ	$~15, SP
 	MOVQ	SI, g(DI)
 	SUBQ	$0x60, SP
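The change above deletes the portable C fallback for runtime·xadd (a compare-and-swap retry loop in runtime.c) in favor of LOCK XADDL in assembly, and the runtime.h comment stresses that xadd is add-and-fetch rather than the x86 XADD's fetch-and-add. A minimal Go sketch of the same add-and-fetch contract, using sync/atomic in place of runtime·cas; the helper name xadd here is illustrative, not part of the change:

package main

import (
	"fmt"
	"sync/atomic"
)

// xadd atomically adds delta to *val and returns the new value,
// mirroring the contract of the deleted CAS-loop fallback.
func xadd(val *uint32, delta int32) uint32 {
	for {
		oval := atomic.LoadUint32(val)
		nval := oval + uint32(delta)
		if atomic.CompareAndSwapUint32(val, oval, nval) {
			return nval
		}
	}
}

func main() {
	var v uint32
	fmt.Println(xadd(&v, 5))  // 5
	fmt.Println(xadd(&v, -2)) // 3 (negative deltas rely on unsigned wraparound)
}

In current Go, atomic.AddUint32 provides this operation directly; the sketch only spells out the retry loop the assembly replaces.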
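The rewritten runtime·Caller asks for two PCs so it can tell whether the frame in question "called" sigpanic: an ordinary saved PC is a return address and is decremented before the line lookup, while a faulting instruction's PC already points at the instruction itself. A small Go sketch of the usual adjustment from the exported API side; it does not reproduce the sigpanic special case, which needs runtime internals:

package main

import (
	"fmt"
	"runtime"
)

// where reports the file:line of the caller of where. runtime.Callers
// returns return addresses, so the lookup uses pc-1 to land on the CALL
// instruction rather than on the line after it.
func where(skip int) string {
	var rpc [1]uintptr
	if runtime.Callers(skip+2, rpc[:]) < 1 {
		return "unknown"
	}
	f := runtime.FuncForPC(rpc[0])
	if f == nil {
		return "unknown"
	}
	file, line := f.FileLine(rpc[0] - 1)
	return fmt.Sprintf("%s:%d (%s)", file, line, f.Name())
}

func main() {
	fmt.Println(where(0)) // prints the file and line of this call site
}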
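The new runtime·fastrand1 is a tiny shift-and-xor generator kept per M (m->fastrand): double the state and fold in the constant 0x88888eef when the doubled value has its high bit set. A standalone Go port for illustration; the seeding below is an assumption (the state must start non-zero or the sequence is stuck at zero), and the generator is meant for cheap randomized decisions, not for anything needing real randomness:

package main

import "fmt"

// fastrand advances a fastrand1-style state: x = 2x, xor-ing in
// 0x88888eef when the doubled value has bit 31 set. *state must be
// non-zero.
func fastrand(state *uint32) uint32 {
	x := *state
	x += x
	if x&0x80000000 != 0 {
		x ^= 0x88888eef
	}
	*state = x
	return x
}

func main() {
	s := uint32(0x49f6428a) // arbitrary non-zero seed; the runtime seeds each M separately
	for i := 0; i < 4; i++ {
		fmt.Printf("%#08x\n", fastrand(&s))
	}
}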
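The slice.c change guards the capacity check with t->elem->size > 0: without it, making a slice of a zero-sized element type divides by zero while computing the largest representable capacity. A Go sketch of the corrected check; the function name and signature are made up for illustration:

package main

import "fmt"

const maxUintptr = ^uintptr(0)

// capOK reports whether cap elements of size elemSize fit without the
// total byte count overflowing, mirroring the corrected makeslice test:
// skip the division entirely when elemSize is zero.
func capOK(length, capacity, elemSize uintptr) bool {
	if capacity < length {
		return false
	}
	if elemSize > 0 && capacity > maxUintptr/elemSize {
		return false
	}
	return true
}

func main() {
	fmt.Println(capOK(0, 1<<20, 0))        // true: zero-sized elements never overflow
	fmt.Println(capOK(0, maxUintptr/2, 8)) // false: 8*cap would overflow
	fmt.Println(capOK(0, 10, 8))           // true
}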
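string.goc now maintains runtime·maxstring with a CAS loop instead of an unsynchronized store, so concurrent gostringsize calls cannot lose an update. The same "lock-free maximum" pattern in Go, with sync/atomic standing in for runtime·cas (updateMax is an illustrative name):

package main

import (
	"fmt"
	"sync"
	"sync/atomic"
)

// updateMax raises *max to at least n: retry the compare-and-swap until
// either the stored value is already >= n or our store wins the race.
func updateMax(max *uint32, n uint32) {
	for {
		cur := atomic.LoadUint32(max)
		if n <= cur || atomic.CompareAndSwapUint32(max, cur, n) {
			return
		}
	}
}

func main() {
	var maxLen uint32
	var wg sync.WaitGroup
	for i := uint32(1); i <= 100; i++ {
		wg.Add(1)
		go func(n uint32) {
			defer wg.Done()
			updateMax(&maxLen, n)
		}(i)
	}
	wg.Wait()
	fmt.Println(atomic.LoadUint32(&maxLen)) // always 100, regardless of interleaving
}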
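The runtime·strstr added to string.goc is the classic quadratic substring scan over NUL-terminated byte strings: at each position where the first byte matches, walk the needle until it ends (a match) or a byte differs. The same algorithm ported to Go strings, returning an index instead of a pointer; strings.Index is the standard-library equivalent:

package main

import "fmt"

// index returns the offset of the first occurrence of sub in s, or -1.
// Like the C version, an empty needle matches at offset 0, and the
// worst case is O(len(s)*len(sub)).
func index(s, sub string) int {
	if len(sub) == 0 {
		return 0
	}
	for i := 0; i < len(s); i++ {
		if s[i] != sub[0] {
			continue
		}
		j := 0
		for i+j < len(s) && j < len(sub) && s[i+j] == sub[j] {
			j++
		}
		if j == len(sub) {
			return i
		}
	}
	return -1
}

func main() {
	fmt.Println(index("hello, world", "world")) // 7
	fmt.Println(index("abc", "zzz"))            // -1
}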
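The findfunc change is a textbook double-checked initialization: an atomic load on the fast path so a pprof signal handler never touches funclock once the tables are built, and the lock plus a second check only on the slow path. A Go sketch of the same shape using sync/atomic and sync.Mutex; the funcTab type and buildTables below are hypothetical stand-ins for the runtime's symbol tables, and in ordinary Go code sync.Once expresses exactly this pattern:

package main

import (
	"fmt"
	"sync"
	"sync/atomic"
)

// funcTab builds its data lazily, once; readers arriving after
// initialization perform a single atomic load and never lock.
type funcTab struct {
	initDone uint32 // 0 until data is built, then 1 (accessed atomically)
	mu       sync.Mutex
	data     []string // hypothetical stand-in for the function tables
}

func (t *funcTab) get() []string {
	if atomic.LoadUint32(&t.initDone) == 0 { // fast-path check, no lock
		t.mu.Lock()
		if t.initDone == 0 { // re-check under the lock
			t.data = buildTables()
			atomic.StoreUint32(&t.initDone, 1) // publish only after data is ready
		}
		t.mu.Unlock()
	}
	return t.data
}

// buildTables is a placeholder for the expensive one-time setup
// (buildfuncs in the runtime).
func buildTables() []string {
	return []string{"runtime.main", "main.main"}
}

func main() {
	var t funcTab
	fmt.Println(t.get())
}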