11 files changed, 1389 insertions, 1036 deletions
diff --git a/src/cmd/6g/cgen.c b/src/cmd/6g/cgen.c
index 00334e71b..a51c0ca58 100644
--- a/src/cmd/6g/cgen.c
+++ b/src/cmd/6g/cgen.c
@@ -33,9 +33,26 @@ cgen(Node *n, Node *res)
 	while(n->op == OCONVNOP)
 		n = n->left;
 
-	// inline slices
-	if(cgen_inline(n, res))
+	switch(n->op) {
+	case OSLICE:
+	case OSLICEARR:
+	case OSLICESTR:
+		if (res->op != ONAME || !res->addable) {
+			tempname(&n1, n->type);
+			cgen_slice(n, &n1);
+			cgen(&n1, res);
+		} else
+			cgen_slice(n, res);
 		goto ret;
+	case OEFACE:
+		if (res->op != ONAME || !res->addable) {
+			tempname(&n1, n->type);
+			cgen_eface(n, &n1);
+			cgen(&n1, res);
+		} else
+			cgen_eface(n, res);
+		goto ret;
+	}
 
 	if(n->ullman >= UINF) {
 		if(n->op == OINDREG)
@@ -174,7 +191,7 @@ cgen(Node *n, Node *res)
 	switch(n->op) {
 	default:
 		dump("cgen", n);
-		fatal("cgen: unknown op %N", n);
+		fatal("cgen: unknown op %+hN", n);
 		break;
 
 	// these call bgen to get a bool value
@@ -187,12 +204,12 @@ cgen(Node *n, Node *res)
 	case OGE:
 	case OGT:
 	case ONOT:
-		p1 = gbranch(AJMP, T);
+		p1 = gbranch(AJMP, T, 0);
 		p2 = pc;
 		gmove(nodbool(1), res);
-		p3 = gbranch(AJMP, T);
+		p3 = gbranch(AJMP, T, 0);
 		patch(p1, pc);
-		bgen(n, 1, p2);
+		bgen(n, 1, 0, p2);
 		gmove(nodbool(0), res);
 		patch(p3, pc);
 		goto ret;
@@ -229,17 +246,41 @@ cgen(Node *n, Node *res)
 	case OADD:
 	case OMUL:
 		a = optoas(n->op, nl->type);
-		if(a != AIMULB)
-			goto sbop;
-		cgen_bmul(n->op, nl, nr, res);
-		break;
+		if(a == AIMULB) {
+			cgen_bmul(n->op, nl, nr, res);
+			break;
+		}
+		goto sbop;
 
 	// asymmetric binary
 	case OSUB:
 		a = optoas(n->op, nl->type);
 		goto abop;
 
+	case OHMUL:
+		cgen_hmul(nl, nr, res);
+		break;
+
 	case OCONV:
+		if(n->type->width > nl->type->width) {
+			// If loading from memory, do conversion during load,
+			// so as to avoid use of 8-bit register in, say, int(*byteptr).
+			switch(nl->op) {
+			case ODOT:
+			case ODOTPTR:
+			case OINDEX:
+			case OIND:
+			case ONAME:
+				igen(nl, &n1, res);
+				regalloc(&n2, n->type, res);
+				gmove(&n1, &n2);
+				gmove(&n2, res);
+				regfree(&n2);
+				regfree(&n1);
+				goto ret;
+			}
+		}
+
 		regalloc(&n1, nl->type, res);
 		regalloc(&n2, n->type, &n1);
 		cgen(nl, &n1);
@@ -273,18 +314,18 @@ cgen(Node *n, Node *res)
 
 	case OLEN:
 		if(istype(nl->type, TMAP) || istype(nl->type, TCHAN)) {
-			// map and chan have len in the first 32-bit word.
+			// map and chan have len in the first int-sized word.
 			// a zero pointer means zero length
 			regalloc(&n1, types[tptr], res);
 			cgen(nl, &n1);
 
 			nodconst(&n2, types[tptr], 0);
 			gins(optoas(OCMP, types[tptr]), &n1, &n2);
-			p1 = gbranch(optoas(OEQ, types[tptr]), T);
+			p1 = gbranch(optoas(OEQ, types[tptr]), T, 0);
 
 			n2 = n1;
 			n2.op = OINDREG;
-			n2.type = types[TINT32];
+			n2.type = types[simtype[TINT]];
 			gmove(&n2, &n1);
 
 			patch(p1, pc);
@@ -297,7 +338,7 @@ cgen(Node *n, Node *res)
 			// both slice and string have len one pointer into the struct.
 			// a zero pointer means zero length
 			igen(nl, &n1, res);
-			n1.type = types[TUINT32];
+			n1.type = types[simtype[TUINT]];
 			n1.xoffset += Array_nel;
 			gmove(&n1, res);
 			regfree(&n1);
@@ -308,19 +349,19 @@ cgen(Node *n, Node *res)
 
 	case OCAP:
 		if(istype(nl->type, TCHAN)) {
-			// chan has cap in the second 32-bit word.
+			// chan has cap in the second int-sized word.
 			// a zero pointer means zero length
 			regalloc(&n1, types[tptr], res);
 			cgen(nl, &n1);
 
 			nodconst(&n2, types[tptr], 0);
 			gins(optoas(OCMP, types[tptr]), &n1, &n2);
-			p1 = gbranch(optoas(OEQ, types[tptr]), T);
+			p1 = gbranch(optoas(OEQ, types[tptr]), T, 0);
 
 			n2 = n1;
 			n2.op = OINDREG;
-			n2.xoffset = 4;
-			n2.type = types[TINT32];
+			n2.xoffset = widthint;
+			n2.type = types[simtype[TINT]];
 			gmove(&n2, &n1);
 
 			patch(p1, pc);
@@ -331,7 +372,7 @@ cgen(Node *n, Node *res)
 		}
 		if(isslice(nl->type)) {
 			igen(nl, &n1, res);
-			n1.type = types[TUINT32];
+			n1.type = types[simtype[TUINT]];
 			n1.xoffset += Array_cap;
 			gmove(&n1, res);
 			regfree(&n1);
@@ -365,18 +406,53 @@ cgen(Node *n, Node *res)
 			a = optoas(n->op, nl->type);
 			goto abop;
 		}
-		cgen_div(n->op, nl, nr, res);
+
+		if(nl->ullman >= nr->ullman) {
+			regalloc(&n1, nl->type, res);
+			cgen(nl, &n1);
+			cgen_div(n->op, &n1, nr, res);
+			regfree(&n1);
+		} else {
+			if(!smallintconst(nr)) {
+				regalloc(&n2, nr->type, res);
+				cgen(nr, &n2);
+			} else {
+				n2 = *nr;
+			}
+			cgen_div(n->op, nl, &n2, res);
+			if(n2.op != OLITERAL)
+				regfree(&n2);
+		}
 		break;
 
 	case OLSH:
 	case ORSH:
-		cgen_shift(n->op, nl, nr, res);
+	case OLROT:
+		cgen_shift(n->op, n->bounded, nl, nr, res);
 		break;
 	}
 	goto ret;
 
 sbop:	// symmetric binary
-	if(nl->ullman < nr->ullman) {
+	/*
+	 * put simplest on right - we'll generate into left
+	 * and then adjust it using the computation of right.
+	 * constants and variables have the same ullman
+	 * count, so look for constants specially.
+	 *
+	 * an integer constant we can use as an immediate
+	 * is simpler than a variable - we can use the immediate
+	 * in the adjustment instruction directly - so it goes
+	 * on the right.
+	 *
+	 * other constants, like big integers or floating point
+	 * constants, require a mov into a register, so those
+	 * might as well go on the left, so we can reuse that
+	 * register for the computation.
+	 */
+	if(nl->ullman < nr->ullman ||
+	   (nl->ullman == nr->ullman &&
+	    (smallintconst(nl) || (nr->op == OLITERAL && !smallintconst(nr))))) {
 		r = nl;
 		nl = nr;
 		nr = r;
@@ -386,7 +462,13 @@ abop:	// asymmetric binary
 	if(nl->ullman >= nr->ullman) {
 		regalloc(&n1, nl->type, res);
 		cgen(nl, &n1);
-
+	/*
+	 * This generates smaller code - it avoids a MOV - but it's
+	 * easily 10% slower due to not being able to
+	 * optimize/manipulate the move.
+	 * To see, run: go test -bench . crypto/md5
+	 * with and without.
+	 *
 		if(sudoaddable(a, nr, &addr)) {
 			p1 = gins(a, N, &n1);
 			p1->from = addr;
@@ -395,18 +477,30 @@ abop:	// asymmetric binary
 			regfree(&n1);
 			goto ret;
 		}
-		regalloc(&n2, nr->type, N);
-		cgen(nr, &n2);
+	 *
+	 */
+
+		if(smallintconst(nr))
+			n2 = *nr;
+		else {
+			regalloc(&n2, nr->type, N);
+			cgen(nr, &n2);
+		}
 	} else {
-		regalloc(&n2, nr->type, res);
-		cgen(nr, &n2);
+		if(smallintconst(nr))
+			n2 = *nr;
+		else {
+			regalloc(&n2, nr->type, res);
+			cgen(nr, &n2);
+		}
 		regalloc(&n1, nl->type, N);
 		cgen(nl, &n1);
 	}
 	gins(a, &n2, &n1);
 	gmove(&n1, res);
 	regfree(&n1);
-	regfree(&n2);
+	if(n2.op != OLITERAL)
+		regfree(&n2);
 	goto ret;
 
 uop:	// unary
@@ -422,93 +516,142 @@ ret:
 }
 
 /*
- * generate:
- *	res = &n;
+ * allocate a register for a and generate
+ *	a = n	(the value of n, not its address)
+ * The caller must call regfree(a).
  */
 void
-agen(Node *n, Node *res)
+cgenr(Node *n, Node *a, Node *res)
+{
+	Node n1;
+
+	if(debug['g'])
+		dump("cgenr-n", n);
+
+	if(isfat(n->type))	// fat types are refused outright
+		fatal("cgenr on fat node");
+
+	if(n->addable) {	// addressable operand: one move into a fresh register is enough
+		regalloc(a, n->type, res);
+		gmove(n, a);
+		return;
+	}
+
+	switch(n->op) {
+	case ONAME:
+	case ODOT:
+	case ODOTPTR:
+	case OINDEX:
+	case OCALLFUNC:
+	case OCALLMETH:
+	case OCALLINTER:
+		igen(n, &n1, res);	// igen fills n1 with an operand that refers to n
+		regalloc(a, types[tptr], &n1);	// NOTE(review): register is typed types[tptr], not n->type -- confirm intended
+		gmove(&n1, a);	// load the value through that operand
+		regfree(&n1);
+		break;
+	default:	// every other op: evaluate n directly into a register of its own type
+		regalloc(a, n->type, res);
+		cgen(n, a);
+		break;
+	}
+}
+
+/*
+ * allocate a register for a and generate
+ *	a = &n	(the caller must call regfree(a))
+ */
+void
+agenr(Node *n, Node *a, Node *res)
 {
 	Node *nl, *nr;
-	Node n1, n2, n3, tmp, n4, n5;
+	Node n1, n2, n3, n4, n5, tmp, tmp2, nlen;
 	Prog *p1;
+	Type *t;
 	uint32 w;
 	uint64 v;
-	Type *t;
+	int freelen;
 
 	if(debug['g']) {
-		dump("\nagen-res", res);
-		dump("agen-r", n);
-	}
-	if(n == N || n->type == T)
-		return;
-
-	while(n->op == OCONVNOP)
-		n = n->left;
-
-	if(n->addable) {
-		regalloc(&n1, types[tptr], res);
-		gins(ALEAQ, n, &n1);
-		gmove(&n1, res);
-		regfree(&n1);
-		goto ret;
+		dump("\nagenr-n", n);
 	}
 
 	nl = n->left;
 	nr = n->right;
 
 	switch(n->op) {
-	default:
-		fatal("agen: unknown op %N", n);
-		break;
-
+	case ODOT:
+	case ODOTPTR:
+	case OCALLFUNC:
 	case OCALLMETH:
-		cgen_callmeth(n, 0);
-		cgen_aret(n, res);
-		break;
-
 	case OCALLINTER:
-		cgen_callinter(n, res, 0);
-		cgen_aret(n, res);
+		igen(n, &n1, res);
+		regalloc(a, types[tptr], &n1);
+		agen(&n1, a);
+		regfree(&n1);
 		break;
 
-	case OCALLFUNC:
-		cgen_call(n, 0);
-		cgen_aret(n, res);
+	case OIND:
+		cgenr(n->left, a, res);
 		break;
 
 	case OINDEX:
+		freelen = 0;
 		w = n->type->width;
+		// Generate the non-addressable child first.
 		if(nr->addable)
 			goto irad;
 		if(nl->addable) {
-			if(!isconst(nr, CTINT)) {
-				regalloc(&n1, nr->type, N);
-				cgen(nr, &n1);
-			}
+			cgenr(nr, &n1, N);
 			if(!isconst(nl, CTSTR)) {
-				regalloc(&n3, types[tptr], res);
-				agen(nl, &n3);
+				if(isfixedarray(nl->type)) {
+					agenr(nl, &n3, res);
+				} else {
+					igen(nl, &nlen, res);
+					freelen = 1;
+					nlen.type = types[tptr];
+					nlen.xoffset += Array_array;
+					regalloc(&n3, types[tptr], res);
+					gmove(&nlen, &n3);
+					nlen.type = types[simtype[TUINT]];
+					nlen.xoffset += Array_nel-Array_array;
+				}
 			}
 			goto index;
 		}
 		tempname(&tmp, nr->type);
 		cgen(nr, &tmp);
 		nr = &tmp;
-
 	irad:
 		if(!isconst(nl, CTSTR)) {
-			regalloc(&n3, types[tptr], res);
-			agen(nl, &n3);
+			if(isfixedarray(nl->type)) {
+				agenr(nl, &n3, res);
+			} else {
+				if(!nl->addable) {
+					// igen will need an addressable node.
+					tempname(&tmp2, nl->type);
+					cgen(nl, &tmp2);
+					nl = &tmp2;
+				}
+				igen(nl, &nlen, res);
+				freelen = 1;
+				nlen.type = types[tptr];
+				nlen.xoffset += Array_array;
+				regalloc(&n3, types[tptr], res);
+				gmove(&nlen, &n3);
+				nlen.type = types[simtype[TUINT]];
+				nlen.xoffset += Array_nel-Array_array;
+			}
 		}
 		if(!isconst(nr, CTINT)) {
-			regalloc(&n1, nr->type, N);
-			cgen(nr, &n1);
+			cgenr(nr, &n1, N);
 		}
 		goto index;
 
 	index:
 		// &a is in &n3 (allocated in res)
 		// i is in &n1 (if not constant)
+		// len(a) is in nlen (if needed)
 		// w is width
 
 		// explicit check for nil if array is large enough
@@ -529,29 +672,26 @@ agen(Node *n, Node *res)
 				fatal("constant string constant index");	// front end should handle
 			v = mpgetfix(nr->val.u.xval);
 			if(isslice(nl->type) || nl->type->etype == TSTRING) {
-				if(!debug['B'] && !n->etype) {
-					n1 = n3;
-					n1.op = OINDREG;
-					n1.type = types[tptr];
-					n1.xoffset = Array_nel;
-					nodconst(&n2, types[TUINT32], v);
-					gins(optoas(OCMP, types[TUINT32]), &n1, &n2);
-					p1 = gbranch(optoas(OGT, types[TUINT32]), T);
-					ginscall(panicindex, 0);
+				if(!debug['B'] && !n->bounded) {
+					nodconst(&n2, types[simtype[TUINT]], v);
+					if(smallintconst(nr)) {
+						gins(optoas(OCMP, types[simtype[TUINT]]), &nlen, &n2);
+					} else {
+						regalloc(&tmp, types[simtype[TUINT]], N);
+						gmove(&n2, &tmp);
+						gins(optoas(OCMP, types[simtype[TUINT]]), &nlen, &tmp);
+						regfree(&tmp);
+					}
+					p1 = gbranch(optoas(OGT, types[simtype[TUINT]]), T, +1);
+					ginscall(panicindex, -1);
 					patch(p1, pc);
 				}
-
-				n1 = n3;
-				n1.op = OINDREG;
-				n1.type = types[tptr];
-				n1.xoffset = Array_array;
-				gmove(&n1, &n3);
+				regfree(&nlen);
 			}
 
 			if (v*w != 0)
 				ginscon(optoas(OADD, types[tptr]), v*w, &n3);
-			gmove(&n3, res);
-			regfree(&n3);
+			*a = n3;
 			break;
 		}
 
@@ -564,32 +704,32 @@ agen(Node *n, Node *res)
 		gmove(&n1, &n2);
 		regfree(&n1);
 
-		if(!debug['B'] && !n->etype) {
+		if(!debug['B'] && !n->bounded) {
 			// check bounds
-			n5.op = OXXX;
-			t = types[TUINT32];
+			t = types[simtype[TUINT]];
 			if(is64(nr->type))
 				t = types[TUINT64];
 			if(isconst(nl, CTSTR)) {
-				nodconst(&n1, t, nl->val.u.sval->len);
+				nodconst(&nlen, t, nl->val.u.sval->len);
 			} else if(isslice(nl->type) || nl->type->etype == TSTRING) {
-				n1 = n3;
-				n1.op = OINDREG;
-				n1.type = types[TUINT32];
-				n1.xoffset = Array_nel;
 				if(is64(nr->type)) {
 					regalloc(&n5, t, N);
-					gmove(&n1, &n5);
-					n1 = n5;
+					gmove(&nlen, &n5);
+					regfree(&nlen);
+					nlen = n5;
 				}
 			} else {
-				nodconst(&n1, t, nl->type->bound);
+				nodconst(&nlen, t, nl->type->bound);
+				if(!smallintconst(&nlen)) {
+					regalloc(&n5, t, N);
+					gmove(&nlen, &n5);
+					nlen = n5;
+					freelen = 1;
+				}
 			}
-			gins(optoas(OCMP, t), &n2, &n1);
-			p1 = gbranch(optoas(OLT, t), T);
-			if(n5.op != OXXX)
-				regfree(&n5);
-			ginscall(panicindex, 0);
+			gins(optoas(OCMP, t), &n2, &nlen);
+			p1 = gbranch(optoas(OLT, t), T, +1);
+			ginscall(panicindex, -1);
 			patch(p1, pc);
 		}
 
@@ -597,19 +737,15 @@ agen(Node *n, Node *res)
 			regalloc(&n3, types[tptr], res);
 			p1 = gins(ALEAQ, N, &n3);
 			datastring(nl->val.u.sval->s, nl->val.u.sval->len, &p1->from);
-			p1->from.scale = 1;
-			p1->from.index = n2.val.u.reg;
+			if(flag_largemodel) {
+				gins(AADDQ, &n2, &n3);
+			} else {
+				p1->from.scale = 1;
+				p1->from.index = n2.val.u.reg;
+			}
 			goto indexdone;
 		}
 
-		if(isslice(nl->type) || nl->type->etype == TSTRING) {
-			n1 = n3;
-			n1.op = OINDREG;
-			n1.type = types[tptr];
-			n1.xoffset = Array_array;
-			gmove(&n1, &n3);
-		}
-
 		if(w == 0) {
 			// nothing to do
 		} else if(w == 1 || w == 2 || w == 4 || w == 8) {
@@ -623,9 +759,103 @@ agen(Node *n, Node *res)
 		}
 
 	indexdone:
-		gmove(&n3, res);
+		*a = n3;
 		regfree(&n2);
-		regfree(&n3);
+		if(freelen)
+			regfree(&nlen);
+		break;
+
+	default:
+		regalloc(a, types[tptr], res);
+		agen(n, a);
+		break;
+	}
+}
+
+/*
+ * generate:
+ *	res = &n;
+ */
+void
+agen(Node *n, Node *res)
+{
+	Node *nl, *nr;
+	Node n1, n2;
+
+	if(debug['g']) {
+		dump("\nagen-res", res);
+		dump("agen-r", n);
+	}
+	if(n == N || n->type == T)
+		return;
+
+	while(n->op == OCONVNOP)
+		n = n->left;
+
+	if(isconst(n, CTNIL) && n->type->width > widthptr) {
+		// Use of a nil interface or nil slice.
+		// Create a temporary we can take the address of and read.
+		// The generated code is just going to panic, so it need not
+		// be terribly efficient. See issue 3670.
+		tempname(&n1, n->type);
+		clearfat(&n1);
+		regalloc(&n2, types[tptr], res);
+		gins(ALEAQ, &n1, &n2);
+		gmove(&n2, res);
+		regfree(&n2);
+		goto ret;
+	}
+		
+	if(n->addable) {
+		regalloc(&n1, types[tptr], res);
+		gins(ALEAQ, n, &n1);
+		gmove(&n1, res);
+		regfree(&n1);
+		goto ret;
+	}
+
+	nl = n->left;
+	nr = n->right;
+	USED(nr);
+
+	switch(n->op) {
+	default:
+		fatal("agen: unknown op %+hN", n);
+		break;
+
+	case OCALLMETH:
+		cgen_callmeth(n, 0);
+		cgen_aret(n, res);
+		break;
+
+	case OCALLINTER:
+		cgen_callinter(n, res, 0);
+		cgen_aret(n, res);
+		break;
+
+	case OCALLFUNC:
+		cgen_call(n, 0);
+		cgen_aret(n, res);
+		break;
+
+	case OSLICE:
+	case OSLICEARR:
+	case OSLICESTR:
+		tempname(&n1, n->type);
+		cgen_slice(n, &n1);
+		agen(&n1, res);
+		break;
+
+	case OEFACE:
+		tempname(&n1, n->type);
+		cgen_eface(n, &n1);
+		agen(&n1, res);
+		break;
+
+	case OINDEX:
+		agenr(n, &n1, res);
+		gmove(&n1, res);
+		regfree(&n1);
 		break;
 
 	case ONAME:
@@ -692,7 +922,11 @@ igen(Node *n, Node *a, Node *res)
 {
 	Type *fp;
 	Iter flist;
- 
+	Node n1;
+
+	if(debug['g']) {
+		dump("\nigen-n", n);
+	}
 	switch(n->op) {
 	case ONAME:
 		if((n->class&PHEAP) || n->class == PPARAMREF)
@@ -700,9 +934,53 @@ igen(Node *n, Node *a, Node *res)
 		*a = *n;
 		return;
 
+	case OINDREG:
+		// Increase the refcount of the register so that igen's caller
+		// has to call regfree.
+		if(n->val.u.reg != D_SP)
+			reg[n->val.u.reg]++;
+		*a = *n;
+		return;
+
+	case ODOT:
+		igen(n->left, a, res);
+		a->xoffset += n->xoffset;
+		a->type = n->type;
+		return;
+
+	case ODOTPTR:
+		cgenr(n->left, a, res);
+		if(n->xoffset != 0) {
+			// explicit check for nil if struct is large enough
+			// that we might derive too big a pointer.
+			if(n->left->type->type->width >= unmappedzero) {
+				n1 = *a;
+				n1.op = OINDREG;
+				n1.type = types[TUINT8];
+				n1.xoffset = 0;
+				gins(ATESTB, nodintconst(0), &n1);
+			}
+		}
+		a->op = OINDREG;
+		a->xoffset += n->xoffset;
+		a->type = n->type;
+		return;
+
 	case OCALLFUNC:
+	case OCALLMETH:
+	case OCALLINTER:
+		switch(n->op) {
+		case OCALLFUNC:
+			cgen_call(n, 0);
+			break;
+		case OCALLMETH:
+			cgen_callmeth(n, 0);
+			break;
+		case OCALLINTER:
+			cgen_callinter(n, N, 0);
+			break;
+		}
 		fp = structfirst(&flist, getoutarg(n->left->type));
-		cgen_call(n, 0);
 		memset(a, 0, sizeof *a);
 		a->op = OINDREG;
 		a->val.u.reg = D_SP;
@@ -710,10 +988,34 @@ igen(Node *n, Node *a, Node *res)
 		a->xoffset = fp->width;
 		a->type = n->type;
 		return;
+
+	case OINDEX:
+		// Index of fixed-size array by constant can
+		// put the offset in the addressing.
+		// Could do the same for slice except that we need
+		// to use the real index for the bounds checking.
+		if(isfixedarray(n->left->type) ||
+		   (isptr[n->left->type->etype] && isfixedarray(n->left->left->type)))
+		if(isconst(n->right, CTINT)) {
+			// Compute &a.
+			if(!isptr[n->left->type->etype])
+				igen(n->left, a, res);
+			else {
+				igen(n->left, &n1, res);
+				regalloc(a, types[tptr], res);
+				gmove(&n1, a);
+				regfree(&n1);
+				a->op = OINDREG;
+			}
+
+			// Compute &a[i] as &a + i*width.
+			a->type = n->type;
+			a->xoffset += mpgetfix(n->right->val.u.xval)*n->type->width;
+			return;
+		}
 	}
- 
-	regalloc(a, types[tptr], res);
-	agen(n, a);
+
+	agenr(n, a, res);
 	a->op = OINDREG;
 	a->type = n->type;
 }
@@ -723,7 +1025,7 @@ igen(Node *n, Node *a, Node *res)
  *	if(n == true) goto to;
  */
 void
-bgen(Node *n, int true, Prog *to)
+bgen(Node *n, int true, int likely, Prog *to)
 {
 	int et, a;
 	Node *nl, *nr, *l, *r;
@@ -765,14 +1067,14 @@ bgen(Node *n, int true, Prog *to)
 		a = AJNE;
 		if(!true)
 			a = AJEQ;
-		patch(gbranch(a, n->type), to);
+		patch(gbranch(a, n->type, likely), to);
 		regfree(&n1);
 		goto ret;
 
 	case OLITERAL:
 		// need to ask if it is bool?
 		if(!true == !n->val.u.bval)
-			patch(gbranch(AJMP, T), to);
+			patch(gbranch(AJMP, T, likely), to);
 		goto ret;
 
 	case ONAME:
@@ -783,7 +1085,7 @@ bgen(Node *n, int true, Prog *to)
 		a = AJNE;
 		if(!true)
 			a = AJEQ;
-		patch(gbranch(a, n->type), to);
+		patch(gbranch(a, n->type, likely), to);
 		goto ret;
 
 	case OANDAND:
@@ -791,12 +1093,12 @@ bgen(Node *n, int true, Prog *to)
 			goto caseor;
 
 	caseand:
-		p1 = gbranch(AJMP, T);
-		p2 = gbranch(AJMP, T);
+		p1 = gbranch(AJMP, T, 0);
+		p2 = gbranch(AJMP, T, 0);
 		patch(p1, pc);
-		bgen(n->left, !true, p2);
-		bgen(n->right, !true, p2);
-		p1 = gbranch(AJMP, T);
+		bgen(n->left, !true, -likely, p2);
+		bgen(n->right, !true, -likely, p2);
+		p1 = gbranch(AJMP, T, 0);
 		patch(p1, to);
 		patch(p2, pc);
 		goto ret;
@@ -806,8 +1108,8 @@ bgen(Node *n, int true, Prog *to)
 			goto caseand;
 
 	caseor:
-		bgen(n->left, true, to);
-		bgen(n->right, true, to);
+		bgen(n->left, true, likely, to);
+		bgen(n->right, true, likely, to);
 		goto ret;
 
 	case OEQ:
@@ -830,7 +1132,7 @@ bgen(Node *n, int true, Prog *to)
 	switch(n->op) {
 
 	case ONOT:
-		bgen(nl, !true, to);
+		bgen(nl, !true, likely, to);
 		goto ret;
 
 	case OEQ:
@@ -843,14 +1145,14 @@ bgen(Node *n, int true, Prog *to)
 		if(!true) {
 			if(isfloat[nr->type->etype]) {
 				// brcom is not valid on floats when NaN is involved.
-				p1 = gbranch(AJMP, T);
-				p2 = gbranch(AJMP, T);
+				p1 = gbranch(AJMP, T, 0);
+				p2 = gbranch(AJMP, T, 0);
 				patch(p1, pc);
 				ll = n->ninit;   // avoid re-genning ninit
 				n->ninit = nil;
-				bgen(n, 1, p2);
+				bgen(n, 1, -likely, p2);
 				n->ninit = ll;
-				patch(gbranch(AJMP, T), to);
+				patch(gbranch(AJMP, T, 0), to);
 				patch(p2, pc);
 				goto ret;
 			}				
@@ -865,47 +1167,41 @@ bgen(Node *n, int true, Prog *to)
 			nl = nr;
 			nr = r;
 		}
-		
+
 		if(isslice(nl->type)) {
-			// only valid to cmp darray to literal nil
+			// front end should only leave cmp to literal nil
 			if((a != OEQ && a != ONE) || nr->op != OLITERAL) {
-				yyerror("illegal array comparison");
+				yyerror("illegal slice comparison");
 				break;
 			}
 			a = optoas(a, types[tptr]);
-			regalloc(&n1, types[tptr], N);
-			agen(nl, &n1);
-			n2 = n1;
-			n2.op = OINDREG;
-			n2.xoffset = Array_array;
-			n2.type = types[tptr];
+			igen(nl, &n1, N);
+			n1.xoffset += Array_array;
+			n1.type = types[tptr];
 			nodconst(&tmp, types[tptr], 0);
-			gins(optoas(OCMP, types[tptr]), &n2, &tmp);
-			patch(gbranch(a, types[tptr]), to);
+			gins(optoas(OCMP, types[tptr]), &n1, &tmp);
+			patch(gbranch(a, types[tptr], likely), to);
 			regfree(&n1);
 			break;
 		}
 
 		if(isinter(nl->type)) {
-			// front end shold only leave cmp to literal nil
+			// front end should only leave cmp to literal nil
 			if((a != OEQ && a != ONE) || nr->op != OLITERAL) {
 				yyerror("illegal interface comparison");
 				break;
 			}
 			a = optoas(a, types[tptr]);
-			regalloc(&n1, types[tptr], N);
-			agen(nl, &n1);
-			n2 = n1;
-			n2.op = OINDREG;
-			n2.xoffset = 0;
+			igen(nl, &n1, N);
+			n1.type = types[tptr];
 			nodconst(&tmp, types[tptr], 0);
-			gins(optoas(OCMP, types[tptr]), &n2, &tmp);
-			patch(gbranch(a, types[tptr]), to);
+			gins(optoas(OCMP, types[tptr]), &n1, &tmp);
+			patch(gbranch(a, types[tptr], likely), to);
 			regfree(&n1);
 			break;
 		}
 		if(iscomplex[nl->type->etype]) {
-			complexbool(a, nl, nr, true, to);
+			complexbool(a, nl, nr, true, likely, to);
 			break;
 		}
 
@@ -931,7 +1227,7 @@ bgen(Node *n, int true, Prog *to)
 
 		if(smallintconst(nr)) {
 			gins(optoas(OCMP, nr->type), &n1, nr);
-			patch(gbranch(optoas(a, nr->type), nr->type), to);
+			patch(gbranch(optoas(a, nr->type), nr->type, likely), to);
 			regfree(&n1);
 			break;
 		}
@@ -953,18 +1249,18 @@ bgen(Node *n, int true, Prog *to)
 		if(isfloat[nr->type->etype] && (n->op == OEQ || n->op == ONE)) {
 			if(n->op == OEQ) {
 				// neither NE nor P
-				p1 = gbranch(AJNE, T);
-				p2 = gbranch(AJPS, T);
-				patch(gbranch(AJMP, T), to);
+				p1 = gbranch(AJNE, T, -likely);
+				p2 = gbranch(AJPS, T, -likely);
+				patch(gbranch(AJMP, T, 0), to);
 				patch(p1, pc);
 				patch(p2, pc);
 			} else {
 				// either NE or P
-				patch(gbranch(AJNE, T), to);
-				patch(gbranch(AJPS, T), to);
+				patch(gbranch(AJNE, T, likely), to);
+				patch(gbranch(AJPS, T, likely), to);
 			}
 		} else
-			patch(gbranch(optoas(a, nr->type), nr->type), to);
+			patch(gbranch(optoas(a, nr->type), nr->type, likely), to);
 		regfree(&n1);
 		regfree(&n2);
 		break;
@@ -1036,8 +1332,8 @@ stkof(Node *n)
 void
 sgen(Node *n, Node *ns, int64 w)
 {
-	Node nodl, nodr, oldl, oldr, cx, oldcx, tmp;
-	int32 c, q, odst, osrc;
+	Node nodl, nodr, nodsi, noddi, cx, oldcx, tmp;
+	vlong c, q, odst, osrc;
 
 	if(debug['g']) {
 		print("\nsgen w=%lld\n", w);
@@ -1051,9 +1347,9 @@ sgen(Node *n, Node *ns, int64 w)
 	if(w < 0)
 		fatal("sgen copy %lld", w);
 
-	if(w == 16)
-		if(componentgen(n, ns))
-			return;
+	// Avoid taking the address for simple enough types.
+	if(componentgen(n, ns))
+		return;
 	
 	if(w == 0) {
 		// evaluate side effects only
@@ -1080,22 +1376,18 @@ sgen(Node *n, Node *ns, int64 w)
 	}
 
 	if(n->ullman >= ns->ullman) {
-		savex(D_SI, &nodr, &oldr, N, types[tptr]);
-		agen(n, &nodr);
-
-		regalloc(&nodr, types[tptr], &nodr);	// mark nodr as live
-		savex(D_DI, &nodl, &oldl, N, types[tptr]);
-		agen(ns, &nodl);
-		regfree(&nodr);
+		agenr(n, &nodr, N);
+		agenr(ns, &nodl, N);
 	} else {
-		savex(D_DI, &nodl, &oldl, N, types[tptr]);
-		agen(ns, &nodl);
-
-		regalloc(&nodl, types[tptr], &nodl);	// mark nodl as live
-		savex(D_SI, &nodr, &oldr, N, types[tptr]);
-		agen(n, &nodr);
-		regfree(&nodl);
+		agenr(ns, &nodl, N);
+		agenr(n, &nodr, N);
 	}
+	nodreg(&noddi, types[tptr], D_DI);
+	nodreg(&nodsi, types[tptr], D_SI);
+	gmove(&nodl, &noddi);
+	gmove(&nodr, &nodsi);
+	regfree(&nodl);
+	regfree(&nodr);
 
 	c = w % 8;	// bytes
 	q = w / 8;	// quads
@@ -1152,9 +1444,6 @@ sgen(Node *n, Node *ns, int64 w)
 		}
 	}
 
-
-	restx(&nodl, &oldl);
-	restx(&nodr, &oldr);
 	restx(&cx, &oldcx);
 }
 
@@ -1175,15 +1464,21 @@ cadable(Node *n)
 }
 
 /*
- * copy a structure component by component
+ * copy a composite value by moving its individual components.
+ * Slices, strings and interfaces are supported.
+ * Small structs or arrays with elements of basic type are
+ * also supported.
+ * nr is N when assigning a zero value.
  * return 1 if can do, 0 if cant.
- * nr is N for copy zero
  */
 int
 componentgen(Node *nr, Node *nl)
 {
 	Node nodl, nodr;
+	Type *t;
 	int freel, freer;
+	vlong fldcount;
+	vlong loffset, roffset;
 
 	freel = 0;
 	freer = 0;
@@ -1193,8 +1488,33 @@ componentgen(Node *nr, Node *nl)
 		goto no;
 
 	case TARRAY:
-		if(!isslice(nl->type))
+		t = nl->type;
+
+		// Slices are ok.
+		if(isslice(t))
+			break;
+		// Small arrays are ok.
+		if(t->bound > 0 && t->bound <= 3 && !isfat(t->type))
+			break;
+
+		goto no;
+
+	case TSTRUCT:
+		// Small structs with non-fat types are ok.
+		// Zero-sized structs are treated separately elsewhere.
+		fldcount = 0;
+		for(t=nl->type->type; t; t=t->down) {
+			if(isfat(t->type))
+				goto no;
+			if(t->etype != TFIELD)
+				fatal("componentgen: not a TFIELD: %lT", t);
+			fldcount++;
+		}
+		if(fldcount == 0 || fldcount > 3)
 			goto no;
+
+		break;
+
 	case TSTRING:
 	case TINTER:
 		break;
@@ -1218,9 +1538,23 @@ componentgen(Node *nr, Node *nl)
 
 	switch(nl->type->etype) {
 	case TARRAY:
-		if(!isslice(nl->type))
-			goto no;
+		// componentgen for arrays.
+		t = nl->type;
+		if(!isslice(t)) {
+			nodl.type = t->type;
+			nodr.type = nodl.type;
+			for(fldcount=0; fldcount < t->bound; fldcount++) {
+				if(nr == N)
+					clearslim(&nodl);
+				else
+					gmove(&nodr, &nodl);
+				nodl.xoffset += t->type->width;
+				nodr.xoffset += t->type->width;
+			}
+			goto yes;
+		}
 
+		// componentgen for slices.
 		nodl.xoffset += Array_array;
 		nodl.type = ptrto(nl->type->type);
 
@@ -1232,7 +1566,7 @@ componentgen(Node *nr, Node *nl)
 		gmove(&nodr, &nodl);
 
 		nodl.xoffset += Array_nel-Array_array;
-		nodl.type = types[TUINT32];
+		nodl.type = types[simtype[TUINT]];
 
 		if(nr != N) {
 			nodr.xoffset += Array_nel-Array_array;
@@ -1242,7 +1576,7 @@ componentgen(Node *nr, Node *nl)
 		gmove(&nodr, &nodl);
 
 		nodl.xoffset += Array_cap-Array_nel;
-		nodl.type = types[TUINT32];
+		nodl.type = types[simtype[TUINT]];
 
 		if(nr != N) {
 			nodr.xoffset += Array_cap-Array_nel;
@@ -1265,7 +1599,7 @@ componentgen(Node *nr, Node *nl)
 		gmove(&nodr, &nodl);
 
 		nodl.xoffset += Array_nel-Array_array;
-		nodl.type = types[TUINT32];
+		nodl.type = types[simtype[TUINT]];
 
 		if(nr != N) {
 			nodr.xoffset += Array_nel-Array_array;
@@ -1300,7 +1634,27 @@ componentgen(Node *nr, Node *nl)
 		goto yes;
 
 	case TSTRUCT:
-		goto no;
+		loffset = nodl.xoffset;
+		roffset = nodr.xoffset;
+		// funarg structs may not begin at offset zero.
+		if(nl->type->etype == TSTRUCT && nl->type->funarg && nl->type->type)
+			loffset -= nl->type->type->width;
+		if(nr != N && nr->type->etype == TSTRUCT && nr->type->funarg && nr->type->type)
+			roffset -= nr->type->type->width;
+
+		for(t=nl->type->type; t; t=t->down) {
+			nodl.xoffset = loffset + t->width;
+			nodl.type = t->type;
+
+			if(nr == N)
+				clearslim(&nodl);
+			else {
+				nodr.xoffset = roffset + t->width;
+				nodr.type = nodl.type;
+				gmove(&nodr, &nodl);
+			}
+		}
+		goto yes;
 	}
 
 no:
diff --git a/src/cmd/6g/doc.go b/src/cmd/6g/doc.go
index 64f1d2ba9..07b2818da 100644
--- a/src/cmd/6g/doc.go
+++ b/src/cmd/6g/doc.go
@@ -2,6 +2,8 @@
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
 
+// +build ignore
+
 /*
 
 6g is the version of the gc compiler for the x86-64.
@@ -10,4 +12,4 @@ The $GOARCH for these tools is amd64.
 It reads .go files and outputs .6 files. The flags are documented in ../gc/doc.go.
 
 */
-package documentation
+package main
diff --git a/src/cmd/6g/galign.c b/src/cmd/6g/galign.c
index b03ac1ed6..526c04c06 100644
--- a/src/cmd/6g/galign.c
+++ b/src/cmd/6g/galign.c
@@ -17,8 +17,8 @@ vlong MAXWIDTH = 1LL<<50;
  */
 Typedef	typedefs[] =
 {
-	"int",		TINT,		TINT32,
-	"uint",		TUINT,		TUINT32,
+	"int",		TINT,		TINT64,	// NOTE(review): 64-bit here goes with widthint = 8 in betypeinit -- keep the two in step
+	"uint",		TUINT,		TUINT64,	// same width as the "int" row above
 	"uintptr",	TUINTPTR,	TUINT64,
 	0
 };
@@ -27,6 +27,7 @@ void
 betypeinit(void)
 {
 	widthptr = 8;
+	widthint = 8;
 
 	zprog.link = P;
 	zprog.as = AGOK;
diff --git a/src/cmd/6g/gg.h b/src/cmd/6g/gg.h
index 47a540082..ceb6a2caa 100644
--- a/src/cmd/6g/gg.h
+++ b/src/cmd/6g/gg.h
@@ -14,19 +14,22 @@ typedef	struct	Addr	Addr;
 struct	Addr
 {
 	vlong	offset;
-	double	dval;
-	Prog*	branch;
-	char	sval[NSNAME];
+	
+	union {	// the value fields below overlap in storage
+		double	dval;
+		vlong	vval;
+		Prog*	branch;
+		char	sval[NSNAME];
+	} u;	// members are reached as u.dval, u.vval, u.branch, u.sval
 
 	Sym*	gotype;
 	Sym*	sym;
 	Node*	node;
-	int	width;
+	int64	width;
 	uchar	type;
 	uchar	index;
 	uchar	etype;
 	uchar	scale;	/* doubles as width in DATA op */
-	uchar	pun;	/* dont register variable */
 };
 #define	A	((Addr*)0)
 
@@ -58,7 +61,7 @@ EXTERN	Node*	throwreturn;
 extern	vlong	unmappedzero;
 
 /*
- * gen.c
+ * ggen.c
  */
 void	compile(Node*);
 void	proglist(void);
@@ -71,29 +74,31 @@ void	cgen_proc(Node*, int);
 void	cgen_callret(Node*, Node*);
 void	cgen_div(int, Node*, Node*, Node*);
 void	cgen_bmul(int, Node*, Node*, Node*);
-void	cgen_shift(int, Node*, Node*, Node*);
+void	cgen_hmul(Node*, Node*, Node*);
+void	cgen_shift(int, int, Node*, Node*, Node*);
 void	cgen_dcl(Node*);
 int	needconvert(Type*, Type*);
 void	genconv(Type*, Type*);
 void	allocparams(void);
-void	checklabels();
+void	checklabels(void);
 void	ginscall(Node*, int);
 int	gen_as_init(Node*);
+void	clearslim(Node*);
 
 /*
- * cgen
+ * cgen.c
  */
 void	agen(Node*, Node*);
+void	agenr(Node*, Node*, Node*);
+void	cgenr(Node*, Node*, Node*);
 void	igen(Node*, Node*, Node*);
 vlong	fieldoffset(Type*, Node*);
-void	bgen(Node*, int, Prog*);
 void	sgen(Node*, Node*, int64);
 void	gmove(Node*, Node*);
 Prog*	gins(int, Node*, Node*);
 int	samaddr(Node*, Node*);
 void	naddr(Node*, Addr*, int);
 void	cgen_aret(Node*, Node*);
-int	cgen_inline(Node*, Node*);
 void	restx(Node*, Node*);
 void	savex(int, Node*, Node*, Node*, Type*);
 int	componentgen(Node*, Node*);
@@ -103,9 +108,8 @@ int	componentgen(Node*, Node*);
  */
 void	clearp(Prog*);
 void	proglist(void);
-Prog*	gbranch(int, Type*);
+Prog*	gbranch(int, Type*, int);
 Prog*	prog(int);
-void	gaddoffset(Node*);
 void	gconv(int, int);
 int	conv2pt(Type*);
 vlong	convvtox(vlong, int);
@@ -126,9 +130,9 @@ Plist*	newplist(void);
 int	isfat(Type*);
 void	sudoclean(void);
 int	sudoaddable(int, Node*, Addr*);
-void	afunclit(Addr*);
-void	datagostring(Strlit*, Addr*);
+void	afunclit(Addr*, Node*);
 void	nodfconst(Node*, Type*, Mpflt*);
+void	gtrack(Sym*);
 
 /*
  * cplx.c
@@ -136,12 +140,12 @@ void	nodfconst(Node*, Type*, Mpflt*);
 int	complexop(Node*, Node*);
 void	complexmove(Node*, Node*);
 void	complexgen(Node*, Node*);
-void	complexbool(int, Node*, Node*, int, Prog*);
 
 /*
  * gobj.c
  */
 void	datastring(char*, int, Addr*);
+void	datagostring(Strlit*, Addr*);
 
 /*
  * list.c
diff --git a/src/cmd/6g/ggen.c b/src/cmd/6g/ggen.c
index 02e67d6d4..23bb5093f 100644
--- a/src/cmd/6g/ggen.c
+++ b/src/cmd/6g/ggen.c
@@ -25,6 +25,9 @@ void
 markautoused(Prog* p)
 {
 	for (; p; p = p->link) {
+		if (p->as == ATYPE)
+			continue;
+
 		if (p->from.type == D_AUTO && p->from.node)
 			p->from.node->used = 1;
 
@@ -35,14 +38,22 @@ markautoused(Prog* p)
 
 // Fixup instructions after compactframe has moved all autos around.
 void
-fixautoused(Prog* p)
+fixautoused(Prog *p)
 {
-	for (; p; p = p->link) {
+	Prog **lp;	// slot that currently points at p, so p can be unlinked
+
+	for (lp=&p; (p=*lp) != P; ) {	// NOTE(review): lp starts at the by-value parameter, so dropping the first Prog is invisible to the caller
+		if (p->as == ATYPE && p->from.node && p->from.type == D_AUTO && !p->from.node->used) {
+			*lp = p->link;	// splice out an ATYPE whose auto was never marked used
+			continue;	// lp stays put: it now points at the successor
+		}
 		if (p->from.type == D_AUTO && p->from.node)
 			p->from.offset += p->from.node->stkdelta;
 
 		if (p->to.type == D_AUTO && p->to.node)
 			p->to.offset += p->to.node->stkdelta;
+
+		lp = &p->link;	// advance the slot only when p was kept
 	}
 }
 
@@ -50,15 +61,18 @@ fixautoused(Prog* p)
 /*
  * generate:
  *	call f
+ *	proc=-1	normal call but no return
  *	proc=0	normal call
  *	proc=1	goroutine run in new proc
  *	proc=2	defer call save away stack
+ *	proc=3	normal call to C pointer (not Go func value)
  */
 void
 ginscall(Node *f, int proc)
 {
 	Prog *p;
 	Node reg, con;
+	Node r1;
 
 	switch(proc) {
 	default:
@@ -66,14 +80,38 @@ ginscall(Node *f, int proc)
 		break;
 
 	case 0:	// normal call
-		p = gins(ACALL, N, f);
-		afunclit(&p->to);
+	case -1:	// normal call but no return
+		if(f->op == ONAME && f->class == PFUNC) {
+			p = gins(ACALL, N, f);
+			afunclit(&p->to, f);
+			if(proc == -1 || noreturn(p))
+				gins(AUNDEF, N, N);
+			break;
+		}
+		nodreg(&reg, types[tptr], D_DX);
+		nodreg(&r1, types[tptr], D_BX);
+		gmove(f, &reg);
+		reg.op = OINDREG;
+		gmove(&reg, &r1);
+		reg.op = OREGISTER;
+		gins(ACALL, &reg, &r1);
+		break;
+	
+	case 3:	// normal call of c function pointer
+		gins(ACALL, N, f);
 		break;
 
 	case 1:	// call in new proc (go)
 	case 2:	// deferred call (defer)
 		nodreg(&reg, types[TINT64], D_CX);
-		gins(APUSHQ, f, N);
+		if(flag_largemodel) {
+			regalloc(&r1, f->type, f);
+			gmove(f, &r1);
+			gins(APUSHQ, &r1, N);
+			regfree(&r1);
+		} else {
+			gins(APUSHQ, f, N);
+		}
 		nodconst(&con, types[TINT32], argsize(f->type));
 		gins(APUSHQ, &con, N);
 		if(proc == 1)
@@ -88,7 +126,7 @@ ginscall(Node *f, int proc)
 		if(proc == 2) {
 			nodreg(&reg, types[TINT64], D_AX);
 			gins(ATESTQ, &reg, &reg);
-			patch(gbranch(AJNE, T), retpc);
+			patch(gbranch(AJNE, T, -1), retpc);
 		}
 		break;
 	}
@@ -102,7 +140,7 @@ void
 cgen_callinter(Node *n, Node *res, int proc)
 {
 	Node *i, *f;
-	Node tmpi, nodo, nodr, nodsp;
+	Node tmpi, nodi, nodo, nodr, nodsp;
 
 	i = n->left;
 	if(i->op != ODOTINTER)
@@ -122,21 +160,34 @@ cgen_callinter(Node *n, Node *res, int proc)
 
 	genlist(n->list);		// assign the args
 
-	regalloc(&nodr, types[tptr], res);
-	regalloc(&nodo, types[tptr], &nodr);
-	nodo.op = OINDREG;
-
-	agen(i, &nodr);         // REG = &inter
+	// i is now addable, prepare an indirected
+	// register to hold its address.
+	igen(i, &nodi, res);		// REG = &inter
 
 	nodindreg(&nodsp, types[tptr], D_SP);
-	nodo.xoffset += widthptr;
-	cgen(&nodo, &nodsp);	// 0(SP) = 8(REG) -- i.data
-
-	nodo.xoffset -= widthptr;
-	cgen(&nodo, &nodr);	// REG = 0(REG) -- i.tab
-
+	nodi.type = types[tptr];
+	nodi.xoffset += widthptr;
+	cgen(&nodi, &nodsp);	// 0(SP) = 8(REG) -- i.data
+
+	regalloc(&nodo, types[tptr], res);
+	nodi.type = types[tptr];
+	nodi.xoffset -= widthptr;
+	cgen(&nodi, &nodo);	// REG = 0(REG) -- i.tab
+	regfree(&nodi);
+
+	regalloc(&nodr, types[tptr], &nodo);
+	if(n->left->xoffset == BADWIDTH)
+		fatal("cgen_callinter: badwidth");
+	nodo.op = OINDREG;
 	nodo.xoffset = n->left->xoffset + 3*widthptr + 8;
-	cgen(&nodo, &nodr);	// REG = 32+offset(REG) -- i.tab->fun[f]
+	if(proc == 0) {
+		// plain call: use direct c function pointer - more efficient
+		cgen(&nodo, &nodr);	// REG = 32+offset(REG) -- i.tab->fun[f]
+		proc = 3;
+	} else {
+		// go/defer. generate go func value.
+		gins(ALEAQ, &nodo, &nodr);	// REG = &(32+offset(REG)) -- &i.tab->fun[f]
+	}
 
 	// BOTCH nodr.type = fntype;
 	nodr.type = n->left->type;
@@ -182,7 +233,7 @@ cgen_call(Node *n, int proc)
 		nod.type = t;
 		ginscall(&nod, proc);
 		regfree(&nod);
-		goto ret;
+		return;
 	}
 
 	// call pointer
@@ -192,16 +243,12 @@ cgen_call(Node *n, int proc)
 		nod.type = t;
 		ginscall(&nod, proc);
 		regfree(&nod);
-		goto ret;
+		return;
 	}
 
 	// call direct
 	n->left->method = 1;
 	ginscall(n->left, proc);
-
-
-ret:
-	;
 }
 
 /*
@@ -389,7 +436,9 @@ cgen_asop(Node *n)
 hard:
 	n2.op = 0;
 	n1.op = 0;
-	if(nr->ullman >= nl->ullman || nl->addable) {
+	if(nr->op == OLITERAL) {
+		// don't allocate a register for literals.
+	} else if(nr->ullman >= nl->ullman || nl->addable) {
 		regalloc(&n2, nr->type, N);
 		cgen(nr, &n2);
 		nr = &n2;
@@ -447,10 +496,10 @@ void
 dodiv(int op, Node *nl, Node *nr, Node *res)
 {
 	int a, check;
-	Node n3, n4, n5;
+	Node n3, n4;
 	Type *t, *t0;
 	Node ax, dx, ax1, n31, oldax, olddx;
-	Prog *p1, *p2, *p3;
+	Prog *p1, *p2;
 
 	// Have to be careful about handling
 	// most negative int divided by -1 correctly.
@@ -501,30 +550,22 @@ dodiv(int op, Node *nl, Node *nr, Node *res)
 		gmove(&n31, &n3);
 	}
 
-	p3 = P;
+	p2 = P;
 	if(check) {
 		nodconst(&n4, t, -1);
 		gins(optoas(OCMP, t), &n3, &n4);
-		p1 = gbranch(optoas(ONE, t), T);
-		nodconst(&n4, t, -1LL<<(t->width*8-1));
-		if(t->width == 8) {
-			n5 = n4;
-			regalloc(&n4, t, N);
-			gins(AMOVQ, &n5, &n4);
-		}
-		gins(optoas(OCMP, t), &ax, &n4);
-		p2 = gbranch(optoas(ONE, t), T);
-		if(op == ODIV)
-			gmove(&n4, res);
-		if(t->width == 8)
-			regfree(&n4);
-		if(op == OMOD) {
+		p1 = gbranch(optoas(ONE, t), T, +1);
+		if(op == ODIV) {
+			// a / (-1) is -a.
+			gins(optoas(OMINUS, t), N, &ax);
+			gmove(&ax, res);
+		} else {
+			// a % (-1) is 0.
 			nodconst(&n4, t, 0);
 			gmove(&n4, res);
 		}
-		p3 = gbranch(AJMP, T);
+		p2 = gbranch(AJMP, T, 0);
 		patch(p1, pc);
-		patch(p2, pc);
 	}
 	savex(D_DX, &dx, &olddx, res, t);
 	if(!issigned[t->etype]) {
@@ -540,7 +581,7 @@ dodiv(int op, Node *nl, Node *nr, Node *res)
 		gmove(&dx, res);
 	restx(&dx, &olddx);
 	if(check)
-		patch(p3, pc);
+		patch(p2, pc);
 	restx(&ax, &oldax);
 }
 
@@ -594,134 +635,21 @@ restx(Node *x, Node *oldx)
 void
 cgen_div(int op, Node *nl, Node *nr, Node *res)
 {
-	Node n1, n2, n3, savl, savr;
-	Node ax, dx, oldax, olddx;
-	int n, w, s, a;
+	Node n1, n2, n3;
+	int w, a;
 	Magic m;
 
-	if(nl->ullman >= UINF) {
-		tempname(&savl, nl->type);
-		cgen(nl, &savl);
-		nl = &savl;
-	}
-	if(nr->ullman >= UINF) {
-		tempname(&savr, nr->type);
-		cgen(nr, &savr);
-		nr = &savr;
-	}
-
 	if(nr->op != OLITERAL)
 		goto longdiv;
-
-	// special cases of mod/div
-	// by a constant
 	w = nl->type->width*8;
-	s = 0;
-	n = powtwo(nr);
-	if(n >= 1000) {
-		// negative power of 2
-		s = 1;
-		n -= 1000;
-	}
-
-	if(n+1 >= w) {
-		// just sign bit
-		goto longdiv;
-	}
 
-	if(n < 0)
-		goto divbymul;
-	switch(n) {
-	case 0:
-		// divide by 1
-		regalloc(&n1, nl->type, res);
-		cgen(nl, &n1);
-		if(op == OMOD) {
-			gins(optoas(OXOR, nl->type), &n1, &n1);
-		} else
-		if(s)
-			gins(optoas(OMINUS, nl->type), N, &n1);
-		gmove(&n1, res);
-		regfree(&n1);
-		return;
-	case 1:
-		// divide by 2
-		if(op == OMOD) {
-			if(issigned[nl->type->etype])
-				goto longmod;
-			regalloc(&n1, nl->type, res);
-			cgen(nl, &n1);
-			nodconst(&n2, nl->type, 1);
-			gins(optoas(OAND, nl->type), &n2, &n1);
-			gmove(&n1, res);
-			regfree(&n1);
-			return;
-		}
-		regalloc(&n1, nl->type, res);
-		cgen(nl, &n1);
-		if(!issigned[nl->type->etype])
-			break;
-
-		// develop -1 iff nl is negative
-		regalloc(&n2, nl->type, N);
-		gmove(&n1, &n2);
-		nodconst(&n3, nl->type, w-1);
-		gins(optoas(ORSH, nl->type), &n3, &n2);
-		gins(optoas(OSUB, nl->type), &n2, &n1);
-		regfree(&n2);
-		break;
-	default:
-		if(op == OMOD) {
-			if(issigned[nl->type->etype])
-				goto longmod;
-			regalloc(&n1, nl->type, res);
-			cgen(nl, &n1);
-			nodconst(&n2, nl->type, mpgetfix(nr->val.u.xval)-1);
-			if(!smallintconst(&n2)) {
-				regalloc(&n3, nl->type, N);
-				gmove(&n2, &n3);
-				gins(optoas(OAND, nl->type), &n3, &n1);
-				regfree(&n3);
-			} else
-				gins(optoas(OAND, nl->type), &n2, &n1);
-			gmove(&n1, res);
-			regfree(&n1);
-			return;
-		}
-		regalloc(&n1, nl->type, res);
-		cgen(nl, &n1);
-		if(!issigned[nl->type->etype])
-			break;
-
-		// develop (2^k)-1 iff nl is negative
-		regalloc(&n2, nl->type, N);
-		gmove(&n1, &n2);
-		nodconst(&n3, nl->type, w-1);
-		gins(optoas(ORSH, nl->type), &n3, &n2);
-		nodconst(&n3, nl->type, w-n);
-		gins(optoas(ORSH, tounsigned(nl->type)), &n3, &n2);
-		gins(optoas(OADD, nl->type), &n2, &n1);
-		regfree(&n2);
-		break;
-	}
-	nodconst(&n2, nl->type, n);
-	gins(optoas(ORSH, nl->type), &n2, &n1);
-	if(s)
-		gins(optoas(OMINUS, nl->type), N, &n1);
-	gmove(&n1, res);
-	regfree(&n1);
-	return;
-
-divbymul:
+	// Front end handled 32-bit division. We only need to handle 64-bit.
 	// try to do division by multiply by (2^w)/d
 	// see hacker's delight chapter 10
 	switch(simtype[nl->type->etype]) {
 	default:
 		goto longdiv;
 
-	case TUINT8:
-	case TUINT16:
-	case TUINT32:
 	case TUINT64:
 		m.w = w;
 		m.ud = mpgetfix(nr->val.u.xval);
@@ -731,47 +659,28 @@ divbymul:
 		if(op == OMOD)
 			goto longmod;
 
-		regalloc(&n1, nl->type, N);
-		cgen(nl, &n1);				// num -> reg(n1)
-
-		savex(D_AX, &ax, &oldax, res, nl->type);
-		savex(D_DX, &dx, &olddx, res, nl->type);
-
+		cgenr(nl, &n1, N);
 		nodconst(&n2, nl->type, m.um);
-		gmove(&n2, &ax);			// const->ax
-
-		gins(optoas(OHMUL, nl->type), &n1, N);	// imul reg
-		if(w == 8) {
-			// fix up 8-bit multiply
-			Node ah, dl;
-			nodreg(&ah, types[TUINT8], D_AH);
-			nodreg(&dl, types[TUINT8], D_DL);
-			gins(AMOVB, &ah, &dl);
-		}
+		regalloc(&n3, nl->type, res);
+		cgen_hmul(&n1, &n2, &n3);
 
 		if(m.ua) {
 			// need to add numerator accounting for overflow
-			gins(optoas(OADD, nl->type), &n1, &dx);
+			gins(optoas(OADD, nl->type), &n1, &n3);
 			nodconst(&n2, nl->type, 1);
-			gins(optoas(ORRC, nl->type), &n2, &dx);
+			gins(optoas(ORROTC, nl->type), &n2, &n3);
 			nodconst(&n2, nl->type, m.s-1);
-			gins(optoas(ORSH, nl->type), &n2, &dx);
+			gins(optoas(ORSH, nl->type), &n2, &n3);
 		} else {
 			nodconst(&n2, nl->type, m.s);
-			gins(optoas(ORSH, nl->type), &n2, &dx);	// shift dx
+		gins(optoas(ORSH, nl->type), &n2, &n3);	// shift n3
 		}
 
-
+		gmove(&n3, res);
 		regfree(&n1);
-		gmove(&dx, res);
-
-		restx(&ax, &oldax);
-		restx(&dx, &olddx);
+		regfree(&n3);
 		return;
 
-	case TINT8:
-	case TINT16:
-	case TINT32:
 	case TINT64:
 		m.w = w;
 		m.sd = mpgetfix(nr->val.u.xval);
@@ -781,47 +690,32 @@ divbymul:
 		if(op == OMOD)
 			goto longmod;
 
-		regalloc(&n1, nl->type, N);
-		cgen(nl, &n1);				// num -> reg(n1)
-
-		savex(D_AX, &ax, &oldax, res, nl->type);
-		savex(D_DX, &dx, &olddx, res, nl->type);
-
+		cgenr(nl, &n1, res);
 		nodconst(&n2, nl->type, m.sm);
-		gmove(&n2, &ax);			// const->ax
-
-		gins(optoas(OHMUL, nl->type), &n1, N);	// imul reg
-		if(w == 8) {
-			// fix up 8-bit multiply
-			Node ah, dl;
-			nodreg(&ah, types[TUINT8], D_AH);
-			nodreg(&dl, types[TUINT8], D_DL);
-			gins(AMOVB, &ah, &dl);
-		}
+		regalloc(&n3, nl->type, N);
+		cgen_hmul(&n1, &n2, &n3);
 
 		if(m.sm < 0) {
 			// need to add numerator
-			gins(optoas(OADD, nl->type), &n1, &dx);
+			gins(optoas(OADD, nl->type), &n1, &n3);
 		}
 
 		nodconst(&n2, nl->type, m.s);
-		gins(optoas(ORSH, nl->type), &n2, &dx);	// shift dx
+		gins(optoas(ORSH, nl->type), &n2, &n3);	// shift n3
 
 		nodconst(&n2, nl->type, w-1);
 		gins(optoas(ORSH, nl->type), &n2, &n1);	// -1 iff num is neg
-		gins(optoas(OSUB, nl->type), &n1, &dx);	// added
+		gins(optoas(OSUB, nl->type), &n1, &n3);	// added
 
 		if(m.sd < 0) {
 			// this could probably be removed
 			// by factoring it into the multiplier
-			gins(optoas(OMINUS, nl->type), N, &dx);
+			gins(optoas(OMINUS, nl->type), N, &n3);
 		}
 
+		gmove(&n3, res);
 		regfree(&n1);
-		gmove(&dx, res);
-
-		restx(&ax, &oldax);
-		restx(&dx, &olddx);
+		regfree(&n3);
 		return;
 	}
 	goto longdiv;
@@ -858,12 +752,48 @@ longmod:
 }
 
 /*
+ * generate high multiply:
+ *   res = (nl*nr) >> width
+ */
+void
+cgen_hmul(Node *nl, Node *nr, Node *res)
+{
+	Type *t;
+	int a;
+	Node n1, n2, ax, dx, *tmp;
+
+	t = nl->type;
+	a = optoas(OHMUL, t);
+	if(nl->ullman < nr->ullman) {
+		tmp = nl;
+		nl = nr;
+		nr = tmp;
+	}
+	cgenr(nl, &n1, res);
+	cgenr(nr, &n2, N);
+	nodreg(&ax, t, D_AX);
+	gmove(&n1, &ax);
+	gins(a, &n2, N);
+	regfree(&n2);
+	regfree(&n1);
+
+	if(t->width == 1) {
+		// byte multiply behaves differently.
+		nodreg(&ax, t, D_AH);
+		nodreg(&dx, t, D_DL);
+		gmove(&ax, &dx);
+	}
+	nodreg(&dx, t, D_DX);
+	gmove(&dx, res);
+}
+
+/*
  * generate shift according to op, one of:
  *	res = nl << nr
  *	res = nl >> nr
  */
 void
-cgen_shift(int op, Node *nl, Node *nr, Node *res)
+cgen_shift(int op, int bounded, Node *nl, Node *nr, Node *res)
 {
 	Node n1, n2, n3, n4, n5, cx, oldcx;
 	int a, rcx;
@@ -878,7 +808,7 @@ cgen_shift(int op, Node *nl, Node *nr, Node *res)
 		cgen(nl, &n1);
 		sc = mpgetfix(nr->val.u.xval);
 		if(sc >= nl->type->width*8) {
-			// large shift gets 2 shifts by width
+			// large shift gets 2 shifts by width-1
 			nodconst(&n3, types[TUINT32], nl->type->width*8-1);
 			gins(a, &n3, &n1);
 			gins(a, &n3, &n1);
@@ -937,17 +867,20 @@ cgen_shift(int op, Node *nl, Node *nr, Node *res)
 	regfree(&n3);
 
 	// test and fix up large shifts
-	nodconst(&n3, tcount, nl->type->width*8);
-	gins(optoas(OCMP, tcount), &n1, &n3);
-	p1 = gbranch(optoas(OLT, tcount), T);
-	if(op == ORSH && issigned[nl->type->etype]) {
-		nodconst(&n3, types[TUINT32], nl->type->width*8-1);
-		gins(a, &n3, &n2);
-	} else {
-		nodconst(&n3, nl->type, 0);
-		gmove(&n3, &n2);
+	if(!bounded) {
+		nodconst(&n3, tcount, nl->type->width*8);
+		gins(optoas(OCMP, tcount), &n1, &n3);
+		p1 = gbranch(optoas(OLT, tcount), T, +1);
+		if(op == ORSH && issigned[nl->type->etype]) {
+			nodconst(&n3, types[TUINT32], nl->type->width*8-1);
+			gins(a, &n3, &n2);
+		} else {
+			nodconst(&n3, nl->type, 0);
+			gmove(&n3, &n2);
+		}
+		patch(p1, pc);
 	}
-	patch(p1, pc);
+
 	gins(a, &n1, &n2);
 
 	if(oldcx.op != 0) {
@@ -968,46 +901,40 @@ ret:
 /*
  * generate byte multiply:
  *	res = nl * nr
- * no 2-operand byte multiply instruction so have to do
- * 16-bit multiply and take bottom half.
+ * there is no 2-operand byte multiply instruction so
+ * we do a full-width multiplication and truncate afterwards.
  */
 void
 cgen_bmul(int op, Node *nl, Node *nr, Node *res)
 {
-	Node n1b, n2b, n1w, n2w;
+	Node n1, n2, n1b, n2b, *tmp;
 	Type *t;
 	int a;
 
-	if(nl->ullman >= nr->ullman) {
-		regalloc(&n1b, nl->type, res);
-		cgen(nl, &n1b);
-		regalloc(&n2b, nr->type, N);
-		cgen(nr, &n2b);
-	} else {
-		regalloc(&n2b, nr->type, N);
-		cgen(nr, &n2b);
-		regalloc(&n1b, nl->type, res);
-		cgen(nl, &n1b);
+	// largest ullman on left.
+	if(nl->ullman < nr->ullman) {
+		tmp = nl;
+		nl = nr;
+		nr = tmp;
 	}
 
-	// copy from byte to short registers
-	t = types[TUINT16];
-	if(issigned[nl->type->etype])
-		t = types[TINT16];
-
-	regalloc(&n2w, t, &n2b);
-	cgen(&n2b, &n2w);
-
-	regalloc(&n1w, t, &n1b);
-	cgen(&n1b, &n1w);
+	// generate operands in "8-bit" registers.
+	regalloc(&n1b, nl->type, res);
+	cgen(nl, &n1b);
+	regalloc(&n2b, nr->type, N);
+	cgen(nr, &n2b);
 
+	// perform full-width multiplication.
+	t = types[TUINT64];
+	if(issigned[nl->type->etype])
+		t = types[TINT64];
+	nodreg(&n1, t, n1b.val.u.reg);
+	nodreg(&n2, t, n2b.val.u.reg);
 	a = optoas(op, t);
-	gins(a, &n2w, &n1w);
-	cgen(&n1w, &n1b);
-	cgen(&n1b, res);
+	gins(a, &n2, &n1);
 
-	regfree(&n1w);
-	regfree(&n2w);
+	// truncate.
+	gmove(&n1, res);
 	regfree(&n1b);
 	regfree(&n2b);
 }
@@ -1024,9 +951,9 @@ clearfat(Node *nl)
 
 
 	w = nl->type->width;
-	if(w == 16)
-		if(componentgen(N, nl))
-			return;
+	// Avoid taking the address for simple enough types.
+	if(componentgen(N, nl))
+		return;
 
 	c = w % 8;	// bytes
 	q = w / 8;	// quads
@@ -1060,366 +987,3 @@ clearfat(Node *nl)
 	restx(&n1, &oldn1);
 	restx(&ax, &oldax);
 }
-
-static int
-regcmp(const void *va, const void *vb)
-{
-	Node *ra, *rb;
-
-	ra = (Node*)va;
-	rb = (Node*)vb;
-	return ra->local - rb->local;
-}
-
-static	Prog*	throwpc;
-
-void
-getargs(NodeList *nn, Node *reg, int n)
-{
-	NodeList *l;
-	int i;
-
-	throwpc = nil;
-
-	l = nn;
-	for(i=0; i<n; i++) {
-		if(!smallintconst(l->n->right) && !isslice(l->n->right->type)) {
-			regalloc(reg+i, l->n->right->type, N);
-			cgen(l->n->right, reg+i);
-		} else
-			reg[i] = *l->n->right;
-		if(reg[i].local != 0)
-			yyerror("local used");
-		reg[i].local = l->n->left->xoffset;
-		l = l->next;
-	}
-	qsort((void*)reg, n, sizeof(*reg), regcmp);
-	for(i=0; i<n; i++)
-		reg[i].local = 0;
-}
-
-void
-cmpandthrow(Node *nl, Node *nr)
-{
-	vlong cl;
-	Prog *p1;
-	int op;
-	Node *c;
-	Type *t;
-	Node n1;
-	
-	if(nl->op == OCONV && is64(nl->type))
-		nl = nl->left;
-	if(nr->op == OCONV && is64(nr->type))
-		nr = nr->left;
-
-	op = OLE;
-	if(smallintconst(nl)) {
-		cl = mpgetfix(nl->val.u.xval);
-		if(cl == 0)
-			return;
-		if(smallintconst(nr))
-			return;
-		// put the constant on the right
-		op = brrev(op);
-		c = nl;
-		nl = nr;
-		nr = c;
-	}
-	if(is64(nr->type) && smallintconst(nr))
-		nr->type = types[TUINT32];
-
-	n1.op = OXXX;
-	t = types[TUINT32];
-	if(nl->type->width != t->width || nr->type->width != t->width) {
-		if((is64(nl->type) && nl->op != OLITERAL) || (is64(nr->type) && nr->op != OLITERAL))
-			t = types[TUINT64];
-
-		// Check if we need to use a temporary.
-		// At least one of the arguments is 32 bits
-		// (the len or cap) so one temporary suffices.
-		if(nl->type->width != t->width && nl->op != OLITERAL) {
-			regalloc(&n1, t, nl);
-			gmove(nl, &n1);
-			nl = &n1;
-		} else if(nr->type->width != t->width && nr->op != OLITERAL) {
-			regalloc(&n1, t, nr);
-			gmove(nr, &n1);
-			nr = &n1;
-		}
-	}
-	gins(optoas(OCMP, t), nl, nr);
-	if(n1.op != OXXX)
-		regfree(&n1);
-	if(throwpc == nil) {
-		p1 = gbranch(optoas(op, t), T);
-		throwpc = pc;
-		ginscall(panicslice, 0);
-		patch(p1, pc);
-	} else {
-		op = brcom(op);
-		p1 = gbranch(optoas(op, t), T);
-		patch(p1, throwpc);
-	}
-}
-
-int
-sleasy(Node *n)
-{
-	if(n->op != ONAME)
-		return 0;
-	if(!n->addable)
-		return 0;
-	return 1;
-}
-
-// generate inline code for
-//	slicearray
-//	sliceslice
-//	arraytoslice
-int
-cgen_inline(Node *n, Node *res)
-{
-	Node nodes[5];
-	Node n1, n2, nres, ntemp;
-	vlong v;
-	int i, narg, nochk;
-
-	if(n->op != OCALLFUNC)
-		goto no;
-	if(!n->left->addable)
-		goto no;
-	if(n->left->sym == S)
-		goto no;
-	if(n->left->sym->pkg != runtimepkg)
-		goto no;
-	if(strcmp(n->left->sym->name, "slicearray") == 0)
-		goto slicearray;
-	if(strcmp(n->left->sym->name, "sliceslice") == 0) {
-		narg = 4;
-		goto sliceslice;
-	}
-	if(strcmp(n->left->sym->name, "sliceslice1") == 0) {
-		narg = 3;
-		goto sliceslice;
-	}
-	goto no;
-
-slicearray:
-	if(!sleasy(res))
-		goto no;
-	getargs(n->list, nodes, 5);
-
-	// if(hb[3] > nel[1]) goto throw
-	cmpandthrow(&nodes[3], &nodes[1]);
-
-	// if(lb[2] > hb[3]) goto throw
-	cmpandthrow(&nodes[2], &nodes[3]);
-
-	// len = hb[3] - lb[2] (destroys hb)
-	n2 = *res;
-	n2.xoffset += Array_nel;
-	n2.type = types[TUINT32];
-
-	if(smallintconst(&nodes[3]) && smallintconst(&nodes[2])) {
-		v = mpgetfix(nodes[3].val.u.xval) -
-			mpgetfix(nodes[2].val.u.xval);
-		nodconst(&n1, types[TUINT32], v);
-		gins(optoas(OAS, types[TUINT32]), &n1, &n2);
-	} else {
-		regalloc(&n1, types[TUINT32], &nodes[3]);
-		gmove(&nodes[3], &n1);
-		if(!smallintconst(&nodes[2]) || mpgetfix(nodes[2].val.u.xval) != 0)
-			gins(optoas(OSUB, types[TUINT32]), &nodes[2], &n1);
-		gins(optoas(OAS, types[TUINT32]), &n1, &n2);
-		regfree(&n1);
-	}
-
-	// cap = nel[1] - lb[2] (destroys nel)
-	n2 = *res;
-	n2.xoffset += Array_cap;
-	n2.type = types[TUINT32];
-
-	if(smallintconst(&nodes[1]) && smallintconst(&nodes[2])) {
-		v = mpgetfix(nodes[1].val.u.xval) -
-			mpgetfix(nodes[2].val.u.xval);
-		nodconst(&n1, types[TUINT32], v);
-		gins(optoas(OAS, types[TUINT32]), &n1, &n2);
-	} else {
-		regalloc(&n1, types[TUINT32], &nodes[1]);
-		gmove(&nodes[1], &n1);
-		if(!smallintconst(&nodes[2]) || mpgetfix(nodes[2].val.u.xval) != 0)
-			gins(optoas(OSUB, types[TUINT32]), &nodes[2], &n1);
-		gins(optoas(OAS, types[TUINT32]), &n1, &n2);
-		regfree(&n1);
-	}
-
-	// if slice could be too big, dereference to
-	// catch nil array pointer.
-	if(nodes[0].op == OREGISTER && nodes[0].type->type->width >= unmappedzero) {
-		n2 = nodes[0];
-		n2.xoffset = 0;
-		n2.op = OINDREG;
-		n2.type = types[TUINT8];
-		gins(ATESTB, nodintconst(0), &n2);
-	}
-
-	// ary = old[0] + (lb[2] * width[4]) (destroys old)
-	n2 = *res;
-	n2.xoffset += Array_array;
-	n2.type = types[tptr];
-
-	if(smallintconst(&nodes[2]) && smallintconst(&nodes[4])) {
-		v = mpgetfix(nodes[2].val.u.xval) *
-			mpgetfix(nodes[4].val.u.xval);
-		if(v != 0)
-			ginscon(optoas(OADD, types[tptr]), v, &nodes[0]);
-	} else {
-		regalloc(&n1, types[tptr], &nodes[2]);
-		gmove(&nodes[2], &n1);
-		if(!smallintconst(&nodes[4]) || mpgetfix(nodes[4].val.u.xval) != 1)
-			gins(optoas(OMUL, types[tptr]), &nodes[4], &n1);
-		gins(optoas(OADD, types[tptr]), &n1, &nodes[0]);
-		regfree(&n1);
-	}
-	gins(optoas(OAS, types[tptr]), &nodes[0], &n2);
-
-	for(i=0; i<5; i++) {
-		if(nodes[i].op == OREGISTER)
-			regfree(&nodes[i]);
-	}
-	return 1;
-
-sliceslice:
-	nochk = n->etype;  // skip bounds checking
-	ntemp.op = OXXX;
-	if(!sleasy(n->list->n->right)) {
-		Node *n0;
-		
-		n0 = n->list->n->right;
-		tempname(&ntemp, res->type);
-		cgen(n0, &ntemp);
-		n->list->n->right = &ntemp;
-		getargs(n->list, nodes, narg);
-		n->list->n->right = n0;
-	} else
-		getargs(n->list, nodes, narg);
-
-	nres = *res;		// result
-	if(!sleasy(res)) {
-		if(ntemp.op == OXXX)
-			tempname(&ntemp, res->type);
-		nres = ntemp;
-	}
-
-	if(narg == 3) {	// old[lb:]
-		// move width to where it would be for old[lb:hb]
-		nodes[3] = nodes[2];
-		nodes[2].op = OXXX;
-		
-		// if(lb[1] > old.nel[0]) goto throw;
-		n2 = nodes[0];
-		n2.xoffset += Array_nel;
-		n2.type = types[TUINT32];
-		if(!nochk)
-			cmpandthrow(&nodes[1], &n2);
-
-		// ret.nel = old.nel[0]-lb[1];
-		n2 = nodes[0];
-		n2.xoffset += Array_nel;
-		n2.type = types[TUINT32];
-	
-		regalloc(&n1, types[TUINT32], N);
-		gins(optoas(OAS, types[TUINT32]), &n2, &n1);
-		if(!smallintconst(&nodes[1]) || mpgetfix(nodes[1].val.u.xval) != 0)
-			gins(optoas(OSUB, types[TUINT32]), &nodes[1], &n1);
-	
-		n2 = nres;
-		n2.xoffset += Array_nel;
-		n2.type = types[TUINT32];
-		gins(optoas(OAS, types[TUINT32]), &n1, &n2);
-		regfree(&n1);
-	} else {	// old[lb:hb]
-		n2 = nodes[0];
-		n2.xoffset += Array_cap;
-		n2.type = types[TUINT32];
-		if(!nochk) {
-			// if(hb[2] > old.cap[0]) goto throw;
-			cmpandthrow(&nodes[2], &n2);
-			// if(lb[1] > hb[2]) goto throw;
-			cmpandthrow(&nodes[1], &nodes[2]);
-		}
-		// ret.len = hb[2]-lb[1]; (destroys hb[2])
-		n2 = nres;
-		n2.xoffset += Array_nel;
-		n2.type = types[TUINT32];
-
-		if(smallintconst(&nodes[2]) && smallintconst(&nodes[1])) {
-			v = mpgetfix(nodes[2].val.u.xval) -
-				mpgetfix(nodes[1].val.u.xval);
-			nodconst(&n1, types[TUINT32], v);
-			gins(optoas(OAS, types[TUINT32]), &n1, &n2);
-		} else {
-			regalloc(&n1, types[TUINT32], &nodes[2]);
-			gmove(&nodes[2], &n1);
-			if(!smallintconst(&nodes[1]) || mpgetfix(nodes[1].val.u.xval) != 0)
-				gins(optoas(OSUB, types[TUINT32]), &nodes[1], &n1);
-			gins(optoas(OAS, types[TUINT32]), &n1, &n2);
-			regfree(&n1);
-		}
-	}
-
-	// ret.cap = old.cap[0]-lb[1]; (uses hb[2])
-	n2 = nodes[0];
-	n2.xoffset += Array_cap;
-	n2.type = types[TUINT32];
-
-	regalloc(&n1, types[TUINT32], &nodes[2]);
-	gins(optoas(OAS, types[TUINT32]), &n2, &n1);
-	if(!smallintconst(&nodes[1]) || mpgetfix(nodes[1].val.u.xval) != 0)
-		gins(optoas(OSUB, types[TUINT32]), &nodes[1], &n1);
-
-	n2 = nres;
-	n2.xoffset += Array_cap;
-	n2.type = types[TUINT32];
-
-	gins(optoas(OAS, types[TUINT32]), &n1, &n2);
-	regfree(&n1);
-
-	// ret.array = old.array[0]+lb[1]*width[3]; (uses lb[1])
-	n2 = nodes[0];
-	n2.xoffset += Array_array;
-	n2.type = types[tptr];
-	regalloc(&n1, types[tptr], &nodes[1]);
-	if(smallintconst(&nodes[1]) && smallintconst(&nodes[3])) {
-		gins(optoas(OAS, types[tptr]), &n2, &n1);
-		v = mpgetfix(nodes[1].val.u.xval) *
-			mpgetfix(nodes[3].val.u.xval);
-		if(v != 0)
-			ginscon(optoas(OADD, types[tptr]), v, &n1);
-	} else {
-		gmove(&nodes[1], &n1);
-		if(!smallintconst(&nodes[3]) || mpgetfix(nodes[3].val.u.xval) != 1)
-			gins(optoas(OMUL, types[tptr]), &nodes[3], &n1);
-		gins(optoas(OADD, types[tptr]), &n2, &n1);
-	}
-
-	n2 = nres;
-	n2.xoffset += Array_array;
-	n2.type = types[tptr];
-	gins(optoas(OAS, types[tptr]), &n1, &n2);
-	regfree(&n1);
-
-	for(i=0; i<4; i++) {
-		if(nodes[i].op == OREGISTER)
-			regfree(&nodes[i]);
-	}
-
-	if(!sleasy(res)) {
-		cgen(&nres, res);
-	}
-	return 1;
-
-no:
-	return 0;
-}
diff --git a/src/cmd/6g/gobj.c b/src/cmd/6g/gobj.c
index 8c9208374..508a3548f 100644
--- a/src/cmd/6g/gobj.c
+++ b/src/cmd/6g/gobj.c
@@ -94,9 +94,9 @@ zaddr(Biobuf *b, Addr *a, int s, int gotype)
 	switch(a->type) {
 
 	case D_BRANCH:
-		if(a->branch == nil)
+		if(a->u.branch == nil)
 			fatal("unpatched branch");
-		a->offset = a->branch->loc;
+		a->offset = a->u.branch->loc;
 
 	default:
 		t |= T_TYPE;
@@ -139,7 +139,7 @@ zaddr(Biobuf *b, Addr *a, int s, int gotype)
 	if(t & T_SYM)		/* implies sym */
 		Bputc(b, s);
 	if(t & T_FCONST) {
-		ieeedtod(&e, a->dval);
+		ieeedtod(&e, a->u.dval);
 		l = e;
 		Bputc(b, l);
 		Bputc(b, l>>8);
@@ -153,7 +153,7 @@ zaddr(Biobuf *b, Addr *a, int s, int gotype)
 		return;
 	}
 	if(t & T_SCONST) {
-		n = a->sval;
+		n = a->u.sval;
 		for(i=0; i<NSNAME; i++) {
 			Bputc(b, *n);
 			n++;
@@ -295,7 +295,7 @@ dsname(Sym *s, int off, char *t, int n)
 	
 	p->to.type = D_SCONST;
 	p->to.index = D_NONE;
-	memmove(p->to.sval, t, n);
+	memmove(p->to.u.sval, t, n);
 	return off + n;
 }
 
@@ -312,8 +312,8 @@ datastring(char *s, int len, Addr *a)
 	a->type = D_EXTERN;
 	a->sym = sym;
 	a->node = sym->def;
-	a->offset = widthptr+4;  // skip header
-	a->etype = TINT32;
+	a->offset = widthptr+widthint;  // skip header
+	a->etype = simtype[TINT];
 }
 
 /*
@@ -324,7 +324,7 @@ void
 datagostring(Strlit *sval, Addr *a)
 {
 	Sym *sym;
-	
+
 	sym = stringsym(sval->s, sval->len);
 	a->type = D_EXTERN;
 	a->sym = sym;
@@ -364,13 +364,13 @@ gdatacomplex(Node *nam, Mpcplx *cval)
 	p = gins(ADATA, nam, N);
 	p->from.scale = w;
 	p->to.type = D_FCONST;
-	p->to.dval = mpgetflt(&cval->real);
+	p->to.u.dval = mpgetflt(&cval->real);
 
 	p = gins(ADATA, nam, N);
 	p->from.scale = w;
 	p->from.offset += w;
 	p->to.type = D_FCONST;
-	p->to.dval = mpgetflt(&cval->imag);
+	p->to.u.dval = mpgetflt(&cval->imag);
 }
 
 void
@@ -386,10 +386,10 @@ gdatastring(Node *nam, Strlit *sval)
 	p->to.type = D_ADDR;
 //print("%P\n", p);
 
-	nodconst(&nod1, types[TINT32], sval->len);
+	nodconst(&nod1, types[TINT], sval->len);
 	p = gins(ADATA, nam, &nod1);
-	p->from.scale = types[TINT32]->width;
-	p->from.offset += types[tptr]->width;
+	p->from.scale = widthint;
+	p->from.offset += widthptr;
 }
 
 int
@@ -408,7 +408,7 @@ dstringptr(Sym *s, int off, char *str)
 	datastring(str, strlen(str)+1, &p->to);
 	p->to.index = p->to.type;
 	p->to.type = D_ADDR;
-	p->to.etype = TINT32;
+	p->to.etype = simtype[TINT];
 	off += widthptr;
 
 	return off;
@@ -432,7 +432,7 @@ dgostrlitptr(Sym *s, int off, Strlit *lit)
 	datagostring(lit, &p->to);
 	p->to.index = p->to.type;
 	p->to.type = D_ADDR;
-	p->to.etype = TINT32;
+	p->to.etype = simtype[TINT];
 	off += widthptr;
 
 	return off;
diff --git a/src/cmd/6g/gsubr.c b/src/cmd/6g/gsubr.c
index ededcf673..fc5407a1f 100644
--- a/src/cmd/6g/gsubr.c
+++ b/src/cmd/6g/gsubr.c
@@ -103,9 +103,13 @@ dumpdata(void)
 /*
  * generate a branch.
  * t is ignored.
+ * likely values are for branch prediction:
+ *	-1 unlikely
+ *	0 no opinion
+ *	+1 likely
  */
 Prog*
-gbranch(int as, Type *t)
+gbranch(int as, Type *t, int likely)
 {
 	Prog *p;
 	
@@ -113,7 +117,11 @@ gbranch(int as, Type *t)
 
 	p = prog(as);
 	p->to.type = D_BRANCH;
-	p->to.branch = P;
+	p->to.u.branch = P;
+	if(as != AJMP && likely != 0) {
+		p->from.type = D_CONST;
+		p->from.offset = likely > 0;
+	}
 	return p;
 }
 
@@ -125,7 +133,7 @@ patch(Prog *p, Prog *to)
 {
 	if(p->to.type != D_BRANCH)
 		fatal("patch: not a branch");
-	p->to.branch = to;
+	p->to.u.branch = to;
 	p->to.offset = to->loc;
 }
 
@@ -136,8 +144,8 @@ unpatch(Prog *p)
 
 	if(p->to.type != D_BRANCH)
 		fatal("unpatch: not a branch");
-	q = p->to.branch;
-	p->to.branch = P;
+	q = p->to.u.branch;
+	p->to.u.branch = P;
 	p->to.offset = 0;
 	return q;
 }
@@ -165,44 +173,6 @@ newplist(void)
 }
 
 void
-clearstk(void)
-{
-	Plist *pl;
-	Prog *p1, *p2;
-	Node sp, di, cx, con, ax;
-
-	if((uint32)plast->firstpc->to.offset <= 0)
-		return;
-
-	// reestablish context for inserting code
-	// at beginning of function.
-	pl = plast;
-	p1 = pl->firstpc;
-	p2 = p1->link;
-	pc = mal(sizeof(*pc));
-	clearp(pc);
-	p1->link = pc;
-	
-	// zero stack frame
-	nodreg(&sp, types[tptr], D_SP);
-	nodreg(&di, types[tptr], D_DI);
-	nodreg(&cx, types[TUINT64], D_CX);
-	nodconst(&con, types[TUINT64], (uint32)p1->to.offset / widthptr);
-	gins(ACLD, N, N);
-	gins(AMOVQ, &sp, &di);
-	gins(AMOVQ, &con, &cx);
-	nodconst(&con, types[TUINT64], 0);
-	nodreg(&ax, types[TUINT64], D_AX);
-	gins(AMOVQ, &con, &ax);
-	gins(AREP, N, N);
-	gins(ASTOSQ, N, N);
-
-	// continue with original code.
-	gins(ANOP, N, N)->link = p2;
-	pc = P;
-}	
-
-void
 gused(Node *n)
 {
 	gins(ANOP, n, N);	// used
@@ -213,22 +183,23 @@ gjmp(Prog *to)
 {
 	Prog *p;
 
-	p = gbranch(AJMP, T);
+	p = gbranch(AJMP, T, 0);
 	if(to != P)
 		patch(p, to);
 	return p;
 }
 
 void
-ggloblnod(Node *nam, int32 width)
+ggloblnod(Node *nam)
 {
 	Prog *p;
 
 	p = gins(AGLOBL, nam, N);
 	p->lineno = nam->lineno;
+	p->from.gotype = ngotype(nam);
 	p->to.sym = S;
 	p->to.type = D_CONST;
-	p->to.offset = width;
+	p->to.offset = nam->type->width;
 	if(nam->readonly)
 		p->from.scale = RODATA;
 	if(nam->type != T && !haspointers(nam->type))
@@ -236,7 +207,18 @@ ggloblnod(Node *nam, int32 width)
 }
 
 void
-ggloblsym(Sym *s, int32 width, int dupok)
+gtrack(Sym *s)
+{
+	Prog *p;
+	
+	p = gins(AUSEFIELD, N, N);
+	p->from.type = D_EXTERN;
+	p->from.index = D_NONE;
+	p->from.sym = s;
+}
+
+void
+ggloblsym(Sym *s, int32 width, int dupok, int rodata)
 {
 	Prog *p;
 
@@ -248,8 +230,9 @@ ggloblsym(Sym *s, int32 width, int dupok)
 	p->to.index = D_NONE;
 	p->to.offset = width;
 	if(dupok)
-		p->from.scale = DUPOK;
-	p->from.scale |= RODATA;
+		p->from.scale |= DUPOK;
+	if(rodata)
+		p->from.scale |= RODATA;
 }
 
 int
@@ -272,11 +255,12 @@ isfat(Type *t)
  * call afunclit to fix up the argument.
  */
 void
-afunclit(Addr *a)
+afunclit(Addr *a, Node *n)
 {
 	if(a->type == D_ADDR && a->index == D_EXTERN) {
 		a->type = D_EXTERN;
 		a->index = D_NONE;
+		a->sym = n->sym;
 	}
 }
 
@@ -300,7 +284,7 @@ ginit(void)
 		reg[i] = 1;
 	for(i=D_AX; i<=D_R15; i++)
 		reg[i] = 0;
-	for(i=D_X0; i<=D_X7; i++)
+	for(i=D_X0; i<=D_X15; i++)
 		reg[i] = 0;
 
 	for(i=0; i<nelem(resvd); i++)
@@ -318,7 +302,7 @@ gclean(void)
 	for(i=D_AX; i<=D_R15; i++)
 		if(reg[i])
 			yyerror("reg %R left allocated\n", i);
-	for(i=D_X0; i<=D_X7; i++)
+	for(i=D_X0; i<=D_X15; i++)
 		if(reg[i])
 			yyerror("reg %R left allocated\n", i);
 }
@@ -388,10 +372,10 @@ regalloc(Node *n, Type *t, Node *o)
 	case TFLOAT64:
 		if(o != N && o->op == OREGISTER) {
 			i = o->val.u.reg;
-			if(i >= D_X0 && i <= D_X7)
+			if(i >= D_X0 && i <= D_X15)
 				goto out;
 		}
-		for(i=D_X0; i<=D_X7; i++)
+		for(i=D_X0; i<=D_X15; i++)
 			if(reg[i] == 0)
 				goto out;
 		fatal("out of floating registers");
@@ -572,6 +556,10 @@ ismem(Node *n)
 	case ONAME:
 	case OPARAM:
 		return 1;
+	case OADDR:
+		if(flag_largemodel)
+			return 1;
+		break;
 	}
 	return 0;
 }
@@ -616,7 +604,7 @@ gmove(Node *f, Node *t)
 	Prog *p1, *p2;
 
 	if(debug['M'])
-		print("gmove %N -> %N\n", f, t);
+		print("gmove %lN -> %lN\n", f, t);
 
 	ft = simsimtype(f->type);
 	tt = simsimtype(t->type);
@@ -706,11 +694,14 @@ gmove(Node *f, Node *t)
 	case CASE(TINT32, TUINT32):
 	case CASE(TUINT32, TINT32):
 	case CASE(TUINT32, TUINT32):
+		a = AMOVL;
+		break;
+
 	case CASE(TINT64, TINT32):	// truncate
 	case CASE(TUINT64, TINT32):
 	case CASE(TINT64, TUINT32):
 	case CASE(TUINT64, TUINT32):
-		a = AMOVL;
+		a = AMOVQL;
 		break;
 
 	case CASE(TINT64, TINT64):	// same size
@@ -822,9 +813,9 @@ gmove(Node *f, Node *t)
 		// algorithm is:
 		//	if small enough, use native float64 -> int64 conversion.
 		//	otherwise, subtract 2^63, convert, and add it back.
-		a = ACVTSS2SQ;
+		a = ACVTTSS2SQ;
 		if(ft == TFLOAT64)
-			a = ACVTSD2SQ;
+			a = ACVTTSD2SQ;
 		bignodes();
 		regalloc(&r1, types[ft], N);
 		regalloc(&r2, types[tt], t);
@@ -832,9 +823,9 @@ gmove(Node *f, Node *t)
 		regalloc(&r4, types[tt], N);
 		gins(optoas(OAS, f->type), f, &r1);
 		gins(optoas(OCMP, f->type), &bigf, &r1);
-		p1 = gbranch(optoas(OLE, f->type), T);
+		p1 = gbranch(optoas(OLE, f->type), T, +1);
 		gins(a, &r1, &r2);
-		p2 = gbranch(AJMP, T);
+		p2 = gbranch(AJMP, T, 0);
 		patch(p1, pc);
 		gins(optoas(OAS, f->type), &bigf, &r3);
 		gins(optoas(OSUB, f->type), &r3, &r1);
@@ -903,9 +894,9 @@ gmove(Node *f, Node *t)
 		regalloc(&r4, f->type, N);
 		gmove(f, &r1);
 		gins(ACMPQ, &r1, &zero);
-		p1 = gbranch(AJLT, T);
+		p1 = gbranch(AJLT, T, +1);
 		gins(a, &r1, &r2);
-		p2 = gbranch(AJMP, T);
+		p2 = gbranch(AJMP, T, 0);
 		patch(p1, pc);
 		gmove(&r1, &r3);
 		gins(ASHRQ, &one, &r3);
@@ -1016,6 +1007,13 @@ gins(int as, Node *f, Node *t)
 	case AMOVSD:
 		if(f != N && t != N && samaddr(f, t))
 			return nil;
+		break;
+	
+	case ALEAQ:
+		if(f != N && isconst(f, CTNIL)) {
+			fatal("gins LEAQ nil %T", f->type);
+		}
+		break;
 	}
 
 	memset(&af, 0, sizeof af);
@@ -1047,13 +1045,34 @@ gins(int as, Node *f, Node *t)
 		w = 8;
 		break;
 	}
-	if(w != 0 && f != N && (af.width > w || at.width > w)) {
+	if(w != 0 && ((f != N && af.width < w) || (t != N && at.width > w))) {
+		dump("f", f);
+		dump("t", t);
 		fatal("bad width: %P (%d, %d)\n", p, af.width, at.width);
 	}
 
 	return p;
 }
 
+// Generate an instruction referencing *n
+// to force segv on nil pointer dereference.
+void
+checkref(Node *n)
+{
+	Node m;
+
+	if(n->type->type->width < unmappedzero)	// nothing is emitted when n->type->type (presumably the pointed-to type) is narrower than unmappedzero
+		return;
+
+	regalloc(&m, types[TUINTPTR], n);
+	cgen(n, &m);	// evaluate n into the scratch register m
+	m.xoffset = 0;
+	m.op = OINDREG;	// from here on m denotes memory at offset 0 from that register
+	m.type = types[TUINT8];	// ... accessed as a single byte
+	gins(ATESTB, nodintconst(0), &m);	// TESTB of constant 0 against that byte; only the memory reference matters
+	regfree(&m);
+}
+
 static void
 checkoffset(Addr *a, int canemitcode)
 {
@@ -1079,14 +1098,22 @@ checkoffset(Addr *a, int canemitcode)
 void
 naddr(Node *n, Addr *a, int canemitcode)
 {
+	Prog *p;
+
 	a->scale = 0;
 	a->index = D_NONE;
 	a->type = D_NONE;
 	a->gotype = S;
 	a->node = N;
+	a->width = 0;
 	if(n == N)
 		return;
 
+	if(n->type != T && n->type->etype != TIDEAL) {
+		dowidth(n->type);
+		a->width = n->type->width;
+	}
+
 	switch(n->op) {
 	default:
 		fatal("naddr: bad %O %D", n->op, a);
@@ -1134,15 +1161,27 @@ naddr(Node *n, Addr *a, int canemitcode)
 		a->type = D_PARAM;
 		a->node = n->left->orig;
 		break;
+	
+	case OCLOSUREVAR:
+		if(!canemitcode)
+			fatal("naddr OCLOSUREVAR cannot emit code");
+		p = gins(AMOVQ, N, N);
+		p->from.type = D_DX+D_INDIR;
+		p->from.offset = n->xoffset;
+		p->to.type = D_BX;
+		a->type = D_BX;
+		a->sym = S;
+		break;
+	
+	case OCFUNC:
+		naddr(n->left, a, canemitcode);
+		a->sym = n->left->sym;
+		break;
 
 	case ONAME:
 		a->etype = 0;
-		a->width = 0;
-		if(n->type != T) {
+		if(n->type != T)
 			a->etype = simtype[n->type->etype];
-			a->width = n->type->width;
-			a->gotype = ngotype(n);
-		}
 		a->offset = n->xoffset;
 		a->sym = n->sym;
 		a->node = n->orig;
@@ -1173,6 +1212,8 @@ naddr(Node *n, Addr *a, int canemitcode)
 		case PFUNC:
 			a->index = D_EXTERN;
 			a->type = D_ADDR;
+			a->width = widthptr;
+			a->sym = funcsym(a->sym);
 			break;
 		}
 		break;
@@ -1184,7 +1225,7 @@ naddr(Node *n, Addr *a, int canemitcode)
 			break;
 		case CTFLT:
 			a->type = D_FCONST;
-			a->dval = mpgetflt(n->val.u.fval);
+			a->u.dval = mpgetflt(n->val.u.fval);
 			break;
 		case CTINT:
 		case CTRUNE:
@@ -1210,6 +1251,7 @@ naddr(Node *n, Addr *a, int canemitcode)
 
 	case OADDR:
 		naddr(n->left, a, canemitcode);
+		a->width = widthptr;
 		if(a->type >= D_INDIR) {
 			a->type -= D_INDIR;
 			break;
@@ -1239,9 +1281,9 @@ naddr(Node *n, Addr *a, int canemitcode)
 		naddr(n->left, a, canemitcode);
 		if(a->type == D_CONST && a->offset == 0)
 			break;	// len(nil)
-		a->etype = TUINT32;
+		a->etype = simtype[TUINT];
 		a->offset += Array_nel;
-		a->width = 4;
+		a->width = widthint;
 		if(a->offset >= unmappedzero && a->offset-Array_nel < unmappedzero)
 			checkoffset(a, canemitcode);
 		break;
@@ -1251,9 +1293,9 @@ naddr(Node *n, Addr *a, int canemitcode)
 		naddr(n->left, a, canemitcode);
 		if(a->type == D_CONST && a->offset == 0)
 			break;	// cap(nil)
-		a->etype = TUINT32;
+		a->etype = simtype[TUINT];
 		a->offset += Array_cap;
-		a->width = 4;
+		a->width = widthint;
 		if(a->offset >= unmappedzero && a->offset-Array_cap < unmappedzero)
 			checkoffset(a, canemitcode);
 		break;
@@ -1645,6 +1687,28 @@ optoas(int op, Type *t)
 		a = AXORQ;
 		break;
 
+	case CASE(OLROT, TINT8):
+	case CASE(OLROT, TUINT8):
+		a = AROLB;
+		break;
+
+	case CASE(OLROT, TINT16):
+	case CASE(OLROT, TUINT16):
+		a = AROLW;
+		break;
+
+	case CASE(OLROT, TINT32):
+	case CASE(OLROT, TUINT32):
+	case CASE(OLROT, TPTR32):
+		a = AROLL;
+		break;
+
+	case CASE(OLROT, TINT64):
+	case CASE(OLROT, TUINT64):
+	case CASE(OLROT, TPTR64):
+		a = AROLQ;
+		break;
+
 	case CASE(OLSH, TINT8):
 	case CASE(OLSH, TUINT8):
 		a = ASHLB;
@@ -1701,23 +1765,23 @@ optoas(int op, Type *t)
 		a = ASARQ;
 		break;
 
-	case CASE(ORRC, TINT8):
-	case CASE(ORRC, TUINT8):
+	case CASE(ORROTC, TINT8):
+	case CASE(ORROTC, TUINT8):
 		a = ARCRB;
 		break;
 
-	case CASE(ORRC, TINT16):
-	case CASE(ORRC, TUINT16):
+	case CASE(ORROTC, TINT16):
+	case CASE(ORROTC, TUINT16):
 		a = ARCRW;
 		break;
 
-	case CASE(ORRC, TINT32):
-	case CASE(ORRC, TUINT32):
+	case CASE(ORROTC, TINT32):
+	case CASE(ORROTC, TUINT32):
 		a = ARCRL;
 		break;
 
-	case CASE(ORRC, TINT64):
-	case CASE(ORRC, TUINT64):
+	case CASE(ORROTC, TINT64):
+	case CASE(ORROTC, TUINT64):
 		a = ARCRQ;
 		break;
 
@@ -1919,6 +1983,9 @@ sudoaddable(int as, Node *n, Addr *a)
 		goto odot;
 
 	case OINDEX:
+		return 0;
+		// disabled: OINDEX case is now covered by agenr
+		// for a more suitable register allocation pattern.
 		if(n->left->type->etype == TSTRING)
 			return 0;
 		goto oindex;
@@ -2053,32 +2120,20 @@ oindex:
 	}
 
 	// check bounds
-	if(!debug['B'] && !n->etype) {
+	if(!debug['B'] && !n->bounded) {
 		// check bounds
 		n4.op = OXXX;
-		t = types[TUINT32];
+		t = types[simtype[TUINT]];
 		if(o & ODynam) {
 			if(o & OAddable) {
 				n2 = *l;
 				n2.xoffset += Array_nel;
-				n2.type = types[TUINT32];
-				if(is64(r->type)) {
-					t = types[TUINT64];
-					regalloc(&n4, t, N);
-					gmove(&n2, &n4);
-					n2 = n4;
-				}
+				n2.type = types[simtype[TUINT]];
 			} else {
 				n2 = *reg;
 				n2.xoffset = Array_nel;
 				n2.op = OINDREG;
-				n2.type = types[TUINT32];
-				if(is64(r->type)) {
-					t = types[TUINT64];
-					regalloc(&n4, t, N);
-					gmove(&n2, &n4);
-					n2 = n4;
-				}
+				n2.type = types[simtype[TUINT]];
 			}
 		} else {
 			if(is64(r->type))
@@ -2086,10 +2141,10 @@ oindex:
 			nodconst(&n2, types[TUINT64], l->type->bound);
 		}
 		gins(optoas(OCMP, t), reg1, &n2);
-		p1 = gbranch(optoas(OLT, t), T);
+		p1 = gbranch(optoas(OLT, t), T, +1);
 		if(n4.op != OXXX)
 			regfree(&n4);
-		ginscall(panicindex, 0);
+		ginscall(panicindex, -1);
 		patch(p1, pc);
 	}
 
@@ -2140,19 +2195,19 @@ oindex_const:
 	reg->op = OEMPTY;
 	reg1->op = OEMPTY;
 
-	regalloc(reg, types[tptr], N);
-	agen(l, reg);
-
 	if(o & ODynam) {
-		if(!debug['B'] && !n->etype) {
+		regalloc(reg, types[tptr], N);
+		agen(l, reg);
+	
+		if(!debug['B'] && !n->bounded) {
 			n1 = *reg;
 			n1.op = OINDREG;
 			n1.type = types[tptr];
 			n1.xoffset = Array_nel;
 			nodconst(&n2, types[TUINT64], v);
-			gins(optoas(OCMP, types[TUINT32]), &n1, &n2);
-			p1 = gbranch(optoas(OGT, types[TUINT32]), T);
-			ginscall(panicindex, 0);
+			gins(optoas(OCMP, types[simtype[TUINT]]), &n1, &n2);
+			p1 = gbranch(optoas(OGT, types[simtype[TUINT]]), T, +1);
+			ginscall(panicindex, -1);
 			patch(p1, pc);
 		}
 
@@ -2162,14 +2217,24 @@ oindex_const:
 		n1.xoffset = Array_array;
 		gmove(&n1, reg);
 
+		n2 = *reg;
+		n2.op = OINDREG;
+		n2.xoffset = v*w;
+		a->type = D_NONE;
+		a->index = D_NONE;
+		naddr(&n2, a, 1);
+		goto yes;
 	}
-
-	n2 = *reg;
-	n2.op = OINDREG;
-	n2.xoffset = v*w;
+	
+	igen(l, &n1, N);
+	if(n1.op == OINDREG) {
+		*reg = n1;
+		reg->op = OREGISTER;
+	}
+	n1.xoffset += v*w;
 	a->type = D_NONE;
-	a->index = D_NONE;
-	naddr(&n2, a, 1);
+	a->index= D_NONE;
+	naddr(&n1, a, 1);
 	goto yes;
 
 oindex_const_sudo:
@@ -2180,13 +2245,13 @@ oindex_const_sudo:
 	}
 
 	// slice indexed by a constant
-	if(!debug['B'] && !n->etype) {
+	if(!debug['B'] && !n->bounded) {
 		a->offset += Array_nel;
 		nodconst(&n2, types[TUINT64], v);
-		p1 = gins(optoas(OCMP, types[TUINT32]), N, &n2);
+		p1 = gins(optoas(OCMP, types[simtype[TUINT]]), N, &n2);
 		p1->from = *a;
-		p1 = gbranch(optoas(OGT, types[TUINT32]), T);
-		ginscall(panicindex, 0);
+		p1 = gbranch(optoas(OGT, types[simtype[TUINT]]), T, +1);
+		ginscall(panicindex, -1);
 		patch(p1, pc);
 		a->offset -= Array_nel;
 	}
diff --git a/src/cmd/6g/list.c b/src/cmd/6g/list.c
index ad63f7d29..9d27a6a09 100644
--- a/src/cmd/6g/list.c
+++ b/src/cmd/6g/list.c
@@ -107,10 +107,10 @@ Dconv(Fmt *fp)
 		break;
 
 	case D_BRANCH:
-		if(a->branch == nil)
+		if(a->u.branch == nil)
 			snprint(str, sizeof(str), "<nil>");
 		else
-			snprint(str, sizeof(str), "%d", a->branch->loc);
+			snprint(str, sizeof(str), "%d", a->u.branch->loc);
 		break;
 
 	case D_EXTERN:
@@ -140,11 +140,11 @@ Dconv(Fmt *fp)
 		break;
 
 	case D_FCONST:
-		snprint(str, sizeof(str), "$(%.17e)", a->dval);
+		snprint(str, sizeof(str), "$(%.17e)", a->u.dval);
 		break;
 
 	case D_SCONST:
-		snprint(str, sizeof(str), "$\"%Y\"", a->sval);
+		snprint(str, sizeof(str), "$\"%Y\"", a->u.sval);
 		break;
 
 	case D_ADDR:
@@ -161,7 +161,10 @@ brk:
 		strcat(str, s);
 	}
 conv:
-	return fmtstrcpy(fp, str);
+	fmtstrcpy(fp, str);
+	if(a->gotype)
+		fmtprint(fp, "{%s}", a->gotype->name);
+	return 0;
 }
 
 static	char*	regstr[] =
diff --git a/src/cmd/6g/opt.h b/src/cmd/6g/opt.h
index 9a8866b8d..9b0ea1b5a 100644
--- a/src/cmd/6g/opt.h
+++ b/src/cmd/6g/opt.h
@@ -34,8 +34,6 @@
 #define	D_HI	D_NONE
 #define	D_LO	D_NONE
 
-#define	isregtype(t)	((t)>= D_AX && (t)<=D_R15)
-
 #define	BLOAD(r)	band(bnot(r->refbehind), r->refahead)
 #define	BSTORE(r)	band(bnot(r->calbehind), r->calahead)
 #define	LOAD(r)		(~r->refbehind.b[z] & r->refahead.b[z])
@@ -49,12 +47,16 @@
 typedef	struct	Reg	Reg;
 typedef	struct	Rgn	Rgn;
 
+// A Reg is a wrapper around a single Prog (one instruction) that holds
+// register optimization information while the optimizer runs.
+// r->prog is the instruction.
+// r->prog->regp points back to r.
 struct	Reg
 {
 
-	Bits	set;
-	Bits	use1;
-	Bits	use2;
+	Bits	set;  		// variables written by this instruction.
+	Bits	use1; 		// variables read by prog->from.
+	Bits	use2; 		// variables read by prog->to.
 
 	Bits	refbehind;
 	Bits	refahead;
@@ -70,13 +72,13 @@ struct	Reg
 	uint16	loop;		// x5 for every loop
 	uchar	refset;		// diagnostic generated
 
-	Reg*	p1;
-	Reg*	p2;
+	Reg*	p1;     	// predecessors of this instruction: p1,
+	Reg*	p2;     	// and then p2 linked through p2link.
 	Reg*	p2link;
-	Reg*	s1;
+	Reg*	s1;     	// successors of this instruction (at most two: s1 and s2).
 	Reg*	s2;
-	Reg*	link;
-	Prog*	prog;
+	Reg*	link;   	// next instruction in function code
+	Prog*	prog;   	// actual instruction
 };
 #define	R	((Reg*)0)
 
diff --git a/src/cmd/6g/peep.c b/src/cmd/6g/peep.c
index 3710033b2..569655786 100644
--- a/src/cmd/6g/peep.c
+++ b/src/cmd/6g/peep.c
@@ -34,6 +34,10 @@
 #include "opt.h"
 
 static void	conprop(Reg *r);
+static void elimshortmov(Reg *r);
+static int prevl(Reg *r, int reg);
+static void pushback(Reg *r);
+static int regconsttyp(Adr*);
 
 // do we need the carry bit
 static int
@@ -45,11 +49,17 @@ needc(Prog *p)
 		case AADCQ:
 		case ASBBL:
 		case ASBBQ:
+		case ARCRB:
+		case ARCRW:
 		case ARCRL:
 		case ARCRQ:
 			return 1;
+		case AADDB:
+		case AADDW:
 		case AADDL:
 		case AADDQ:
+		case ASUBB:
+		case ASUBW:
 		case ASUBL:
 		case ASUBQ:
 		case AJMP:
@@ -122,9 +132,14 @@ peep(void)
 		case AGLOBL:
 		case ANAME:
 		case ASIGNAME:
+		case ALOCALS:
+		case ATYPE:
 			p = p->link;
 		}
 	}
+	
+	// byte, word arithmetic elimination.
+	elimshortmov(r);
 
 	// constant propagation
 	// find MOV $con,R followed by
@@ -200,6 +215,7 @@ loop1:
 		case AMOVWQZX:
 		case AMOVLQSX:
 		case AMOVLQZX:
+		case AMOVQL:
 			if(regtyp(&p->to)) {
 				r1 = rnops(uniqs(r));
 				if(r1 != R) {
@@ -272,6 +288,115 @@ loop1:
 	}
 	if(t)
 		goto loop1;
+
+	// MOVLQZX removal.
+	// The MOVLQZX exists to avoid being confused for a
+	// MOVL that is just copying 32-bit data around during
+	// copyprop.  Now that copyprop is done, remove MOVLQZX R1, R2
+	// if it is dominated by an earlier ADDL/MOVL/etc into R1 that
+	// will have already cleared the high bits.
+	//
+	// MOVSD removal.
+	// We never use packed registers, so a MOVSD between registers
+	// can be replaced by MOVAPD, which moves the pair of float64s
+	// instead of just the lower one.  We only use the lower one, but
+	// the processor can do better if we do moves using both.
+	for(r=firstr; r!=R; r=r->link) {
+		p = r->prog;
+		if(p->as == AMOVLQZX)
+		if(regtyp(&p->from))
+		if(p->from.type == p->to.type)
+		if(prevl(r, p->from.type))
+			excise(r);
+		
+		if(p->as == AMOVSD)
+		if(regtyp(&p->from))
+		if(regtyp(&p->to))
+			p->as = AMOVAPD;
+	}
+
+	// load pipelining
+	// push any load from memory as early as possible
+	// to give it time to complete before use.
+	for(r=firstr; r!=R; r=r->link) {
+		p = r->prog;
+		switch(p->as) {
+		case AMOVB:
+		case AMOVW:
+		case AMOVL:
+		case AMOVQ:
+		case AMOVLQZX:
+			if(regtyp(&p->to) && !regconsttyp(&p->from))
+				pushback(r);
+		}
+	}
+}
+
+static void
+pushback(Reg *r0)	// try to move the instruction at r0 earlier, in front of the preceding instructions it does not interact with
+{
+	Reg *r, *b;
+	Prog *p0, *p, t;
+	
+	b = R;	// earliest Reg found so far that r0's instruction can be placed at; R if none
+	p0 = r0->prog;
+	for(r=uniqp(r0); r!=R && uniqs(r)!=R; r=uniqp(r)) {	// walk backwards along uniqp() links while uniqs(r) is non-nil
+		p = r->prog;
+		if(p->as != ANOP) {
+			if(!regconsttyp(&p->from) || !regtyp(&p->to))	// only step over reg/const-source, register-destination instructions
+				break;
+			if(copyu(p, &p0->to, A) || copyu(p0, &p->to, A))	// stop if copyu reports either instruction involving the other's destination
+				break;
+		}
+		if(p->as == ACALL)	// never move across a call
+			break;
+		b = r;
+	}
+	
+	if(b == R) {
+		if(debug['v']) {
+			print("no pushback: %P\n", r0->prog);
+			if(r)
+				print("\t%P [%d]\n", r->prog, uniqs(r)!=R);
+		}
+		return;
+	}
+
+	if(debug['v']) {
+		print("pushback\n");
+		for(r=b;; r=r->link) {
+			print("\t%P\n", r->prog);
+			if(r == r0)
+				break;
+		}
+	}
+
+	t = *r0->prog;	// save the instruction being moved
+	for(r=uniqp(r0);; r=uniqp(r)) {
+		p0 = r->link->prog;	// slide r's instruction one slot later, into r->link's Prog
+		p = r->prog;
+		p0->as = p->as;	// only as, lineno, from and to are copied; other Prog fields stay with the slot
+		p0->lineno = p->lineno;
+		p0->from = p->from;
+		p0->to = p->to;
+
+		if(r == b)
+			break;
+	}
+	p0 = r->prog;	// r == b here: the saved instruction goes into the vacated slot
+	p0->as = t.as;
+	p0->lineno = t.lineno;
+	p0->from = t.from;
+	p0->to = t.to;
+
+	if(debug['v']) {
+		print("\tafter\n");
+		for(r=b;; r=r->link) {
+			print("\t%P\n", r->prog);
+			if(r == r0)
+				break;
+		}
+	}
+}
 
 void
@@ -335,6 +460,155 @@ regtyp(Adr *a)
 	return 0;
 }
 
+// movb elimination.
+// movb is simulated by the linker
+// when a register other than ax, bx, cx, dx
+// is used, so rewrite to other instructions
+// when possible.  a movb into a register
+// can smash the entire 32-bit register without
+// causing any trouble.
+static void
+elimshortmov(Reg *r)
+{
+	Prog *p;
+
+	for(r=firstr; r!=R; r=r->link) {	// the r argument is overwritten: the scan always starts at firstr
+		p = r->prog;
+		if(regtyp(&p->to)) {
+			switch(p->as) {	// unary byte/word ops on a register destination become their Q forms
+			case AINCB:
+			case AINCW:
+				p->as = AINCQ;
+				break;
+			case ADECB:
+			case ADECW:
+				p->as = ADECQ;
+				break;
+			case ANEGB:
+			case ANEGW:
+				p->as = ANEGQ;
+				break;
+			case ANOTB:
+			case ANOTW:
+				p->as = ANOTQ;
+				break;
+			}
+			if(regtyp(&p->from) || p->from.type == D_CONST) {
+				// move or arithmetic into partial register.
+				// from another register or constant can be widened (the cases below use the Q forms).
+				// we don't switch to 64-bit arithmetic if it can
+				// change how the carry bit is set (and the carry bit is needed).
+				switch(p->as) {
+				case AMOVB:
+				case AMOVW:
+					p->as = AMOVQ;
+					break;
+				case AADDB:
+				case AADDW:
+					if(!needc(p->link))
+						p->as = AADDQ;
+					break;
+				case ASUBB:
+				case ASUBW:
+					if(!needc(p->link))
+						p->as = ASUBQ;
+					break;
+				case AMULB:
+				case AMULW:
+					p->as = AMULQ;
+					break;
+				case AIMULB:
+				case AIMULW:
+					p->as = AIMULQ;
+					break;
+				case AANDB:
+				case AANDW:
+					p->as = AANDQ;
+					break;
+				case AORB:
+				case AORW:
+					p->as = AORQ;
+					break;
+				case AXORB:
+				case AXORW:
+					p->as = AXORQ;
+					break;
+				case ASHLB:
+				case ASHLW:
+					p->as = ASHLQ;
+					break;
+				}
+			} else if(p->from.type >= D_NONE) {
+				// explicit zero extension, but don't
+				// do that if source is a byte register
+				// (only AH can occur and it's forbidden).
+				switch(p->as) {
+				case AMOVB:
+					p->as = AMOVBQZX;
+					break;
+				case AMOVW:
+					p->as = AMOVWQZX;
+					break;
+				}
+			}
+		}
+	}
+}
+
+static int
+regconsttyp(Adr *a)	// returns 1 if a is a register (per regtyp) or a constant/address operand, else 0
+{
+	if(regtyp(a))
+		return 1;
+	switch(a->type) {
+	case D_CONST:	// integer constant
+	case D_FCONST:	// floating-point constant
+	case D_SCONST:	// string constant
+	case D_ADDR:	// address-of operand
+		return 1;
+	}
+	return 0;
+}
+
+// is reg guaranteed to be truncated by a previous L instruction?
+static int
+prevl(Reg *r0, int reg)
+{
+	Prog *p;
+	Reg *r;
+
+	for(r=uniqp(r0); r!=R; r=uniqp(r)) {	// walk backwards from r0 along uniqp() links
+		p = r->prog;
+		if(p->to.type == reg) {	// nearest earlier instruction whose destination is reg decides the answer
+			switch(p->as) {
+			case AADDL:
+			case AANDL:
+			case ADECL:
+			case ADIVL:
+			case AIDIVL:
+			case AIMULL:
+			case AINCL:
+			case AMOVL:
+			case AMULL:
+			case AORL:
+			case ARCLL:
+			case ARCRL:
+			case AROLL:
+			case ARORL:
+			case ASALL:
+			case ASARL:
+			case ASHLL:
+			case ASHRL:
+			case ASUBL:
+			case AXORL:
+				return 1;
+			}
+			return 0;	// reg was last written by something other than the listed L instructions
+		}
+	}
+	return 0;	// ran out of uniqp() links without finding a write to reg
+}
+
 /*
  * the idea is to substitute
  * one register for another
@@ -357,19 +631,34 @@ subprop(Reg *r0)
 	Reg *r;
 	int t;
 
+	if(debug['P'] && debug['v'])
+		print("subprop %P\n", r0->prog);
 	p = r0->prog;
 	v1 = &p->from;
-	if(!regtyp(v1))
+	if(!regtyp(v1)) {
+		if(debug['P'] && debug['v'])
+			print("\tnot regtype %D; return 0\n", v1);
 		return 0;
+	}
 	v2 = &p->to;
-	if(!regtyp(v2))
+	if(!regtyp(v2)) {
+		if(debug['P'] && debug['v'])
+			print("\tnot regtype %D; return 0\n", v2);
 		return 0;
+	}
 	for(r=uniqp(r0); r!=R; r=uniqp(r)) {
-		if(uniqs(r) == R)
+		if(debug['P'] && debug['v'])
+			print("\t? %P\n", r->prog);
+		if(uniqs(r) == R) {
+			if(debug['P'] && debug['v'])
+				print("\tno unique successor\n");
 			break;
+		}
 		p = r->prog;
 		switch(p->as) {
 		case ACALL:
+			if(debug['P'] && debug['v'])
+				print("\tfound %P; return 0\n", p);
 			return 0;
 
 		case AIMULL:
@@ -377,20 +666,7 @@ subprop(Reg *r0)
 		case AIMULW:
 			if(p->to.type != D_NONE)
 				break;
-
-		case ADIVB:
-		case ADIVL:
-		case ADIVQ:
-		case ADIVW:
-		case AIDIVB:
-		case AIDIVL:
-		case AIDIVQ:
-		case AIDIVW:
-		case AIMULB:
-		case AMULB:
-		case AMULL:
-		case AMULQ:
-		case AMULW:
+			goto giveup;
 
 		case ARCLB:
 		case ARCLL:
@@ -424,6 +700,23 @@ subprop(Reg *r0)
 		case ASHRL:
 		case ASHRQ:
 		case ASHRW:
+			if(p->from.type == D_CONST)
+				break;
+			goto giveup;
+
+		case ADIVB:
+		case ADIVL:
+		case ADIVQ:
+		case ADIVW:
+		case AIDIVB:
+		case AIDIVL:
+		case AIDIVQ:
+		case AIDIVW:
+		case AIMULB:
+		case AMULB:
+		case AMULL:
+		case AMULQ:
+		case AMULW:
 
 		case AREP:
 		case AREPN:
@@ -438,21 +731,34 @@ subprop(Reg *r0)
 		case AMOVSB:
 		case AMOVSL:
 		case AMOVSQ:
+		giveup:
+			if(debug['P'] && debug['v'])
+				print("\tfound %P; return 0\n", p);
 			return 0;
 
 		case AMOVL:
 		case AMOVQ:
+		case AMOVSS:
+		case AMOVSD:
 			if(p->to.type == v1->type)
 				goto gotit;
 			break;
 		}
 		if(copyau(&p->from, v2) ||
-		   copyau(&p->to, v2))
+		   copyau(&p->to, v2)) {
+		   	if(debug['P'] && debug['v'])
+		   		print("\tcopyau %D failed\n", v2);
 			break;
+		}
 		if(copysub(&p->from, v1, v2, 0) ||
-		   copysub(&p->to, v1, v2, 0))
+		   copysub(&p->to, v1, v2, 0)) {
+		   	if(debug['P'] && debug['v'])
+		   		print("\tcopysub failed\n");
 			break;
+		}
 	}
+	if(debug['P'] && debug['v'])
+		print("\tran off end; return 0\n", p);
 	return 0;
 
 gotit:
@@ -497,6 +803,8 @@ copyprop(Reg *r0)
 	Adr *v1, *v2;
 	Reg *r;
 
+	if(debug['P'] && debug['v'])
+		print("copyprop %P\n", r0->prog);
 	p = r0->prog;
 	v1 = &p->from;
 	v2 = &p->to;
@@ -636,6 +944,7 @@ copyu(Prog *p, Adr *v, Adr *s)
 	case AMOVWLZX:
 	case AMOVWQSX:
 	case AMOVWQZX:
+	case AMOVQL:
 
 	case AMOVSS:
 	case AMOVSD:
@@ -853,8 +1162,6 @@ copyu(Prog *p, Adr *v, Adr *s)
 		return 0;
 
 	case ARET:	/* funny */
-		if(v->type == REGRET || v->type == FREGRET)
-			return 2;
 		if(s != A)
 			return 1;
 		return 3;
@@ -864,6 +1171,8 @@ copyu(Prog *p, Adr *v, Adr *s)
 			return 2;
 		if(REGARG >= 0 && v->type == (uchar)REGARG)
 			return 2;
+		if(v->type == p->from.type)
+			return 2;
 
 		if(s != A) {
 			if(copysub(&p->to, v, s, 1))
@@ -907,13 +1216,22 @@ int
 copyau(Adr *a, Adr *v)
 {
 
-	if(copyas(a, v))
+	if(copyas(a, v)) {
+		if(debug['P'] && debug['v'])
+			print("\tcopyau: copyas returned 1\n");
 		return 1;
+	}
 	if(regtyp(v)) {
-		if(a->type-D_INDIR == v->type)
+		if(a->type-D_INDIR == v->type) {
+			if(debug['P'] && debug['v'])
+				print("\tcopyau: found indir use - return 1\n");
 			return 1;
-		if(a->index == v->type)
+		}
+		if(a->index == v->type) {
+			if(debug['P'] && debug['v'])
+				print("\tcopyau: found index use - return 1\n");
 			return 1;
+		}
 	}
 	return 0;
 }
@@ -990,7 +1308,7 @@ loop:
 		if(p->from.node == p0->from.node)
 		if(p->from.offset == p0->from.offset)
 		if(p->from.scale == p0->from.scale)
-		if(p->from.dval == p0->from.dval)
+		if(p->from.u.vval == p0->from.u.vval)
 		if(p->from.index == p0->from.index) {
 			excise(r);
 			goto loop;
diff --git a/src/cmd/6g/reg.c b/src/cmd/6g/reg.c
index 049c63f17..c56d71678 100644
--- a/src/cmd/6g/reg.c
+++ b/src/cmd/6g/reg.c
@@ -151,6 +151,8 @@ static char* regname[] = {
 	".X15",
 };
 
+static Node* regnodes[NREGVAR];
+
 static void fixjmp(Prog*);
 
 void
@@ -164,7 +166,7 @@ regopt(Prog *firstp)
 
 	if(first) {
 		fmtinstall('Q', Qconv);
-		exregoffset = D_R13;	// R14,R15 are external
+		exregoffset = D_R15;
 		first = 0;
 	}
 
@@ -191,8 +193,11 @@ regopt(Prog *firstp)
 	 */
 	nvar = NREGVAR;
 	memset(var, 0, NREGVAR*sizeof var[0]);
-	for(i=0; i<NREGVAR; i++)
-		var[i].node = newname(lookup(regname[i]));
+	for(i=0; i<NREGVAR; i++) {
+		if(regnodes[i] == N)
+			regnodes[i] = newname(lookup(regname[i]));
+		var[i].node = regnodes[i];
+	}
 
 	regbits = RtoB(D_SP);
 	for(z=0; z<BITS; z++) {
@@ -219,6 +224,8 @@ regopt(Prog *firstp)
 		case AGLOBL:
 		case ANAME:
 		case ASIGNAME:
+		case ALOCALS:
+		case ATYPE:
 			continue;
 		}
 		r = rega();
@@ -247,6 +254,20 @@ regopt(Prog *firstp)
 			}
 		}
 
+		// Avoid making variables for direct-called functions.
+		if(p->as == ACALL && p->to.type == D_EXTERN)
+			continue;
+
+		// Addressing makes some registers used.
+		if(p->from.type >= D_INDIR)
+			r->use1.b[0] |= RtoB(p->from.type-D_INDIR);
+		if(p->from.index != D_NONE)
+			r->use1.b[0] |= RtoB(p->from.index);
+		if(p->to.type >= D_INDIR)
+			r->use2.b[0] |= RtoB(p->to.type-D_INDIR);
+		if(p->to.index != D_NONE)
+			r->use2.b[0] |= RtoB(p->to.index);
+
 		bit = mkvar(r, &p->from);
 		if(bany(&bit))
 		switch(p->as) {
@@ -326,6 +347,7 @@ regopt(Prog *firstp)
 		case AMOVWLZX:
 		case AMOVWQSX:
 		case AMOVWQZX:
+		case AMOVQL:
 		case APOPQ:
 
 		case AMOVSS:
@@ -578,8 +600,9 @@ regopt(Prog *firstp)
 				addrs.b[z] |= bit.b[z];
 		}
 
-//		print("bit=%2d addr=%d et=%-6E w=%-2d s=%S + %lld\n",
-//			i, v->addr, v->etype, v->width, v->sym, v->offset);
+		if(debug['R'] && debug['v'])
+			print("bit=%2d addr=%d et=%-6E w=%-2d s=%N + %lld\n",
+				i, v->addr, v->etype, v->width, v->node, v->offset);
 	}
 
 	if(debug['R'] && debug['v'])
@@ -593,9 +616,9 @@ regopt(Prog *firstp)
 	for(r=firstr; r!=R; r=r->link) {
 		p = r->prog;
 		if(p->to.type == D_BRANCH) {
-			if(p->to.branch == P)
+			if(p->to.u.branch == P)
 				fatal("pnil %P", p);
-			r1 = p->to.branch->reg;
+			r1 = p->to.u.branch->reg;
 			if(r1 == R)
 				fatal("rnil %P", p);
 			if(r1 == r) {
@@ -742,6 +765,9 @@ loop2:
 brk:
 	qsort(region, nregion, sizeof(region[0]), rcmp);
 
+	if(debug['R'] && debug['v'])
+		dumpit("pass5", firstr);
+
 	/*
 	 * pass 6
 	 * determine used registers (paint2)
@@ -752,8 +778,16 @@ brk:
 		bit = blsh(rgp->varno);
 		vreg = paint2(rgp->enter, rgp->varno);
 		vreg = allreg(vreg, rgp);
-		if(rgp->regno != 0)
+		if(rgp->regno != 0) {
+			if(debug['R'] && debug['v']) {
+				Var *v;
+
+				v = var + rgp->varno;
+				print("registerize %N+%d (bit=%2d et=%2E) in %R\n",
+						v->node, v->offset, rgp->varno, v->etype, rgp->regno);
+			}
 			paint3(rgp->enter, rgp->varno, vreg, rgp->regno);
+		}
 		rgp++;
 	}
 
@@ -776,8 +810,8 @@ brk:
 		while(p->link != P && p->link->as == ANOP)
 			p->link = p->link->link;
 		if(p->to.type == D_BRANCH)
-			while(p->to.branch != P && p->to.branch->as == ANOP)
-				p->to.branch = p->to.branch->link;
+			while(p->to.u.branch != P && p->to.u.branch->as == ANOP)
+				p->to.u.branch = p->to.u.branch->link;
 	}
 
 	if(lastr != R) {
@@ -838,7 +872,6 @@ addmove(Reg *r, int bn, int rn, int f)
 	a->offset = v->offset;
 	a->etype = v->etype;
 	a->type = v->name;
-	a->gotype = v->gotype;
 	a->node = v->node;
 	a->sym = v->node->sym;
 
@@ -847,7 +880,7 @@ addmove(Reg *r, int bn, int rn, int f)
 	p1->as = AMOVL;
 	switch(v->etype) {
 	default:
-		fatal("unknown type\n");
+		fatal("unknown type %E", v->etype);
 	case TINT8:
 	case TUINT8:
 	case TBOOL:
@@ -932,7 +965,8 @@ Bits
 mkvar(Reg *r, Adr *a)
 {
 	Var *v;
-	int i, t, n, et, z, w, flag;
+	int i, t, n, et, z, flag;
+	int64 w;
 	uint32 regu;
 	int32 o;
 	Bits bit;
@@ -984,6 +1018,8 @@ mkvar(Reg *r, Adr *a)
 	et = a->etype;
 	o = a->offset;
 	w = a->width;
+	if(w < 0)
+		fatal("bad width %lld for %D", w, a);
 
 	flag = 0;
 	for(i=0; i<nvar; i++) {
@@ -1019,14 +1055,14 @@ mkvar(Reg *r, Adr *a)
 	v = var+i;
 	v->offset = o;
 	v->name = n;
-	v->gotype = a->gotype;
 	v->etype = et;
 	v->width = w;
 	v->addr = flag;		// funny punning
 	v->node = node;
 
 	if(debug['R'])
-		print("bit=%2d et=%2d w=%d %#N %D\n", i, et, w, node, a);
+		print("bit=%2d et=%2E w=%d+%d %#N %D flag=%d\n", i, et, o, w, node, a, v->addr);
+
 	ostats.nvar++;
 
 	bit = blsh(i);
@@ -1089,8 +1125,12 @@ prop(Reg *r, Bits ref, Bits cal)
 		default:
 			// Work around for issue 1304:
 			// flush modified globals before each instruction.
-			for(z=0; z<BITS; z++)
+			for(z=0; z<BITS; z++) {
 				cal.b[z] |= externs.b[z];
+				// issue 4066: flush modified return variables in case of panic
+				if(hasdefer)
+					cal.b[z] |= ovar.b[z];
+			}
 			break;
 		}
 		for(z=0; z<BITS; z++) {
@@ -1576,7 +1616,7 @@ RtoB(int r)
 int
 BtoR(int32 b)
 {
-	b &= 0x3fffL;		// no R14 or R15
+	b &= 0xffffL;
 	if(b == 0)
 		return 0;
 	return bitno(b) + D_AX;
@@ -1584,26 +1624,26 @@ BtoR(int32 b)
 
 /*
  *	bit	reg
- *	16	X5 (FREGMIN)
+ *	16	X0
  *	...
- *	26	X15 (FREGEXT)
+ *	31	X15
  */
 int32
 FtoB(int f)
 {
-	if(f < FREGMIN || f > FREGEXT)
+	if(f < D_X0 || f > D_X15)
 		return 0;
-	return 1L << (f - FREGMIN + 16);
+	return 1L << (f - D_X0 + 16);
 }
 
 int
 BtoF(int32 b)
 {
 
-	b &= 0xFF0000L;
+	b &= 0xFFFF0000L;
 	if(b == 0)
 		return 0;
-	return bitno(b) - 16 + FREGMIN;
+	return bitno(b) - 16 + D_X0;
 }
 
 void
@@ -1719,7 +1759,7 @@ chasejmp(Prog *p, int *jmploop)
 			*jmploop = 1;
 			break;
 		}
-		p = p->to.branch;
+		p = p->to.u.branch;
 	}
 	return p;
 }
@@ -1741,9 +1781,9 @@ mark(Prog *firstp)
 		if(p->reg != dead)
 			break;
 		p->reg = alive;
-		if(p->as != ACALL && p->to.type == D_BRANCH && p->to.branch)
-			mark(p->to.branch);
-		if(p->as == AJMP || p->as == ARET || (p->as == ACALL && noreturn(p)))
+		if(p->as != ACALL && p->to.type == D_BRANCH && p->to.u.branch)
+			mark(p->to.u.branch);
+		if(p->as == AJMP || p->as == ARET || p->as == AUNDEF)
 			break;
 	}
 }
@@ -1762,8 +1802,8 @@ fixjmp(Prog *firstp)
 	for(p=firstp; p; p=p->link) {
 		if(debug['R'] && debug['v'])
 			print("%P\n", p);
-		if(p->as != ACALL && p->to.type == D_BRANCH && p->to.branch && p->to.branch->as == AJMP) {
-			p->to.branch = chasejmp(p->to.branch, &jmploop);
+		if(p->as != ACALL && p->to.type == D_BRANCH && p->to.u.branch && p->to.u.branch->as == AJMP) {
+			p->to.u.branch = chasejmp(p->to.u.branch, &jmploop);
 			if(debug['R'] && debug['v'])
 				print("->%P\n", p);
 		}
@@ -1799,7 +1839,7 @@ fixjmp(Prog *firstp)
 	if(!jmploop) {
 		last = nil;
 		for(p=firstp; p; p=p->link) {
-			if(p->as == AJMP && p->to.type == D_BRANCH && p->to.branch == p->link) {
+			if(p->as == AJMP && p->to.type == D_BRANCH && p->to.u.branch == p->link) {
 				if(debug['R'] && debug['v'])
 					print("del %P\n", p);
 				continue;