1 files changed, 407 insertions, 434 deletions
diff --git a/src/cmd/8g/ggen.c b/src/cmd/8g/ggen.c
index 6a4570199..70148106c 100644
--- a/src/cmd/8g/ggen.c
+++ b/src/cmd/8g/ggen.c
@@ -27,6 +27,9 @@ void
 markautoused(Prog* p)
 {
 	for (; p; p = p->link) {
+		if (p->as == ATYPE)
+			continue;
+
 		if (p->from.type == D_AUTO && p->from.node)
 			p->from.node->used = 1;
 
@@ -39,12 +42,21 @@ markautoused(Prog* p)
 void
 fixautoused(Prog* p)
 {
-	for (; p; p = p->link) {
+	Prog **lp;
+
+	for (lp=&p; (p=*lp) != P; ) {
+		if (p->as == ATYPE && p->from.node && p->from.type == D_AUTO && !p->from.node->used) {
+			*lp = p->link;
+			continue;
+		}
+
 		if (p->from.type == D_AUTO && p->from.node)
 			p->from.offset += p->from.node->stkdelta;
 
 		if (p->to.type == D_AUTO && p->to.node)
 			p->to.offset += p->to.node->stkdelta;
+
+		lp = &p->link;
 	}
 }
 
@@ -59,6 +71,10 @@ clearfat(Node *nl)
 		dump("\nclearfat", nl);
 
 	w = nl->type->width;
+	// Avoid taking the address for simple enough types.
+	if(componentgen(N, nl))
+		return;
+
 	c = w % 4;	// bytes
 	q = w / 4;	// quads
 
@@ -90,15 +106,17 @@ clearfat(Node *nl)
 /*
  * generate:
  *	call f
+ *	proc=-1	normal call but no return
  *	proc=0	normal call
  *	proc=1	goroutine run in new proc
  *	proc=2	defer call save away stack
+  *	proc=3	normal call to C pointer (not Go func value)
  */
 void
 ginscall(Node *f, int proc)
 {
 	Prog *p;
-	Node reg, con;
+	Node reg, r1, con;
 
 	switch(proc) {
 	default:
@@ -106,8 +124,25 @@ ginscall(Node *f, int proc)
 		break;
 
 	case 0:	// normal call
-		p = gins(ACALL, N, f);
-		afunclit(&p->to);
+	case -1:	// normal call but no return
+		if(f->op == ONAME && f->class == PFUNC) {
+			p = gins(ACALL, N, f);
+			afunclit(&p->to, f);
+			if(proc == -1 || noreturn(p))
+				gins(AUNDEF, N, N);
+			break;
+		}
+		nodreg(&reg, types[tptr], D_DX);
+		nodreg(&r1, types[tptr], D_BX);
+		gmove(f, &reg);
+		reg.op = OINDREG;
+		gmove(&reg, &r1);
+		reg.op = OREGISTER;
+		gins(ACALL, &reg, &r1);
+		break;
+	
+	case 3:	// normal call of c function pointer
+		gins(ACALL, N, f);
 		break;
 
 	case 1:	// call in new proc (go)
@@ -125,7 +160,7 @@ ginscall(Node *f, int proc)
 		if(proc == 2) {
 			nodreg(&reg, types[TINT64], D_AX);
 			gins(ATESTL, &reg, &reg);
-			patch(gbranch(AJNE, T), retpc);
+			patch(gbranch(AJNE, T, -1), retpc);
 		}
 		break;
 	}
@@ -139,7 +174,7 @@ void
 cgen_callinter(Node *n, Node *res, int proc)
 {
 	Node *i, *f;
-	Node tmpi, nodo, nodr, nodsp;
+	Node tmpi, nodi, nodo, nodr, nodsp;
 
 	i = n->left;
 	if(i->op != ODOTINTER)
@@ -159,25 +194,35 @@ cgen_callinter(Node *n, Node *res, int proc)
 
 	genlist(n->list);		// assign the args
 
-	// Can regalloc now; i is known to be addable,
-	// so the agen will be easy.
-	regalloc(&nodr, types[tptr], res);
-	regalloc(&nodo, types[tptr], &nodr);
-	nodo.op = OINDREG;
-
-	agen(i, &nodr);		// REG = &inter
+	// i is now addable, prepare an indirected
+	// register to hold its address.
+	igen(i, &nodi, res);		// REG = &inter
 
 	nodindreg(&nodsp, types[tptr], D_SP);
-	nodo.xoffset += widthptr;
-	cgen(&nodo, &nodsp);	// 0(SP) = 4(REG) -- i.data
+	nodi.type = types[tptr];
+	nodi.xoffset += widthptr;
+	cgen(&nodi, &nodsp);	// 0(SP) = 4(REG) -- i.data
 
-	nodo.xoffset -= widthptr;
-	cgen(&nodo, &nodr);	// REG = 0(REG) -- i.tab
+	regalloc(&nodo, types[tptr], res);
+	nodi.type = types[tptr];
+	nodi.xoffset -= widthptr;
+	cgen(&nodi, &nodo);	// REG = 0(REG) -- i.tab
+	regfree(&nodi);
 
+	regalloc(&nodr, types[tptr], &nodo);
 	if(n->left->xoffset == BADWIDTH)
 		fatal("cgen_callinter: badwidth");
+	nodo.op = OINDREG;
 	nodo.xoffset = n->left->xoffset + 3*widthptr + 8;
-	cgen(&nodo, &nodr);	// REG = 20+offset(REG) -- i.tab->fun[f]
+	
+	if(proc == 0) {
+		// plain call: use direct c function pointer - more efficient
+		cgen(&nodo, &nodr);	// REG = 20+offset(REG) -- i.tab->fun[f]
+		proc = 3;
+	} else {
+		// go/defer. generate go func value.
+		gins(ALEAL, &nodo, &nodr);	// REG = &(20+offset(REG)) -- i.tab->fun[f]
+	}
 
 	// BOTCH nodr.type = fntype;
 	nodr.type = n->left->type;
@@ -486,7 +531,7 @@ dodiv(int op, Node *nl, Node *nr, Node *res, Node *ax, Node *dx)
 	int check;
 	Node n1, t1, t2, t3, t4, n4, nz;
 	Type *t, *t0;
-	Prog *p1, *p2, *p3;
+	Prog *p1, *p2;
 
 	// Have to be careful about handling
 	// most negative int divided by -1 correctly.
@@ -535,23 +580,22 @@ dodiv(int op, Node *nl, Node *nr, Node *res, Node *ax, Node *dx)
 		regalloc(&n1, t, N);
 	gmove(&t2, &n1);
 	gmove(&t1, ax);
-	p3 = P;
+	p2 = P;
 	if(check) {
 		nodconst(&n4, t, -1);
 		gins(optoas(OCMP, t), &n1, &n4);
-		p1 = gbranch(optoas(ONE, t), T);
-		nodconst(&n4, t, -1LL<<(t->width*8-1));
-		gins(optoas(OCMP, t), ax, &n4);
-		p2 = gbranch(optoas(ONE, t), T);
-		if(op == ODIV)
-			gmove(&n4, res);
-		if(op == OMOD) {
+		p1 = gbranch(optoas(ONE, t), T, +1);
+		if(op == ODIV) {
+			// a / (-1) is -a.
+			gins(optoas(OMINUS, t), N, ax);
+			gmove(ax, res);
+		} else {
+			// a % (-1) is 0.
 			nodconst(&n4, t, 0);
 			gmove(&n4, res);
 		}
-		p3 = gbranch(AJMP, T);
+		p2 = gbranch(AJMP, T, 0);
 		patch(p1, pc);
-		patch(p2, pc);
 	}
 	if(!issigned[t->etype]) {
 		nodconst(&nz, t, 0);
@@ -566,7 +610,7 @@ dodiv(int op, Node *nl, Node *nr, Node *res, Node *ax, Node *dx)
 	else
 		gmove(dx, res);
 	if(check)
-		patch(p3, pc);
+		patch(p2, pc);
 }
 
 static void
@@ -630,7 +674,7 @@ cgen_div(int op, Node *nl, Node *nr, Node *res)
  *	res = nl >> nr
  */
 void
-cgen_shift(int op, Node *nl, Node *nr, Node *res)
+cgen_shift(int op, int bounded, Node *nl, Node *nr, Node *res)
 {
 	Node n1, n2, nt, cx, oldcx, hi, lo;
 	int a, w;
@@ -651,7 +695,7 @@ cgen_shift(int op, Node *nl, Node *nr, Node *res)
 		gmove(&n2, &n1);
 		sc = mpgetfix(nr->val.u.xval);
 		if(sc >= nl->type->width*8) {
-			// large shift gets 2 shifts by width
+			// large shift gets 2 shifts by width-1
 			gins(a, ncon(w-1), &n1);
 			gins(a, ncon(w-1), &n1);
 		} else
@@ -689,27 +733,39 @@ cgen_shift(int op, Node *nl, Node *nr, Node *res)
 	}
 
 	// test and fix up large shifts
-	if(nr->type->width > 4) {
-		// delayed reg alloc
-		nodreg(&n1, types[TUINT32], D_CX);
-		regalloc(&n1, types[TUINT32], &n1);		// to hold the shift type in CX
-		split64(&nt, &lo, &hi);
-		gmove(&lo, &n1);
-		gins(optoas(OCMP, types[TUINT32]), &hi, ncon(0));
-		p2 = gbranch(optoas(ONE, types[TUINT32]), T);
-		gins(optoas(OCMP, types[TUINT32]), &n1, ncon(w));
-		p1 = gbranch(optoas(OLT, types[TUINT32]), T);
-		patch(p2, pc);
-	} else {
-		gins(optoas(OCMP, nr->type), &n1, ncon(w));
-		p1 = gbranch(optoas(OLT, types[TUINT32]), T);
-	}
-	if(op == ORSH && issigned[nl->type->etype]) {
-		gins(a, ncon(w-1), &n2);
+	if(bounded) {
+		if(nr->type->width > 4) {
+			// delayed reg alloc
+			nodreg(&n1, types[TUINT32], D_CX);
+			regalloc(&n1, types[TUINT32], &n1);		// to hold the shift type in CX
+			split64(&nt, &lo, &hi);
+			gmove(&lo, &n1);
+			splitclean();
+		}
 	} else {
-		gmove(ncon(0), &n2);
+		if(nr->type->width > 4) {
+			// delayed reg alloc
+			nodreg(&n1, types[TUINT32], D_CX);
+			regalloc(&n1, types[TUINT32], &n1);		// to hold the shift type in CX
+			split64(&nt, &lo, &hi);
+			gmove(&lo, &n1);
+			gins(optoas(OCMP, types[TUINT32]), &hi, ncon(0));
+			p2 = gbranch(optoas(ONE, types[TUINT32]), T, +1);
+			gins(optoas(OCMP, types[TUINT32]), &n1, ncon(w));
+			p1 = gbranch(optoas(OLT, types[TUINT32]), T, +1);
+			splitclean();
+			patch(p2, pc);
+		} else {
+			gins(optoas(OCMP, nr->type), &n1, ncon(w));
+			p1 = gbranch(optoas(OLT, types[TUINT32]), T, +1);
+		}
+		if(op == ORSH && issigned[nl->type->etype]) {
+			gins(a, ncon(w-1), &n2);
+		} else {
+			gmove(ncon(0), &n2);
+		}
+		patch(p1, pc);
 	}
-	patch(p1, pc);
 	gins(a, &n1, &n2);
 
 	if(oldcx.op != 0)
@@ -724,444 +780,361 @@ cgen_shift(int op, Node *nl, Node *nr, Node *res)
 /*
  * generate byte multiply:
  *	res = nl * nr
- * no byte multiply instruction so have to do
- * 16-bit multiply and take bottom half.
+ * there is no 2-operand byte multiply instruction so
+ * we do a full-width multiplication and truncate afterwards.
  */
 void
 cgen_bmul(int op, Node *nl, Node *nr, Node *res)
 {
-	Node n1b, n2b, n1w, n2w;
+	Node n1, n2, nt, *tmp;
 	Type *t;
 	int a;
 
-	if(nl->ullman >= nr->ullman) {
-		regalloc(&n1b, nl->type, res);
-		cgen(nl, &n1b);
-		regalloc(&n2b, nr->type, N);
-		cgen(nr, &n2b);
-	} else {
-		regalloc(&n2b, nr->type, N);
-		cgen(nr, &n2b);
-		regalloc(&n1b, nl->type, res);
-		cgen(nl, &n1b);
-	}
-
-	// copy from byte to short registers
-	t = types[TUINT16];
+	// copy from byte to full registers
+	t = types[TUINT32];
 	if(issigned[nl->type->etype])
-		t = types[TINT16];
-
-	regalloc(&n2w, t, &n2b);
-	cgen(&n2b, &n2w);
+		t = types[TINT32];
 
-	regalloc(&n1w, t, &n1b);
-	cgen(&n1b, &n1w);
+	// largest ullman on left.
+	if(nl->ullman < nr->ullman) {
+		tmp = nl;
+		nl = nr;
+		nr = tmp;
+	}
 
+	tempname(&nt, nl->type);
+	cgen(nl, &nt);
+	regalloc(&n1, t, res);
+	cgen(nr, &n1);
+	regalloc(&n2, t, N);
+	gmove(&nt, &n2);
 	a = optoas(op, t);
-	gins(a, &n2w, &n1w);
-	cgen(&n1w, &n1b);
-	cgen(&n1b, res);
-
-	regfree(&n1w);
-	regfree(&n2w);
-	regfree(&n1b);
-	regfree(&n2b);
+	gins(a, &n2, &n1);
+	regfree(&n2);
+	gmove(&n1, res);
+	regfree(&n1);
 }
 
-static int
-regcmp(const void *va, const void *vb)
+/*
+ * generate high multiply:
+ *   res = (nl*nr) >> width
+ */
+void
+cgen_hmul(Node *nl, Node *nr, Node *res)
 {
-	Node *ra, *rb;
-
-	ra = (Node*)va;
-	rb = (Node*)vb;
-	return ra->local - rb->local;
-}
+	Type *t;
+	int a;
+	Node n1, n2, ax, dx;
 
-static	Prog*	throwpc;
+	t = nl->type;
+	a = optoas(OHMUL, t);
+	// gen nl in n1.
+	tempname(&n1, t);
+	cgen(nl, &n1);
+	// gen nr in n2.
+	regalloc(&n2, t, res);
+	cgen(nr, &n2);
+
+	// multiply.
+	nodreg(&ax, t, D_AX);
+	gmove(&n2, &ax);
+	gins(a, &n1, N);
+	regfree(&n2);
 
-// We're only going to bother inlining if we can
-// convert all the arguments to 32 bits safely.  Can we?
-static int
-fix64(NodeList *nn, int n)
-{
-	NodeList *l;
-	Node *r;
-	int i;
-	
-	l = nn;
-	for(i=0; i<n; i++) {
-		r = l->n->right;
-		if(is64(r->type) && !smallintconst(r)) {
-			if(r->op == OCONV)
-				r = r->left;
-			if(is64(r->type))
-				return 0;
-		}
-		l = l->next;
+	if(t->width == 1) {
+		// byte multiply behaves differently.
+		nodreg(&ax, t, D_AH);
+		nodreg(&dx, t, D_DL);
+		gmove(&ax, &dx);
 	}
-	return 1;
+	nodreg(&dx, t, D_DX);
+	gmove(&dx, res);
 }
 
+static void cgen_float387(Node *n, Node *res);
+static void cgen_floatsse(Node *n, Node *res);
+
+/*
+ * generate floating-point operation.
+ */
 void
-getargs(NodeList *nn, Node *reg, int n)
+cgen_float(Node *n, Node *res)
 {
-	NodeList *l;
-	Node *r;
-	int i;
-
-	throwpc = nil;
-
-	l = nn;
-	for(i=0; i<n; i++) {
-		r = l->n->right;
-		if(is64(r->type)) {
-			if(r->op == OCONV)
-				r = r->left;
-			else if(smallintconst(r))
-				r->type = types[TUINT32];
-			if(is64(r->type))
-				fatal("getargs");
+	Node *nl;
+	Node n1, n2;
+	Prog *p1, *p2, *p3;
+
+	nl = n->left;
+	switch(n->op) {
+	case OEQ:
+	case ONE:
+	case OLT:
+	case OLE:
+	case OGE:
+		p1 = gbranch(AJMP, T, 0);
+		p2 = pc;
+		gmove(nodbool(1), res);
+		p3 = gbranch(AJMP, T, 0);
+		patch(p1, pc);
+		bgen(n, 1, 0, p2);
+		gmove(nodbool(0), res);
+		patch(p3, pc);
+		return;
+
+	case OPLUS:
+		cgen(nl, res);
+		return;
+
+	case OCONV:
+		if(eqtype(n->type, nl->type) || noconv(n->type, nl->type)) {
+			cgen(nl, res);
+			return;
 		}
-		if(!smallintconst(r) && !isslice(r->type)) {
-			if(i < 3)	// AX CX DX
-				nodreg(reg+i, r->type, D_AX+i);
-			else
-				reg[i].op = OXXX;
-			regalloc(reg+i, r->type, reg+i);
-			cgen(r, reg+i);
-		} else
-			reg[i] = *r;
-		if(reg[i].local != 0)
-			yyerror("local used");
-		reg[i].local = l->n->left->xoffset;
-		l = l->next;
+
+		tempname(&n2, n->type);
+		mgen(nl, &n1, res);
+		gmove(&n1, &n2);
+		gmove(&n2, res);
+		mfree(&n1);
+		return;
 	}
-	qsort((void*)reg, n, sizeof(*reg), regcmp);
-	for(i=0; i<n; i++)
-		reg[i].local = 0;
+
+	if(use_sse)
+		cgen_floatsse(n, res);
+	else
+		cgen_float387(n, res);
 }
 
-void
-cmpandthrow(Node *nl, Node *nr)
+// floating-point.  387 (not SSE2)
+static void
+cgen_float387(Node *n, Node *res)
 {
-	vlong cl;
-	Prog *p1;
-	int op;
-	Node *c, n1;
-	Type *t;
+	Node f0, f1;
+	Node *nl, *nr;
 
-	op = OLE;
-	if(smallintconst(nl)) {
-		cl = mpgetfix(nl->val.u.xval);
-		if(cl == 0)
-			return;
-		if(smallintconst(nr))
-			return;
-		// put the constant on the right
-		op = brrev(op);
-		c = nl;
-		nl = nr;
-		nr = c;
-	}
-	
-	// Arguments are known not to be 64-bit,
-	// but they might be smaller than 32 bits.
-	// Check if we need to use a temporary.
-	// At least one of the arguments is 32 bits
-	// (the len or cap) so one temporary suffices.
-	n1.op = OXXX;
-	t = types[TUINT32];
-	if(nl->type->width != t->width) {
-		regalloc(&n1, t, nl);
-		gmove(nl, &n1);
-		nl = &n1;
-	} else if(nr->type->width != t->width) {
-		regalloc(&n1, t, nr);
-		gmove(nr, &n1);
-		nr = &n1;
-	}
-	gins(optoas(OCMP, t), nl, nr);
-	if(n1.op != OXXX)
-		regfree(&n1);
-	if(throwpc == nil) {
-		p1 = gbranch(optoas(op, t), T);
-		throwpc = pc;
-		ginscall(panicslice, 0);
-		patch(p1, pc);
+	nl = n->left;
+	nr = n->right;
+	nodreg(&f0, nl->type, D_F0);
+	nodreg(&f1, n->type, D_F0+1);
+	if(nr != N)
+		goto flt2;
+
+	// unary
+	cgen(nl, &f0);
+	if(n->op != OCONV && n->op != OPLUS)
+		gins(foptoas(n->op, n->type, 0), N, N);
+	gmove(&f0, res);
+	return;
+
+flt2:	// binary
+	if(nl->ullman >= nr->ullman) {
+		cgen(nl, &f0);
+		if(nr->addable)
+			gins(foptoas(n->op, n->type, 0), nr, &f0);
+		else {
+			cgen(nr, &f0);
+			gins(foptoas(n->op, n->type, Fpop), &f0, &f1);
+		}
 	} else {
-		op = brcom(op);
-		p1 = gbranch(optoas(op, t), T);
-		patch(p1, throwpc);
+		cgen(nr, &f0);
+		if(nl->addable)
+			gins(foptoas(n->op, n->type, Frev), nl, &f0);
+		else {
+			cgen(nl, &f0);
+			gins(foptoas(n->op, n->type, Frev|Fpop), &f0, &f1);
+		}
 	}
-}
+	gmove(&f0, res);
+	return;
 
-int
-sleasy(Node *n)
-{
-	if(n->op != ONAME)
-		return 0;
-	if(!n->addable)
-		return 0;
-	return 1;
 }
 
-// generate inline code for
-//	slicearray
-//	sliceslice
-//	arraytoslice
-int
-cgen_inline(Node *n, Node *res)
+static void
+cgen_floatsse(Node *n, Node *res)
 {
-	Node nodes[5];
-	Node n1, n2, nres, ntemp;
-	vlong v;
-	int i, narg, nochk;
-
-	if(n->op != OCALLFUNC)
-		goto no;
-	if(!n->left->addable)
-		goto no;
-	if(n->left->sym == S)
-		goto no;
-	if(n->left->sym->pkg != runtimepkg)
-		goto no;
-	if(strcmp(n->left->sym->name, "slicearray") == 0)
-		goto slicearray;
-	if(strcmp(n->left->sym->name, "sliceslice") == 0) {
-		narg = 4;
-		goto sliceslice;
-	}
-	if(strcmp(n->left->sym->name, "sliceslice1") == 0) {
-		narg = 3;
-		goto sliceslice;
-	}
-	goto no;
-
-slicearray:
-	if(!sleasy(res))
-		goto no;
-	if(!fix64(n->list, 5))
-		goto no;
-	getargs(n->list, nodes, 5);
-
-	// if(hb[3] > nel[1]) goto throw
-	cmpandthrow(&nodes[3], &nodes[1]);
-
-	// if(lb[2] > hb[3]) goto throw
-	cmpandthrow(&nodes[2], &nodes[3]);
-
-	// len = hb[3] - lb[2] (destroys hb)
-	n2 = *res;
-	n2.xoffset += Array_nel;
-	n2.type = types[TUINT32];
-
-	if(smallintconst(&nodes[3]) && smallintconst(&nodes[2])) {
-		v = mpgetfix(nodes[3].val.u.xval) -
-			mpgetfix(nodes[2].val.u.xval);
-		nodconst(&n1, types[TUINT32], v);
-		gins(optoas(OAS, types[TUINT32]), &n1, &n2);
-	} else {
-		regalloc(&n1, types[TUINT32], &nodes[3]);
-		gmove(&nodes[3], &n1);
-		if(!smallintconst(&nodes[2]) || mpgetfix(nodes[2].val.u.xval) != 0)
-			gins(optoas(OSUB, types[TUINT32]), &nodes[2], &n1);
-		gins(optoas(OAS, types[TUINT32]), &n1, &n2);
-		regfree(&n1);
-	}
+	Node *nl, *nr, *r;
+	Node n1, n2, nt;
+	int a;
 
-	// cap = nel[1] - lb[2] (destroys nel)
-	n2 = *res;
-	n2.xoffset += Array_cap;
-	n2.type = types[TUINT32];
+	nl = n->left;
+	nr = n->right;
+	switch(n->op) {
+	default:
+		dump("cgen_floatsse", n);
+		fatal("cgen_floatsse %O", n->op);
+		return;
 
-	if(smallintconst(&nodes[1]) && smallintconst(&nodes[2])) {
-		v = mpgetfix(nodes[1].val.u.xval) -
-			mpgetfix(nodes[2].val.u.xval);
-		nodconst(&n1, types[TUINT32], v);
-		gins(optoas(OAS, types[TUINT32]), &n1, &n2);
-	} else {
-		regalloc(&n1, types[TUINT32], &nodes[1]);
-		gmove(&nodes[1], &n1);
-		if(!smallintconst(&nodes[2]) || mpgetfix(nodes[2].val.u.xval) != 0)
-			gins(optoas(OSUB, types[TUINT32]), &nodes[2], &n1);
-		gins(optoas(OAS, types[TUINT32]), &n1, &n2);
-		regfree(&n1);
+	case OMINUS:
+	case OCOM:
+		nr = nodintconst(-1);
+		convlit(&nr, n->type);
+		a = foptoas(OMUL, nl->type, 0);
+		goto sbop;
+
+	// symmetric binary
+	case OADD:
+	case OMUL:
+		a = foptoas(n->op, nl->type, 0);
+		goto sbop;
+
+	// asymmetric binary
+	case OSUB:
+	case OMOD:
+	case ODIV:
+		a = foptoas(n->op, nl->type, 0);
+		goto abop;
 	}
 
-	// if slice could be too big, dereference to
-	// catch nil array pointer.
-	if(nodes[0].op == OREGISTER && nodes[0].type->type->width >= unmappedzero) {
-		n2 = nodes[0];
-		n2.xoffset = 0;
-		n2.op = OINDREG;
-		n2.type = types[TUINT8];
-		gins(ATESTB, nodintconst(0), &n2);
+sbop:	// symmetric binary
+	if(nl->ullman < nr->ullman || nl->op == OLITERAL) {
+		r = nl;
+		nl = nr;
+		nr = r;
 	}
 
-	// ary = old[0] + (lb[2] * width[4]) (destroys old)
-	n2 = *res;
-	n2.xoffset += Array_array;
-	n2.type = types[tptr];
-
-	if(smallintconst(&nodes[2]) && smallintconst(&nodes[4])) {
-		v = mpgetfix(nodes[2].val.u.xval) *
-			mpgetfix(nodes[4].val.u.xval);
-		if(v != 0) {
-			nodconst(&n1, types[tptr], v);
-			gins(optoas(OADD, types[tptr]), &n1, &nodes[0]);
-		}
+abop:	// asymmetric binary
+	if(nl->ullman >= nr->ullman) {
+		tempname(&nt, nl->type);
+		cgen(nl, &nt);
+		mgen(nr, &n2, N);
+		regalloc(&n1, nl->type, res);
+		gmove(&nt, &n1);
+		gins(a, &n2, &n1);
+		gmove(&n1, res);
+		regfree(&n1);
+		mfree(&n2);
 	} else {
-		regalloc(&n1, types[tptr], &nodes[2]);
-		gmove(&nodes[2], &n1);
-		if(!smallintconst(&nodes[4]) || mpgetfix(nodes[4].val.u.xval) != 1)
-			gins(optoas(OMUL, types[tptr]), &nodes[4], &n1);
-		gins(optoas(OADD, types[tptr]), &n1, &nodes[0]);
+		regalloc(&n2, nr->type, res);
+		cgen(nr, &n2);
+		regalloc(&n1, nl->type, N);
+		cgen(nl, &n1);
+		gins(a, &n2, &n1);
+		regfree(&n2);
+		gmove(&n1, res);
 		regfree(&n1);
 	}
-	gins(optoas(OAS, types[tptr]), &nodes[0], &n2);
-
-	for(i=0; i<5; i++) {
-		if(nodes[i].op == OREGISTER)
-			regfree(&nodes[i]);
-	}
-	return 1;
+	return;
+}
 
-sliceslice:
-	if(!fix64(n->list, narg))
-		goto no;
-	nochk = n->etype;  // skip bounds checking
-	ntemp.op = OXXX;
-	if(!sleasy(n->list->n->right)) {
-		Node *n0;
-		
-		n0 = n->list->n->right;
-		tempname(&ntemp, res->type);
-		cgen(n0, &ntemp);
-		n->list->n->right = &ntemp;
-		getargs(n->list, nodes, narg);
-		n->list->n->right = n0;
-	} else
-		getargs(n->list, nodes, narg);
+void
+bgen_float(Node *n, int true, int likely, Prog *to)
+{
+	int et, a;
+	Node *nl, *nr, *r;
+	Node n1, n2, n3, tmp, t1, t2, ax;
+	Prog *p1, *p2;
 
-	nres = *res;		// result
-	if(!sleasy(res)) {
-		if(ntemp.op == OXXX)
-			tempname(&ntemp, res->type);
-		nres = ntemp;
+	nl = n->left;
+	nr = n->right;
+	a = n->op;
+	if(!true) {
+		// brcom is not valid on floats when NaN is involved.
+		p1 = gbranch(AJMP, T, 0);
+		p2 = gbranch(AJMP, T, 0);
+		patch(p1, pc);
+		// No need to avoid re-genning ninit.
+		bgen_float(n, 1, -likely, p2);
+		patch(gbranch(AJMP, T, 0), to);
+		patch(p2, pc);
+		return;
 	}
 
-	if(narg == 3) {	// old[lb:]
-		// move width to where it would be for old[lb:hb]
-		nodes[3] = nodes[2];
-		nodes[2].op = OXXX;
-		
-		// if(lb[1] > old.nel[0]) goto throw;
-		n2 = nodes[0];
-		n2.xoffset += Array_nel;
-		n2.type = types[TUINT32];
-		if(!nochk)
-			cmpandthrow(&nodes[1], &n2);
-
-		// ret.nel = old.nel[0]-lb[1];
-		n2 = nodes[0];
-		n2.xoffset += Array_nel;
-		n2.type = types[TUINT32];
-	
-		regalloc(&n1, types[TUINT32], N);
-		gins(optoas(OAS, types[TUINT32]), &n2, &n1);
-		if(!smallintconst(&nodes[1]) || mpgetfix(nodes[1].val.u.xval) != 0)
-			gins(optoas(OSUB, types[TUINT32]), &nodes[1], &n1);
-	
-		n2 = nres;
-		n2.xoffset += Array_nel;
-		n2.type = types[TUINT32];
-		gins(optoas(OAS, types[TUINT32]), &n1, &n2);
-		regfree(&n1);
-	} else {	// old[lb:hb]
-		n2 = nodes[0];
-		n2.xoffset += Array_cap;
-		n2.type = types[TUINT32];
-		if (!nochk) {
-			// if(hb[2] > old.cap[0]) goto throw;
-			cmpandthrow(&nodes[2], &n2);
-			// if(lb[1] > hb[2]) goto throw;
-			cmpandthrow(&nodes[1], &nodes[2]);
-		}
-
-		// ret.len = hb[2]-lb[1]; (destroys hb[2])
-		n2 = nres;
-		n2.xoffset += Array_nel;
-		n2.type = types[TUINT32];
-
-		if(smallintconst(&nodes[2]) && smallintconst(&nodes[1])) {
-			v = mpgetfix(nodes[2].val.u.xval) -
-				mpgetfix(nodes[1].val.u.xval);
-			nodconst(&n1, types[TUINT32], v);
-			gins(optoas(OAS, types[TUINT32]), &n1, &n2);
-		} else {
-			regalloc(&n1, types[TUINT32], &nodes[2]);
-			gmove(&nodes[2], &n1);
-			if(!smallintconst(&nodes[1]) || mpgetfix(nodes[1].val.u.xval) != 0)
-				gins(optoas(OSUB, types[TUINT32]), &nodes[1], &n1);
-			gins(optoas(OAS, types[TUINT32]), &n1, &n2);
-			regfree(&n1);
-		}
+	if(use_sse)
+		goto sse;
+	else
+		goto x87;
+
+x87:
+	a = brrev(a);	// because the args are stacked
+	if(a == OGE || a == OGT) {
+		// only < and <= work right with NaN; reverse if needed
+		r = nr;
+		nr = nl;
+		nl = r;
+		a = brrev(a);
 	}
 
-	// ret.cap = old.cap[0]-lb[1]; (uses hb[2])
-	n2 = nodes[0];
-	n2.xoffset += Array_cap;
-	n2.type = types[TUINT32];
-
-	regalloc(&n1, types[TUINT32], &nodes[2]);
-	gins(optoas(OAS, types[TUINT32]), &n2, &n1);
-	if(!smallintconst(&nodes[1]) || mpgetfix(nodes[1].val.u.xval) != 0)
-		gins(optoas(OSUB, types[TUINT32]), &nodes[1], &n1);
-
-	n2 = nres;
-	n2.xoffset += Array_cap;
-	n2.type = types[TUINT32];
-	gins(optoas(OAS, types[TUINT32]), &n1, &n2);
-	regfree(&n1);
-
-	// ret.array = old.array[0]+lb[1]*width[3]; (uses lb[1])
-	n2 = nodes[0];
-	n2.xoffset += Array_array;
-	n2.type = types[tptr];
-
-	regalloc(&n1, types[tptr], &nodes[1]);
-	if(smallintconst(&nodes[1]) && smallintconst(&nodes[3])) {
-		gins(optoas(OAS, types[tptr]), &n2, &n1);
-		v = mpgetfix(nodes[1].val.u.xval) *
-			mpgetfix(nodes[3].val.u.xval);
-		if(v != 0) {
-			nodconst(&n2, types[tptr], v);
-			gins(optoas(OADD, types[tptr]), &n2, &n1);
+	nodreg(&tmp, nr->type, D_F0);
+	nodreg(&n2, nr->type, D_F0 + 1);
+	nodreg(&ax, types[TUINT16], D_AX);
+	et = simsimtype(nr->type);
+	if(et == TFLOAT64) {
+		if(nl->ullman > nr->ullman) {
+			cgen(nl, &tmp);
+			cgen(nr, &tmp);
+			gins(AFXCHD, &tmp, &n2);
+		} else {
+			cgen(nr, &tmp);
+			cgen(nl, &tmp);
 		}
+		gins(AFUCOMIP, &tmp, &n2);
+		gins(AFMOVDP, &tmp, &tmp);	// annoying pop but still better than STSW+SAHF
 	} else {
-		gmove(&nodes[1], &n1);
-		if(!smallintconst(&nodes[3]) || mpgetfix(nodes[3].val.u.xval) != 1)
-			gins(optoas(OMUL, types[tptr]), &nodes[3], &n1);
-		gins(optoas(OADD, types[tptr]), &n2, &n1);
+		// TODO(rsc): The moves back and forth to memory
+		// here are for truncating the value to 32 bits.
+		// This handles 32-bit comparison but presumably
+		// all the other ops have the same problem.
+		// We need to figure out what the right general
+		// solution is, besides telling people to use float64.
+		tempname(&t1, types[TFLOAT32]);
+		tempname(&t2, types[TFLOAT32]);
+		cgen(nr, &t1);
+		cgen(nl, &t2);
+		gmove(&t2, &tmp);
+		gins(AFCOMFP, &t1, &tmp);
+		gins(AFSTSW, N, &ax);
+		gins(ASAHF, N, N);
 	}
 
-	n2 = nres;
-	n2.xoffset += Array_array;
-	n2.type = types[tptr];
-	gins(optoas(OAS, types[tptr]), &n1, &n2);
-	regfree(&n1);
+	goto ret;
 
-	for(i=0; i<4; i++) {
-		if(nodes[i].op == OREGISTER)
-			regfree(&nodes[i]);
+sse:
+	if(!nl->addable) {
+		tempname(&n1, nl->type);
+		cgen(nl, &n1);
+		nl = &n1;
+	}
+	if(!nr->addable) {
+		tempname(&tmp, nr->type);
+		cgen(nr, &tmp);
+		nr = &tmp;
+	}
+	regalloc(&n2, nr->type, N);
+	gmove(nr, &n2);
+	nr = &n2;
+
+	if(nl->op != OREGISTER) {
+		regalloc(&n3, nl->type, N);
+		gmove(nl, &n3);
+		nl = &n3;
 	}
 
-	if(!sleasy(res)) {
-		cgen(&nres, res);
+	if(a == OGE || a == OGT) {
+		// only < and <= work right with NaN; reverse if needed
+		r = nr;
+		nr = nl;
+		nl = r;
+		a = brrev(a);
 	}
-	return 1;
 
-no:
-	return 0;
+	gins(foptoas(OCMP, nr->type, 0), nl, nr);
+	if(nl->op == OREGISTER)
+		regfree(nl);
+	regfree(nr);
+
+ret:
+	if(a == OEQ) {
+		// neither NE nor P
+		p1 = gbranch(AJNE, T, -likely);
+		p2 = gbranch(AJPS, T, -likely);
+		patch(gbranch(AJMP, T, 0), to);
+		patch(p1, pc);
+		patch(p2, pc);
+	} else if(a == ONE) {
+		// either NE or P
+		patch(gbranch(AJNE, T, likely), to);
+		patch(gbranch(AJPS, T, likely), to);
+	} else
+		patch(gbranch(optoas(a, nr->type), T, likely), to);
+
 }