1 files changed, 611 insertions, 198 deletions
diff --git a/src/cmd/8g/gsubr.c b/src/cmd/8g/gsubr.c
index 5e89af04a..c4c184bb9 100644
--- a/src/cmd/8g/gsubr.c
+++ b/src/cmd/8g/gsubr.c
@@ -105,16 +105,24 @@ dumpdata(void)
 /*
  * generate a branch.
  * t is ignored.
+ * likely values are for branch prediction:
+ *	-1 unlikely
+ *	0 no opinion
+ *	+1 likely
  */
 Prog*
-gbranch(int as, Type *t)
+gbranch(int as, Type *t, int likely)
 {
 	Prog *p;
 
 	USED(t);
 	p = prog(as);
 	p->to.type = D_BRANCH;
-	p->to.branch = P;
+	p->to.u.branch = P;
+	if(likely != 0) {
+		p->from.type = D_CONST;
+		p->from.offset = likely > 0;
+	}
 	return p;
 }
 
@@ -126,7 +134,7 @@ patch(Prog *p, Prog *to)
 {
 	if(p->to.type != D_BRANCH)
 		fatal("patch: not a branch");
-	p->to.branch = to;
+	p->to.u.branch = to;
 	p->to.offset = to->loc;
 }
 
@@ -137,8 +145,8 @@ unpatch(Prog *p)
 
 	if(p->to.type != D_BRANCH)
 		fatal("unpatch: not a branch");
-	q = p->to.branch;
-	p->to.branch = P;
+	q = p->to.u.branch;
+	p->to.u.branch = P;
 	p->to.offset = 0;
 	return q;
 }
@@ -166,44 +174,6 @@ newplist(void)
 }
 
 void
-clearstk(void)
-{
-	Plist *pl;
-	Prog *p1, *p2;
-	Node sp, di, cx, con, ax;
-
-	if(plast->firstpc->to.offset <= 0)
-		return;
-
-	// reestablish context for inserting code
-	// at beginning of function.
-	pl = plast;
-	p1 = pl->firstpc;
-	p2 = p1->link;
-	pc = mal(sizeof(*pc));
-	clearp(pc);
-	p1->link = pc;
-	
-	// zero stack frame
-	nodreg(&sp, types[tptr], D_SP);
-	nodreg(&di, types[tptr], D_DI);
-	nodreg(&cx, types[TUINT32], D_CX);
-	nodconst(&con, types[TUINT32], p1->to.offset / widthptr);
-	gins(ACLD, N, N);
-	gins(AMOVL, &sp, &di);
-	gins(AMOVL, &con, &cx);
-	nodconst(&con, types[TUINT32], 0);
-	nodreg(&ax, types[TUINT32], D_AX);
-	gins(AMOVL, &con, &ax);
-	gins(AREP, N, N);
-	gins(ASTOSL, N, N);
-
-	// continue with original code.
-	gins(ANOP, N, N)->link = p2;
-	pc = P;
-}	
-
-void
 gused(Node *n)
 {
 	gins(ANOP, n, N);	// used
@@ -214,22 +184,23 @@ gjmp(Prog *to)
 {
 	Prog *p;
 
-	p = gbranch(AJMP, T);
+	p = gbranch(AJMP, T, 0);
 	if(to != P)
 		patch(p, to);
 	return p;
 }
 
 void
-ggloblnod(Node *nam, int32 width)
+ggloblnod(Node *nam)
 {
 	Prog *p;
 
 	p = gins(AGLOBL, nam, N);
 	p->lineno = nam->lineno;
+	p->from.gotype = ngotype(nam);
 	p->to.sym = S;
 	p->to.type = D_CONST;
-	p->to.offset = width;
+	p->to.offset = nam->type->width;
 	if(nam->readonly)
 		p->from.scale = RODATA;
 	if(nam->type != T && !haspointers(nam->type))
@@ -237,7 +208,7 @@ ggloblnod(Node *nam, int32 width)
 }
 
 void
-ggloblsym(Sym *s, int32 width, int dupok)
+ggloblsym(Sym *s, int32 width, int dupok, int rodata)
 {
 	Prog *p;
 
@@ -249,8 +220,20 @@ ggloblsym(Sym *s, int32 width, int dupok)
 	p->to.index = D_NONE;
 	p->to.offset = width;
 	if(dupok)
-		p->from.scale = DUPOK;
-	p->from.scale |= RODATA;
+		p->from.scale |= DUPOK;
+	if(rodata)
+		p->from.scale |= RODATA;
+}
+
+void
+gtrack(Sym *s)
+{
+	Prog *p;
+	
+	p = gins(AUSEFIELD, N, N);
+	p->from.type = D_EXTERN;
+	p->from.index = D_NONE;
+	p->from.sym = s;
 }
 
 int
@@ -273,11 +256,12 @@ isfat(Type *t)
  * call afunclit to fix up the argument.
  */
 void
-afunclit(Addr *a)
+afunclit(Addr *a, Node *n)
 {
 	if(a->type == D_ADDR && a->index == D_EXTERN) {
 		a->type = D_EXTERN;
 		a->index = D_NONE;
+		a->sym = n->sym;
 	}
 }
 
@@ -572,6 +556,22 @@ optoas(int op, Type *t)
 		a = AXORL;
 		break;
 
+	case CASE(OLROT, TINT8):
+	case CASE(OLROT, TUINT8):
+		a = AROLB;
+		break;
+
+	case CASE(OLROT, TINT16):
+	case CASE(OLROT, TUINT16):
+		a = AROLW;
+		break;
+
+	case CASE(OLROT, TINT32):
+	case CASE(OLROT, TUINT32):
+	case CASE(OLROT, TPTR32):
+		a = AROLL;
+		break;
+
 	case CASE(OLSH, TINT8):
 	case CASE(OLSH, TUINT8):
 		a = ASHLB;
@@ -613,22 +613,38 @@ optoas(int op, Type *t)
 		a = ASARL;
 		break;
 
+	case CASE(OHMUL, TINT8):
 	case CASE(OMUL, TINT8):
 	case CASE(OMUL, TUINT8):
 		a = AIMULB;
 		break;
 
+	case CASE(OHMUL, TINT16):
 	case CASE(OMUL, TINT16):
 	case CASE(OMUL, TUINT16):
 		a = AIMULW;
 		break;
 
+	case CASE(OHMUL, TINT32):
 	case CASE(OMUL, TINT32):
 	case CASE(OMUL, TUINT32):
 	case CASE(OMUL, TPTR32):
 		a = AIMULL;
 		break;
 
+	case CASE(OHMUL, TUINT8):
+		a = AMULB;
+		break;
+
+	case CASE(OHMUL, TUINT16):
+		a = AMULW;
+		break;
+
+	case CASE(OHMUL, TUINT32):
+	case CASE(OHMUL, TPTR32):
+		a = AMULL;
+		break;
+
 	case CASE(ODIV, TINT8):
 	case CASE(OMOD, TINT8):
 		a = AIDIVB;
@@ -676,10 +692,14 @@ optoas(int op, Type *t)
 int
 foptoas(int op, Type *t, int flg)
 {
-	int et;
+	int et, a;
 
+	a = AGOK;
 	et = simtype[t->etype];
 
+	if(use_sse)
+		goto sse;
+
 	// If we need Fpop, it means we're working on
 	// two different floating-point registers, not memory.
 	// There the instruction only has a float64 form.
@@ -756,8 +776,65 @@ foptoas(int op, Type *t, int flg)
 
 	fatal("foptoas %O %T %#x", op, t, flg);
 	return 0;
+
+sse:
+	switch(CASE(op, et)) {
+	default:
+		fatal("foptoas-sse: no entry %O-%T", op, t);
+		break;
+
+	case CASE(OCMP, TFLOAT32):
+		a = AUCOMISS;
+		break;
+
+	case CASE(OCMP, TFLOAT64):
+		a = AUCOMISD;
+		break;
+
+	case CASE(OAS, TFLOAT32):
+		a = AMOVSS;
+		break;
+
+	case CASE(OAS, TFLOAT64):
+		a = AMOVSD;
+		break;
+
+	case CASE(OADD, TFLOAT32):
+		a = AADDSS;
+		break;
+
+	case CASE(OADD, TFLOAT64):
+		a = AADDSD;
+		break;
+
+	case CASE(OSUB, TFLOAT32):
+		a = ASUBSS;
+		break;
+
+	case CASE(OSUB, TFLOAT64):
+		a = ASUBSD;
+		break;
+
+	case CASE(OMUL, TFLOAT32):
+		a = AMULSS;
+		break;
+
+	case CASE(OMUL, TFLOAT64):
+		a = AMULSD;
+		break;
+
+	case CASE(ODIV, TFLOAT32):
+		a = ADIVSS;
+		break;
+
+	case CASE(ODIV, TFLOAT64):
+		a = ADIVSD;
+		break;
+	}
+	return a;
 }
 
+
 static	int	resvd[] =
 {
 //	D_DI,	// for movstring
@@ -779,7 +856,9 @@ ginit(void)
 
 	for(i=0; i<nelem(reg); i++)
 		reg[i] = 1;
-	for(i=D_AL; i<=D_DI; i++)
+	for(i=D_AX; i<=D_DI; i++)
+		reg[i] = 0;
+	for(i=D_X0; i<=D_X7; i++)
 		reg[i] = 0;
 	for(i=0; i<nelem(resvd); i++)
 		reg[resvd[i]]++;
@@ -795,9 +874,12 @@ gclean(void)
 	for(i=0; i<nelem(resvd); i++)
 		reg[resvd[i]]--;
 
-	for(i=D_AL; i<=D_DI; i++)
+	for(i=D_AX; i<=D_DI; i++)
 		if(reg[i])
 			yyerror("reg %R left allocated at %ux", i, regpc[i]);
+	for(i=D_X0; i<=D_X7; i++)
+		if(reg[i])
+			yyerror("reg %R left allocated\n", i);
 }
 
 int32
@@ -805,7 +887,7 @@ anyregalloc(void)
 {
 	int i, j;
 
-	for(i=D_AL; i<=D_DI; i++) {
+	for(i=D_AX; i<=D_DI; i++) {
 		if(reg[i] == 0)
 			goto ok;
 		for(j=0; j<nelem(resvd); j++)
@@ -814,6 +896,9 @@ anyregalloc(void)
 		return 1;
 	ok:;
 	}
+	for(i=D_X0; i<=D_X7; i++)
+		if(reg[i])
+			return 1;
 	return 0;
 }
 
@@ -832,14 +917,16 @@ regalloc(Node *n, Type *t, Node *o)
 	et = simtype[t->etype];
 
 	switch(et) {
+	case TINT64:
+	case TUINT64:
+		fatal("regalloc64");
+
 	case TINT8:
 	case TUINT8:
 	case TINT16:
 	case TUINT16:
 	case TINT32:
 	case TUINT32:
-	case TINT64:
-	case TUINT64:
 	case TPTR32:
 	case TPTR64:
 	case TBOOL:
@@ -860,8 +947,22 @@ regalloc(Node *n, Type *t, Node *o)
 
 	case TFLOAT32:
 	case TFLOAT64:
-		i = D_F0;
-		goto out;
+		if(!use_sse) {
+			i = D_F0;
+			goto out;
+		}
+		if(o != N && o->op == OREGISTER) {
+			i = o->val.u.reg;
+			if(i >= D_X0 && i <= D_X7)
+				goto out;
+		}
+		for(i=D_X0; i<=D_X7; i++)
+			if(reg[i] == 0)
+				goto out;
+		fprint(2, "registers allocated at\n");
+		for(i=D_X0; i<=D_X7; i++)
+			fprint(2, "\t%R\t%#lux\n", i, regpc[i]);
+		fatal("out of floating registers");
 	}
 	yyerror("regalloc: unknown type %T", t);
 
@@ -1069,9 +1170,9 @@ split64(Node *n, Node *lo, Node *hi)
 	if(!is64(n->type))
 		fatal("split64 %T", n->type);
 
-	sclean[nsclean].op = OEMPTY;
 	if(nsclean >= nelem(sclean))
 		fatal("split64 clean");
+	sclean[nsclean].op = OEMPTY;
 	nsclean++;
 	switch(n->op) {
 	default:
@@ -1165,13 +1266,16 @@ memname(Node *n, Type *t)
 	n->orig->sym = n->sym;
 }
 
+static void floatmove(Node *f, Node *t);
+static void floatmove_387(Node *f, Node *t);
+static void floatmove_sse(Node *f, Node *t);
+
 void
 gmove(Node *f, Node *t)
 {
 	int a, ft, tt;
 	Type *cvt;
-	Node r1, r2, t1, t2, flo, fhi, tlo, thi, con, f0, f1, ax, dx, cx;
-	Prog *p1, *p2, *p3;
+	Node r1, r2, flo, fhi, tlo, thi, con;
 
 	if(debug['M'])
 		print("gmove %N -> %N\n", f, t);
@@ -1179,11 +1283,15 @@ gmove(Node *f, Node *t)
 	ft = simsimtype(f->type);
 	tt = simsimtype(t->type);
 	cvt = t->type;
-
+	
 	if(iscomplex[ft] || iscomplex[tt]) {
 		complexmove(f, t);
 		return;
 	}
+	if(isfloat[ft] || isfloat[tt]) {
+		floatmove(f, t);
+		return;
+	}
 
 	// cannot have two integer memory operands;
 	// except 64-bit, which always copies via registers anyway.
@@ -1192,19 +1300,9 @@ gmove(Node *f, Node *t)
 
 	// convert constant to desired type
 	if(f->op == OLITERAL) {
-		if(tt == TFLOAT32)
-			convconst(&con, types[TFLOAT64], &f->val);
-		else
-			convconst(&con, t->type, &f->val);
+		convconst(&con, t->type, &f->val);
 		f = &con;
 		ft = simsimtype(con.type);
-
-		// some constants can't move directly to memory.
-		if(ismem(t)) {
-			// float constants come from memory.
-			if(isfloat[tt])
-				goto hard;
-		}
 	}
 
 	// value -> value copy, only one memory operand.
@@ -1380,6 +1478,272 @@ gmove(Node *f, Node *t)
 		gins(AMOVL, ncon(0), &thi);
 		splitclean();
 		return;
+	}
+
+	gins(a, f, t);
+	return;
+
+rsrc:
+	// requires register source
+	regalloc(&r1, f->type, t);
+	gmove(f, &r1);
+	gins(a, &r1, t);
+	regfree(&r1);
+	return;
+
+rdst:
+	// requires register destination
+	regalloc(&r1, t->type, t);
+	gins(a, f, &r1);
+	gmove(&r1, t);
+	regfree(&r1);
+	return;
+
+hard:
+	// requires register intermediate
+	regalloc(&r1, cvt, t);
+	gmove(f, &r1);
+	gmove(&r1, t);
+	regfree(&r1);
+	return;
+
+fatal:
+	// should not happen
+	fatal("gmove %N -> %N", f, t);
+}
+
+static void
+floatmove(Node *f, Node *t)
+{
+	Node r1, r2, t1, t2, tlo, thi, con, f0, f1, ax, dx, cx;
+	Type *cvt;
+	int ft, tt;
+	Prog *p1, *p2, *p3;
+
+	ft = simsimtype(f->type);
+	tt = simsimtype(t->type);
+	cvt = t->type;
+
+	// cannot have two floating point memory operands.
+	if(isfloat[ft] && isfloat[tt] && ismem(f) && ismem(t))
+		goto hard;
+
+	// convert constant to desired type
+	if(f->op == OLITERAL) {
+		convconst(&con, t->type, &f->val);
+		f = &con;
+		ft = simsimtype(con.type);
+
+		// some constants can't move directly to memory.
+		if(ismem(t)) {
+			// float constants come from memory.
+			if(isfloat[tt])
+				goto hard;
+		}
+	}
+
+	// value -> value copy, only one memory operand.
+	// figure out the instruction to use.
+	// break out of switch for one-instruction gins.
+	// goto rdst for "destination must be register".
+	// goto hard for "convert to cvt type first".
+	// otherwise handle and return.
+
+	switch(CASE(ft, tt)) {
+	default:
+		if(use_sse)
+			floatmove_sse(f, t);
+		else
+			floatmove_387(f, t);
+		return;
+
+	// float to very long integer.
+	case CASE(TFLOAT32, TINT64):
+	case CASE(TFLOAT64, TINT64):
+		if(f->op == OREGISTER) {
+			cvt = f->type;
+			goto hardmem;
+		}
+		nodreg(&r1, types[ft], D_F0);
+		if(ft == TFLOAT32)
+			gins(AFMOVF, f, &r1);
+		else
+			gins(AFMOVD, f, &r1);
+
+		// set round to zero mode during conversion
+		memname(&t1, types[TUINT16]);
+		memname(&t2, types[TUINT16]);
+		gins(AFSTCW, N, &t1);
+		gins(AMOVW, ncon(0xf7f), &t2);
+		gins(AFLDCW, &t2, N);
+		if(tt == TINT16)
+			gins(AFMOVWP, &r1, t);
+		else if(tt == TINT32)
+			gins(AFMOVLP, &r1, t);
+		else
+			gins(AFMOVVP, &r1, t);
+		gins(AFLDCW, &t1, N);
+		return;
+
+	case CASE(TFLOAT32, TUINT64):
+	case CASE(TFLOAT64, TUINT64):
+		if(!ismem(f)) {
+			cvt = f->type;
+			goto hardmem;
+		}
+		bignodes();
+		nodreg(&f0, types[ft], D_F0);
+		nodreg(&f1, types[ft], D_F0 + 1);
+		nodreg(&ax, types[TUINT16], D_AX);
+
+		if(ft == TFLOAT32)
+			gins(AFMOVF, f, &f0);
+		else
+			gins(AFMOVD, f, &f0);
+
+		// if 0 > v { answer = 0 }
+		gins(AFMOVD, &zerof, &f0);
+		gins(AFUCOMIP, &f0, &f1);
+		p1 = gbranch(optoas(OGT, types[tt]), T, 0);
+		// if 1<<64 <= v { answer = 0 too }
+		gins(AFMOVD, &two64f, &f0);
+		gins(AFUCOMIP, &f0, &f1);
+		p2 = gbranch(optoas(OGT, types[tt]), T, 0);
+		patch(p1, pc);
+		gins(AFMOVVP, &f0, t);	// don't care about t, but will pop the stack
+		split64(t, &tlo, &thi);
+		gins(AMOVL, ncon(0), &tlo);
+		gins(AMOVL, ncon(0), &thi);
+		splitclean();
+		p1 = gbranch(AJMP, T, 0);
+		patch(p2, pc);
+
+		// in range; algorithm is:
+		//	if small enough, use native float64 -> int64 conversion.
+		//	otherwise, subtract 2^63, convert, and add it back.
+
+		// set round to zero mode during conversion
+		memname(&t1, types[TUINT16]);
+		memname(&t2, types[TUINT16]);
+		gins(AFSTCW, N, &t1);
+		gins(AMOVW, ncon(0xf7f), &t2);
+		gins(AFLDCW, &t2, N);
+
+		// actual work
+		gins(AFMOVD, &two63f, &f0);
+		gins(AFUCOMIP, &f0, &f1);
+		p2 = gbranch(optoas(OLE, types[tt]), T, 0);
+		gins(AFMOVVP, &f0, t);
+		p3 = gbranch(AJMP, T, 0);
+		patch(p2, pc);
+		gins(AFMOVD, &two63f, &f0);
+		gins(AFSUBDP, &f0, &f1);
+		gins(AFMOVVP, &f0, t);
+		split64(t, &tlo, &thi);
+		gins(AXORL, ncon(0x80000000), &thi);	// + 2^63
+		patch(p3, pc);
+		splitclean();
+		// restore rounding mode
+		gins(AFLDCW, &t1, N);
+
+		patch(p1, pc);
+		return;
+
+	/*
+	 * integer to float
+	 */
+	case CASE(TINT64, TFLOAT32):
+	case CASE(TINT64, TFLOAT64):
+		if(t->op == OREGISTER)
+			goto hardmem;
+		nodreg(&f0, t->type, D_F0);
+		gins(AFMOVV, f, &f0);
+		if(tt == TFLOAT32)
+			gins(AFMOVFP, &f0, t);
+		else
+			gins(AFMOVDP, &f0, t);
+		return;
+
+	case CASE(TUINT64, TFLOAT32):
+	case CASE(TUINT64, TFLOAT64):
+		// algorithm is:
+		//	if small enough, use native int64 -> float64 conversion.
+		//	otherwise, halve (rounding to odd?), convert, and double.
+		nodreg(&ax, types[TUINT32], D_AX);
+		nodreg(&dx, types[TUINT32], D_DX);
+		nodreg(&cx, types[TUINT32], D_CX);
+		tempname(&t1, f->type);
+		split64(&t1, &tlo, &thi);
+		gmove(f, &t1);
+		gins(ACMPL, &thi, ncon(0));
+		p1 = gbranch(AJLT, T, 0);
+		// native
+		t1.type = types[TINT64];
+		nodreg(&r1, types[tt], D_F0);
+		gins(AFMOVV, &t1, &r1);
+		if(tt == TFLOAT32)
+			gins(AFMOVFP, &r1, t);
+		else
+			gins(AFMOVDP, &r1, t);
+		p2 = gbranch(AJMP, T, 0);
+		// simulated
+		patch(p1, pc);
+		gmove(&tlo, &ax);
+		gmove(&thi, &dx);
+		p1 = gins(ASHRL, ncon(1), &ax);
+		p1->from.index = D_DX;	// double-width shift DX -> AX
+		p1->from.scale = 0;
+		gins(AMOVL, ncon(0), &cx);
+		gins(ASETCC, N, &cx);
+		gins(AORL, &cx, &ax);
+		gins(ASHRL, ncon(1), &dx);
+		gmove(&dx, &thi);
+		gmove(&ax, &tlo);
+		nodreg(&r1, types[tt], D_F0);
+		nodreg(&r2, types[tt], D_F0 + 1);
+		gins(AFMOVV, &t1, &r1);
+		gins(AFMOVD, &r1, &r1);
+		gins(AFADDDP, &r1, &r2);
+		if(tt == TFLOAT32)
+			gins(AFMOVFP, &r1, t);
+		else
+			gins(AFMOVDP, &r1, t);
+		patch(p2, pc);
+		splitclean();
+		return;
+	}
+
+hard:
+	// requires register intermediate
+	regalloc(&r1, cvt, t);
+	gmove(f, &r1);
+	gmove(&r1, t);
+	regfree(&r1);
+	return;
+
+hardmem:
+	// requires memory intermediate
+	tempname(&r1, cvt);
+	gmove(f, &r1);
+	gmove(&r1, t);
+	return;
+}
+
+static void
+floatmove_387(Node *f, Node *t)
+{
+	Node r1, t1, t2;
+	Type *cvt;
+	Prog *p1, *p2, *p3;
+	int a, ft, tt;
+
+	ft = simsimtype(f->type);
+	tt = simsimtype(t->type);
+	cvt = t->type;
+
+	switch(CASE(ft, tt)) {
+	default:
+		goto fatal;
 
 	/*
 	* float to integer
@@ -1429,10 +1793,10 @@ gmove(Node *f, Node *t)
 			fatal("gmove %T", t);
 		case TINT8:
 			gins(ACMPL, &t1, ncon(-0x80));
-			p1 = gbranch(optoas(OLT, types[TINT32]), T);
+			p1 = gbranch(optoas(OLT, types[TINT32]), T, -1);
 			gins(ACMPL, &t1, ncon(0x7f));
-			p2 = gbranch(optoas(OGT, types[TINT32]), T);
-			p3 = gbranch(AJMP, T);
+			p2 = gbranch(optoas(OGT, types[TINT32]), T, -1);
+			p3 = gbranch(AJMP, T, 0);
 			patch(p1, pc);
 			patch(p2, pc);
 			gmove(ncon(-0x80), &t1);
@@ -1441,14 +1805,14 @@ gmove(Node *f, Node *t)
 			break;
 		case TUINT8:
 			gins(ATESTL, ncon(0xffffff00), &t1);
-			p1 = gbranch(AJEQ, T);
+			p1 = gbranch(AJEQ, T, +1);
 			gins(AMOVL, ncon(0), &t1);
 			patch(p1, pc);
 			gmove(&t1, t);
 			break;
 		case TUINT16:
 			gins(ATESTL, ncon(0xffff0000), &t1);
-			p1 = gbranch(AJEQ, T);
+			p1 = gbranch(AJEQ, T, +1);
 			gins(AMOVL, ncon(0), &t1);
 			patch(p1, pc);
 			gmove(&t1, t);
@@ -1459,73 +1823,8 @@ gmove(Node *f, Node *t)
 	case CASE(TFLOAT32, TUINT32):
 	case CASE(TFLOAT64, TUINT32):
 		// convert via int64.
-		tempname(&t1, types[TINT64]);
-		gmove(f, &t1);
-		split64(&t1, &tlo, &thi);
-		gins(ACMPL, &thi, ncon(0));
-		p1 = gbranch(AJEQ, T);
-		gins(AMOVL, ncon(0), &tlo);
-		patch(p1, pc);
-		gmove(&tlo, t);
-		splitclean();
-		return;
-
-	case CASE(TFLOAT32, TUINT64):
-	case CASE(TFLOAT64, TUINT64):
-		bignodes();
-		nodreg(&f0, types[ft], D_F0);
-		nodreg(&f1, types[ft], D_F0 + 1);
-		nodreg(&ax, types[TUINT16], D_AX);
-
-		gmove(f, &f0);
-
-		// if 0 > v { answer = 0 }
-		gmove(&zerof, &f0);
-		gins(AFUCOMIP, &f0, &f1);
-		p1 = gbranch(optoas(OGT, types[tt]), T);
-		// if 1<<64 <= v { answer = 0 too }
-		gmove(&two64f, &f0);
-		gins(AFUCOMIP, &f0, &f1);
-		p2 = gbranch(optoas(OGT, types[tt]), T);
-		patch(p1, pc);
-		gins(AFMOVVP, &f0, t);	// don't care about t, but will pop the stack
-		split64(t, &tlo, &thi);
-		gins(AMOVL, ncon(0), &tlo);
-		gins(AMOVL, ncon(0), &thi);
-		splitclean();
-		p1 = gbranch(AJMP, T);
-		patch(p2, pc);
-
-		// in range; algorithm is:
-		//	if small enough, use native float64 -> int64 conversion.
-		//	otherwise, subtract 2^63, convert, and add it back.
-
-		// set round to zero mode during conversion
-		memname(&t1, types[TUINT16]);
-		memname(&t2, types[TUINT16]);
-		gins(AFSTCW, N, &t1);
-		gins(AMOVW, ncon(0xf7f), &t2);
-		gins(AFLDCW, &t2, N);
-
-		// actual work
-		gmove(&two63f, &f0);
-		gins(AFUCOMIP, &f0, &f1);
-		p2 = gbranch(optoas(OLE, types[tt]), T);
-		gins(AFMOVVP, &f0, t);
-		p3 = gbranch(AJMP, T);
-		patch(p2, pc);
-		gmove(&two63f, &f0);
-		gins(AFSUBDP, &f0, &f1);
-		gins(AFMOVVP, &f0, t);
-		split64(t, &tlo, &thi);
-		gins(AXORL, ncon(0x80000000), &thi);	// + 2^63
-		patch(p3, pc);
-		splitclean();
-		// restore rounding mode
-		gins(AFLDCW, &t1, N);
-
-		patch(p1, pc);
-		return;
+		cvt = types[TINT64];
+		goto hardmem;
 
 	/*
 	 * integer to float
@@ -1571,45 +1870,6 @@ gmove(Node *f, Node *t)
 		cvt = types[TINT64];
 		goto hardmem;
 
-	case CASE(TUINT64, TFLOAT32):
-	case CASE(TUINT64, TFLOAT64):
-		// algorithm is:
-		//	if small enough, use native int64 -> uint64 conversion.
-		//	otherwise, halve (rounding to odd?), convert, and double.
-		nodreg(&ax, types[TUINT32], D_AX);
-		nodreg(&dx, types[TUINT32], D_DX);
-		nodreg(&cx, types[TUINT32], D_CX);
-		tempname(&t1, f->type);
-		split64(&t1, &tlo, &thi);
-		gmove(f, &t1);
-		gins(ACMPL, &thi, ncon(0));
-		p1 = gbranch(AJLT, T);
-		// native
-		t1.type = types[TINT64];
-		gmove(&t1, t);
-		p2 = gbranch(AJMP, T);
-		// simulated
-		patch(p1, pc);
-		gmove(&tlo, &ax);
-		gmove(&thi, &dx);
-		p1 = gins(ASHRL, ncon(1), &ax);
-		p1->from.index = D_DX;	// double-width shift DX -> AX
-		p1->from.scale = 0;
-		gins(ASETCC, N, &cx);
-		gins(AORB, &cx, &ax);
-		gins(ASHRL, ncon(1), &dx);
-		gmove(&dx, &thi);
-		gmove(&ax, &tlo);
-		nodreg(&r1, types[tt], D_F0);
-		nodreg(&r2, types[tt], D_F0 + 1);
-		gmove(&t1, &r1);	// t1.type is TINT64 now, set above
-		gins(AFMOVD, &r1, &r1);
-		gins(AFADDDP, &r1, &r2);
-		gmove(&r1, t);
-		patch(p2, pc);
-		splitclean();
-		return;
-
 	/*
 	 * float to float
 	 */
@@ -1673,20 +1933,121 @@ gmove(Node *f, Node *t)
 	gins(a, f, t);
 	return;
 
-rsrc:
-	// requires register source
-	regalloc(&r1, f->type, t);
+hard:
+	// requires register intermediate
+	regalloc(&r1, cvt, t);
 	gmove(f, &r1);
-	gins(a, &r1, t);
+	gmove(&r1, t);
 	regfree(&r1);
 	return;
 
-rdst:
-	// requires register destination
-	regalloc(&r1, t->type, t);
-	gins(a, f, &r1);
+hardmem:
+	// requires memory intermediate
+	tempname(&r1, cvt);
+	gmove(f, &r1);
 	gmove(&r1, t);
-	regfree(&r1);
+	return;
+
+fatal:
+	// should not happen
+	fatal("gmove %lN -> %lN", f, t);
+	return;
+}
+
+static void
+floatmove_sse(Node *f, Node *t)
+{
+	Node r1;
+	Type *cvt;
+	int a, ft, tt;
+
+	ft = simsimtype(f->type);
+	tt = simsimtype(t->type);
+
+	switch(CASE(ft, tt)) {
+	default:
+		// should not happen
+		fatal("gmove %N -> %N", f, t);
+		return;
+	/*
+	* float to integer
+	*/
+	case CASE(TFLOAT32, TINT16):
+	case CASE(TFLOAT32, TINT8):
+	case CASE(TFLOAT32, TUINT16):
+	case CASE(TFLOAT32, TUINT8):
+	case CASE(TFLOAT64, TINT16):
+	case CASE(TFLOAT64, TINT8):
+	case CASE(TFLOAT64, TUINT16):
+	case CASE(TFLOAT64, TUINT8):
+		// convert via int32.
+		cvt = types[TINT32];
+		goto hard;
+
+	case CASE(TFLOAT32, TUINT32):
+	case CASE(TFLOAT64, TUINT32):
+		// convert via int64.
+		cvt = types[TINT64];
+		goto hardmem;
+
+	case CASE(TFLOAT32, TINT32):
+		a = ACVTTSS2SL;
+		goto rdst;
+
+	case CASE(TFLOAT64, TINT32):
+		a = ACVTTSD2SL;
+		goto rdst;
+
+	/*
+	 * integer to float
+	 */
+	case CASE(TINT8, TFLOAT32):
+	case CASE(TINT8, TFLOAT64):
+	case CASE(TINT16, TFLOAT32):
+	case CASE(TINT16, TFLOAT64):
+	case CASE(TUINT16, TFLOAT32):
+	case CASE(TUINT16, TFLOAT64):
+	case CASE(TUINT8, TFLOAT32):
+	case CASE(TUINT8, TFLOAT64):
+		// convert via int32 memory
+		cvt = types[TINT32];
+		goto hard;
+
+	case CASE(TUINT32, TFLOAT32):
+	case CASE(TUINT32, TFLOAT64):
+		// convert via int64 memory
+		cvt = types[TINT64];
+		goto hardmem;
+
+	case CASE(TINT32, TFLOAT32):
+		a = ACVTSL2SS;
+		goto rdst;
+
+	case CASE(TINT32, TFLOAT64):
+		a = ACVTSL2SD;
+		goto rdst;
+
+	/*
+	 * float to float
+	 */
+	case CASE(TFLOAT32, TFLOAT32):
+		a = AMOVSS;
+		break;
+
+	case CASE(TFLOAT64, TFLOAT64):
+		a = AMOVSD;
+		break;
+
+	case CASE(TFLOAT32, TFLOAT64):
+		a = ACVTSS2SD;
+		goto rdst;
+
+	case CASE(TFLOAT64, TFLOAT32):
+		a = ACVTSD2SS;
+		goto rdst;
+	}
+
+	gins(a, f, t);
 	return;
 
 hard:
@@ -1704,9 +2065,13 @@ hardmem:
 	gmove(&r1, t);
 	return;
 
-fatal:
-	// should not happen
-	fatal("gmove %N -> %N", f, t);
+rdst:
+	// requires register destination
+	regalloc(&r1, t->type, t);
+	gins(a, f, &r1);
+	gmove(&r1, t);
+	regfree(&r1);
+	return;
 }
 
 int
@@ -1737,6 +2102,10 @@ gins(int as, Node *f, Node *t)
 
 	if(as == AFMOVF && f && f->op == OREGISTER && t && t->op == OREGISTER)
 		fatal("gins MOVF reg, reg");
+	if(as == ACVTSD2SS && f && f->op == OLITERAL)
+		fatal("gins CVTSD2SS const");
+	if(as == AMOVSD && t && t->op == OREGISTER && t->val.u.reg == D_F0)
+		fatal("gins MOVSD into F0");
 
 	switch(as) {
 	case AMOVB:
@@ -1744,6 +2113,12 @@ gins(int as, Node *f, Node *t)
 	case AMOVL:
 		if(f != N && t != N && samaddr(f, t))
 			return nil;
+		break;
+	
+	case ALEAL:
+		if(f != N && isconst(f, CTNIL))
+			fatal("gins LEAL nil %T", f->type);
+		break;
 	}
 
 	memset(&af, 0, sizeof af);
@@ -1782,6 +2157,25 @@ gins(int as, Node *f, Node *t)
 	return p;
 }
 
+// Generate an instruction referencing *n
+// to force segv on nil pointer dereference.
+void
+checkref(Node *n)
+{
+	Node m;
+
+	if(n->type->type->width < unmappedzero)
+		return;
+
+	regalloc(&m, types[TUINTPTR], n);
+	cgen(n, &m);
+	m.xoffset = 0;
+	m.op = OINDREG;
+	m.type = types[TUINT8];
+	gins(ATESTB, nodintconst(0), &m);
+	regfree(&m);
+}
+
 static void
 checkoffset(Addr *a, int canemitcode)
 {
@@ -1807,6 +2201,8 @@ checkoffset(Addr *a, int canemitcode)
 void
 naddr(Node *n, Addr *a, int canemitcode)
 {
+	Prog *p;
+
 	a->scale = 0;
 	a->index = D_NONE;
 	a->type = D_NONE;
@@ -1842,13 +2238,29 @@ naddr(Node *n, Addr *a, int canemitcode)
 		a->node = n->left->orig;
 		break;
 
+	case OCLOSUREVAR:
+		if(!canemitcode)
+			fatal("naddr OCLOSUREVAR cannot emit code");
+		p = gins(AMOVL, N, N);
+		p->from.type = D_DX+D_INDIR;
+		p->from.offset = n->xoffset;
+		p->to.type = D_BX;
+		a->type = D_BX;
+		a->sym = S;
+		break;
+
+	case OCFUNC:
+		naddr(n->left, a, canemitcode);
+		a->sym = n->left->sym;
+		break;
+
 	case ONAME:
 		a->etype = 0;
 		a->width = 0;
 		if(n->type != T) {
 			a->etype = simtype[n->type->etype];
+			dowidth(n->type);
 			a->width = n->type->width;
-			a->gotype = ngotype(n);
 		}
 		a->offset = n->xoffset;
 		a->sym = n->sym;
@@ -1880,6 +2292,7 @@ naddr(Node *n, Addr *a, int canemitcode)
 		case PFUNC:
 			a->index = D_EXTERN;
 			a->type = D_ADDR;
+			a->sym = funcsym(a->sym);
 			break;
 		}
 		break;
@@ -1891,7 +2304,7 @@ naddr(Node *n, Addr *a, int canemitcode)
 			break;
 		case CTFLT:
 			a->type = D_FCONST;
-			a->dval = mpgetflt(n->val.u.fval);
+			a->u.dval = mpgetflt(n->val.u.fval);
 			break;
 		case CTINT:
 		case CTRUNE: