diff options
Diffstat (limited to 'usr/src/lib/libm/common/m9x/fma.c')
| -rw-r--r-- | usr/src/lib/libm/common/m9x/fma.c | 497 |
1 files changed, 497 insertions, 0 deletions
diff --git a/usr/src/lib/libm/common/m9x/fma.c b/usr/src/lib/libm/common/m9x/fma.c new file mode 100644 index 0000000000..f06349a2c4 --- /dev/null +++ b/usr/src/lib/libm/common/m9x/fma.c @@ -0,0 +1,497 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright 2011 Nexenta Systems, Inc. All rights reserved. + */ +/* + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#if defined(ELFOBJ) +#pragma weak fma = __fma +#endif + +#include "libm.h" +#include "fma.h" +#include "fenv_inlines.h" + +#if defined(__sparc) + +static const union { + unsigned i[2]; + double d; +} C[] = { + { 0x3fe00000u, 0 }, + { 0x40000000u, 0 }, + { 0x43300000u, 0 }, + { 0x41a00000u, 0 }, + { 0x3e500000u, 0 }, + { 0x3df00000u, 0 }, + { 0x3bf00000u, 0 }, + { 0x7fe00000u, 0 }, + { 0x00100000u, 0 }, + { 0x00100001u, 0 } +}; + +#define half C[0].d +#define two C[1].d +#define two52 C[2].d +#define two27 C[3].d +#define twom26 C[4].d +#define twom32 C[5].d +#define twom64 C[6].d +#define huge C[7].d +#define tiny C[8].d +#define tiny2 C[9].d + +static const unsigned int fsr_rm = 0xc0000000u; + +/* + * fma for SPARC: 64-bit double precision, big-endian + */ +double +__fma(double x, double y, double z) { + union { + unsigned i[2]; + double d; + } xx, yy, zz; + double xhi, yhi, xlo, ylo, t; + unsigned int xy0, xy1, xy2, xy3, z0, z1, z2, z3, fsr, rm, sticky; + int hx, hy, hz, ex, ey, ez, exy, sxy, sz, e, ibit; + volatile double dummy; + + /* extract the high order words of the arguments */ + xx.d = x; + yy.d = y; + zz.d = z; + hx = xx.i[0] & ~0x80000000; + hy = yy.i[0] & ~0x80000000; + hz = zz.i[0] & ~0x80000000; + + /* dispense with inf, nan, and zero cases */ + if (hx >= 0x7ff00000 || hy >= 0x7ff00000 || (hx | xx.i[1]) == 0 || + (hy | yy.i[1]) == 0) /* x or y is inf, nan, or zero */ + return (x * y + z); + + if (hz >= 0x7ff00000) /* z is inf or nan */ + return (x + z); /* avoid spurious under/overflow in x * y */ + + if ((hz | zz.i[1]) == 0) /* z is zero */ + /* + * x * y isn't zero but could underflow to zero, + * so don't add z, lest we perturb the sign + */ + return (x * y); + + /* + * now x, y, and z are all finite and nonzero; save the fsr and + * set round-to-negative-infinity mode (and clear nonstandard + * mode before we try to scale subnormal operands) + */ + __fenv_getfsr32(&fsr); + __fenv_setfsr32(&fsr_rm); + + /* extract signs and exponents, and normalize subnormals */ + sxy = (xx.i[0] ^ yy.i[0]) & 0x80000000; + sz = zz.i[0] & 0x80000000; + ex = hx >> 20; + if (!ex) { + xx.d = x * two52; + ex = ((xx.i[0] & ~0x80000000) >> 20) - 52; + } + ey = hy >> 20; + if (!ey) { + yy.d = y * two52; + ey = ((yy.i[0] & ~0x80000000) >> 20) - 52; + } + ez = hz >> 20; + if (!ez) { + zz.d = z * two52; + ez = ((zz.i[0] & ~0x80000000) >> 20) - 52; + } + + /* multiply x*y to 106 bits */ + exy = ex + ey - 0x3ff; + xx.i[0] = (xx.i[0] & 0xfffff) | 0x3ff00000; + yy.i[0] = (yy.i[0] & 0xfffff) | 0x3ff00000; + x = xx.d; + y = yy.d; + xhi = ((x + twom26) + two27) - two27; + yhi = ((y + twom26) + two27) - two27; + xlo = x - xhi; + ylo = y - yhi; + x *= y; + y = ((xhi * yhi - x) + xhi * ylo + xlo * yhi) + xlo * ylo; + if (x >= two) { + x *= half; + y *= half; + exy++; + } + + /* extract the significands */ + xx.d = x; + xy0 = (xx.i[0] & 0xfffff) | 0x100000; + xy1 = xx.i[1]; + yy.d = t = y + twom32; + xy2 = yy.i[1]; + yy.d = (y - (t - twom32)) + twom64; + xy3 = yy.i[1]; + z0 = (zz.i[0] & 0xfffff) | 0x100000; + z1 = zz.i[1]; + z2 = z3 = 0; + + /* + * now x*y is represented by sxy, exy, and xy[0-3], and z is + * represented likewise; swap if need be so |xy| <= |z| + */ + if (exy > ez || (exy == ez && (xy0 > z0 || (xy0 == z0 && + (xy1 > z1 || (xy1 == z1 && (xy2 | xy3) != 0)))))) { + e = sxy; sxy = sz; sz = e; + e = exy; exy = ez; ez = e; + e = xy0; xy0 = z0; z0 = e; + e = xy1; xy1 = z1; z1 = e; + z2 = xy2; xy2 = 0; + z3 = xy3; xy3 = 0; + } + + /* shift the significand of xy keeping a sticky bit */ + e = ez - exy; + if (e > 116) { + xy0 = xy1 = xy2 = 0; + xy3 = 1; + } else if (e >= 96) { + sticky = xy3 | xy2 | xy1 | ((xy0 << 1) << (127 - e)); + xy3 = xy0 >> (e - 96); + if (sticky) + xy3 |= 1; + xy0 = xy1 = xy2 = 0; + } else if (e >= 64) { + sticky = xy3 | xy2 | ((xy1 << 1) << (95 - e)); + xy3 = (xy1 >> (e - 64)) | ((xy0 << 1) << (95 - e)); + if (sticky) + xy3 |= 1; + xy2 = xy0 >> (e - 64); + xy0 = xy1 = 0; + } else if (e >= 32) { + sticky = xy3 | ((xy2 << 1) << (63 - e)); + xy3 = (xy2 >> (e - 32)) | ((xy1 << 1) << (63 - e)); + if (sticky) + xy3 |= 1; + xy2 = (xy1 >> (e - 32)) | ((xy0 << 1) << (63 - e)); + xy1 = xy0 >> (e - 32); + xy0 = 0; + } else if (e) { + sticky = (xy3 << 1) << (31 - e); + xy3 = (xy3 >> e) | ((xy2 << 1) << (31 - e)); + if (sticky) + xy3 |= 1; + xy2 = (xy2 >> e) | ((xy1 << 1) << (31 - e)); + xy1 = (xy1 >> e) | ((xy0 << 1) << (31 - e)); + xy0 >>= e; + } + + /* if this is a magnitude subtract, negate the significand of xy */ + if (sxy ^ sz) { + xy0 = ~xy0; + xy1 = ~xy1; + xy2 = ~xy2; + xy3 = -xy3; + if (xy3 == 0) + if (++xy2 == 0) + if (++xy1 == 0) + xy0++; + } + + /* add, propagating carries */ + z3 += xy3; + e = (z3 < xy3); + z2 += xy2; + if (e) { + z2++; + e = (z2 <= xy2); + } else + e = (z2 < xy2); + z1 += xy1; + if (e) { + z1++; + e = (z1 <= xy1); + } else + e = (z1 < xy1); + z0 += xy0; + if (e) + z0++; + + /* postnormalize and collect rounding information into z2 */ + if (ez < 1) { + /* result is tiny; shift right until exponent is within range */ + e = 1 - ez; + if (e > 56) { + z2 = 1; /* result can't be exactly zero */ + z0 = z1 = 0; + } else if (e >= 32) { + sticky = z3 | z2 | ((z1 << 1) << (63 - e)); + z2 = (z1 >> (e - 32)) | ((z0 << 1) << (63 - e)); + if (sticky) + z2 |= 1; + z1 = z0 >> (e - 32); + z0 = 0; + } else { + sticky = z3 | (z2 << 1) << (31 - e); + z2 = (z2 >> e) | ((z1 << 1) << (31 - e)); + if (sticky) + z2 |= 1; + z1 = (z1 >> e) | ((z0 << 1) << (31 - e)); + z0 >>= e; + } + ez = 1; + } else if (z0 >= 0x200000) { + /* carry out; shift right by one */ + sticky = (z2 & 1) | z3; + z2 = (z2 >> 1) | (z1 << 31); + if (sticky) + z2 |= 1; + z1 = (z1 >> 1) | (z0 << 31); + z0 >>= 1; + ez++; + } else { + if (z0 < 0x100000 && (z0 | z1 | z2 | z3) != 0) { + /* + * borrow/cancellation; shift left as much as + * exponent allows + */ + while (!(z0 | (z1 & 0xffe00000)) && ez >= 33) { + z0 = z1; + z1 = z2; + z2 = z3; + z3 = 0; + ez -= 32; + } + while (z0 < 0x100000 && ez > 1) { + z0 = (z0 << 1) | (z1 >> 31); + z1 = (z1 << 1) | (z2 >> 31); + z2 = (z2 << 1) | (z3 >> 31); + z3 <<= 1; + ez--; + } + } + if (z3) + z2 |= 1; + } + + /* get the rounding mode and clear current exceptions */ + rm = fsr >> 30; + fsr &= ~FSR_CEXC; + + /* strip off the integer bit, if there is one */ + ibit = z0 & 0x100000; + if (ibit) + z0 -= 0x100000; + else { + ez = 0; + if (!(z0 | z1 | z2)) { /* exact zero */ + zz.i[0] = rm == FSR_RM ? 0x80000000 : 0; + zz.i[1] = 0; + __fenv_setfsr32(&fsr); + return (zz.d); + } + } + + /* + * flip the sense of directed roundings if the result is negative; + * the logic below applies to a positive result + */ + if (sz) + rm ^= rm >> 1; + + /* round and raise exceptions */ + if (z2) { + fsr |= FSR_NXC; + + /* decide whether to round the fraction up */ + if (rm == FSR_RP || (rm == FSR_RN && (z2 > 0x80000000u || + (z2 == 0x80000000u && (z1 & 1))))) { + /* round up and renormalize if necessary */ + if (++z1 == 0) { + if (++z0 == 0x100000) { + z0 = 0; + ez++; + } + } + } + } + + /* check for under/overflow */ + if (ez >= 0x7ff) { + if (rm == FSR_RN || rm == FSR_RP) { + zz.i[0] = sz | 0x7ff00000; + zz.i[1] = 0; + } else { + zz.i[0] = sz | 0x7fefffff; + zz.i[1] = 0xffffffff; + } + fsr |= FSR_OFC | FSR_NXC; + } else { + zz.i[0] = sz | (ez << 20) | z0; + zz.i[1] = z1; + + /* + * !ibit => exact result was tiny before rounding, + * z2 nonzero => result delivered is inexact + */ + if (!ibit) { + if (z2) + fsr |= FSR_UFC | FSR_NXC; + else if (fsr & FSR_UFM) + fsr |= FSR_UFC; + } + } + + /* restore the fsr and emulate exceptions as needed */ + if ((fsr & FSR_CEXC) & (fsr >> 23)) { + __fenv_setfsr32(&fsr); + if (fsr & FSR_OFC) { + dummy = huge; + dummy *= huge; + } else if (fsr & FSR_UFC) { + dummy = tiny; + if (fsr & FSR_NXC) + dummy *= tiny; + else + dummy -= tiny2; + } else { + dummy = huge; + dummy += tiny; + } + } else { + fsr |= (fsr & 0x1f) << 5; + __fenv_setfsr32(&fsr); + } + return (zz.d); +} + +#elif defined(__x86) + +#if defined(__amd64) +#define NI 4 +#else +#define NI 3 +#endif + +/* + * fma for x86: 64-bit double precision, little-endian + */ +double +__fma(double x, double y, double z) { + union { + unsigned i[NI]; + long double e; + } xx, yy, zz; + long double xe, ye, xhi, xlo, yhi, ylo; + int ex, ey, ez; + unsigned cwsw, oldcwsw, rm; + + /* convert the operands to double extended */ + xx.e = (long double) x; + yy.e = (long double) y; + zz.e = (long double) z; + + /* extract the exponents of the arguments */ + ex = xx.i[2] & 0x7fff; + ey = yy.i[2] & 0x7fff; + ez = zz.i[2] & 0x7fff; + + /* dispense with inf, nan, and zero cases */ + if (ex == 0x7fff || ey == 0x7fff || ex == 0 || ey == 0) + /* x or y is inf, nan, or zero */ + return ((double) (xx.e * yy.e + zz.e)); + + if (ez >= 0x7fff) /* z is inf or nan */ + return ((double) (xx.e + zz.e)); + /* avoid spurious inexact in x * y */ + + /* + * save the control and status words, mask all exceptions, and + * set rounding to 64-bit precision and to-nearest + */ + __fenv_getcwsw(&oldcwsw); + cwsw = (oldcwsw & 0xf0c0ffff) | 0x033f0000; + __fenv_setcwsw(&cwsw); + + /* multiply x*y to 106 bits */ + xe = xx.e; + xx.i[0] = 0; + xhi = xx.e; /* hi 32 bits */ + xlo = xe - xhi; /* lo 21 bits */ + ye = yy.e; + yy.i[0] = 0; + yhi = yy.e; + ylo = ye - yhi; + xe = xe * ye; + ye = ((xhi * yhi - xe) + xhi * ylo + xlo * yhi) + xlo * ylo; + + /* distill the sum of xe, ye, and z */ + xhi = ye + zz.e; + yhi = xhi - ye; + xlo = (zz.e - yhi) + (ye - (xhi - yhi)); + /* now (xhi,xlo) = ye + z */ + + yhi = xe + xhi; + ye = yhi - xe; + ylo = (xhi - ye) + (xe - (yhi - ye)); /* now (yhi,ylo) = xe + xhi */ + + xhi = xlo + ylo; + xe = xhi - xlo; + xlo = (ylo - xe) + (xlo - (xhi - xe)); /* now (xhi,xlo) = xlo + ylo */ + + yy.e = yhi + xhi; + ylo = (yhi - yy.e) + xhi; /* now (yy.e,ylo) = xhi + yhi */ + + if (yy.i[1] != 0) { /* yy.e is nonzero */ + /* perturb yy.e if its least significant 10 bits are zero */ + if (!(yy.i[0] & 0x3ff)) { + xx.e = ylo + xlo; + if (xx.i[1] != 0) { + xx.i[2] = (xx.i[2] & 0x8000) | + ((yy.i[2] & 0x7fff) - 63); + xx.i[1] = 0x80000000; + xx.i[0] = 0; + yy.e += xx.e; + } + } + } else { + /* set sign of zero result according to rounding direction */ + rm = oldcwsw & 0x0c000000; + yy.i[2] = ((rm == FCW_RM)? 0x8000 : 0); + } + + /* + * restore the control and status words and convert the result + * to double + */ + __fenv_setcwsw(&oldcwsw); + return ((double) yy.e); +} + +#else +#error Unknown architecture +#endif |
