diff options
Diffstat (limited to 'usr/src/lib/libmvec/common/__vrsqrt.c')
-rw-r--r-- | usr/src/lib/libmvec/common/__vrsqrt.c | 415 |
1 files changed, 415 insertions, 0 deletions
diff --git a/usr/src/lib/libmvec/common/__vrsqrt.c b/usr/src/lib/libmvec/common/__vrsqrt.c new file mode 100644 index 0000000000..6fb9cd7414 --- /dev/null +++ b/usr/src/lib/libmvec/common/__vrsqrt.c @@ -0,0 +1,415 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright 2011 Nexenta Systems, Inc. All rights reserved. + */ +/* + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#include <sys/isa_defs.h> +#include "libm_synonyms.h" +#include "libm_inlines.h" + +#ifdef _LITTLE_ENDIAN +#define HI(x) *(1+(int*)x) +#define LO(x) *(unsigned*)x +#else +#define HI(x) *(int*)x +#define LO(x) *(1+(unsigned*)x) +#endif + +#ifdef __RESTRICT +#define restrict _Restrict +#else +#define restrict +#endif + +/* double rsqrt(double x) + * + * Method : + * 1. Special cases: + * for x = NaN => QNaN; + * for x = +Inf => 0; + * for x is negative, -Inf => QNaN + invalid; + * for x = +0 => +Inf + divide-by-zero; + * for x = -0 => -Inf + divide-by-zero. + * 2. Computes reciprocal square root from: + * x = m * 2**n + * Where: + * m = [0.5, 2), + * n = ((exponent + 1) & ~1). + * Then: + * rsqrt(x) = 1/sqrt( m * 2**n ) = (2 ** (-n/2)) * (1/sqrt(m)) + * 2. Computes 1/sqrt(m) from: + * 1/sqrt(m) = (1/sqrt(m0)) * (1/sqrt(1 + (1/m0)*dm)) + * Where: + * m = m0 + dm, + * m0 = 0.5 * (1 + k/64) for m = [0.5, 0.5+127/256), k = [0, 63]; + * m0 = 1.0 * (0 + k/64) for m = [0.5+127/256, 1.0+127/128), k = [64, 127]; + * m0 = 2.0 for m = [1.0+127/128, 2.0), k = 128. + * Then: + * 1/sqrt(m0) is looked up in a table, + * 1/m0 is computed as (1/sqrt(m0)) * (1/sqrt(m0)). + * 1/sqrt(1 + (1/m0)*dm) is computed using approximation: + * 1/sqrt(1 + z) = (((((a6 * z + a5) * z + a4) * z + a3) + * * z + a2) * z + a1) * z + a0 + * where z = [-1/128, 1/128]. + * + * Accuracy: + * The maximum relative error for the approximating + * polynomial is 2**(-56.26). + * Maximum error observed: less than 0.563 ulp after 1.500.000.000 + * results. + */ + +#define sqrt __sqrt + +extern double sqrt (double); +extern const double __vlibm_TBL_rsqrt[]; + +static void +__vrsqrt_n(int n, double * restrict px, int stridex, double * restrict py, int stridey); + +#pragma no_inline(__vrsqrt_n) + +#define RETURN(ret) \ +{ \ + *py = (ret); \ + py += stridey; \ + if (n_n == 0) \ + { \ + spx = px; spy = py; \ + hx = HI(px); \ + continue; \ + } \ + n--; \ + break; \ +} + +static const double + DONE = 1.0, + K1 = -5.00000000000005209867e-01, + K2 = 3.75000000000004884257e-01, + K3 = -3.12499999317136886551e-01, + K4 = 2.73437499359815081532e-01, + K5 = -2.46116125605037803130e-01, + K6 = 2.25606914648617522896e-01; + +void +__vrsqrt(int n, double * restrict px, int stridex, double * restrict py, int stridey) +{ + double *spx, *spy; + int ax, lx, hx, n_n; + double res; + + while (n > 1) + { + n_n = 0; + spx = px; + spy = py; + hx = HI(px); + for (; n > 1 ; n--) + { + px += stridex; + if (hx >= 0x7ff00000) /* X = NaN or Inf */ + { + res = *(px - stridex); + RETURN (DONE / res) + } + + py += stridey; + + if (hx < 0x00100000) /* X = denormal, zero or negative */ + { + py -= stridey; + ax = hx & 0x7fffffff; + lx = LO((px - stridex)); + res = *(px - stridex); + + if ((ax | lx) == 0) /* |X| = zero */ + { + RETURN (DONE / res) + } + else if (hx >= 0) /* X = denormal */ + { + double res_c0, dsqrt_exp0; + int ind0, sqrt_exp0; + double xx0, dexp_hi0, dexp_lo0; + int hx0, resh0, res_ch0; + + res = *(long long*)&res; + + hx0 = HI(&res); + sqrt_exp0 = (0x817 - (hx0 >> 21)) << 20; + ind0 = (((hx0 >> 10) & 0x7f8) + 8) & -16; + + resh0 = (hx0 & 0x001fffff) | 0x3fe00000; + res_ch0 = (resh0 + 0x00002000) & 0x7fffc000; + HI(&res) = resh0; + HI(&res_c0) = res_ch0; + LO(&res_c0) = 0; + + dexp_hi0 = ((double*)((char*)__vlibm_TBL_rsqrt + ind0))[0]; + dexp_lo0 = ((double*)((char*)__vlibm_TBL_rsqrt + ind0))[1]; + xx0 = dexp_hi0 * dexp_hi0; + xx0 = (res - res_c0) * xx0; + res = (((((K6 * xx0 + K5) * xx0 + K4) * xx0 + K3) * xx0 + K2) * xx0 + K1) * xx0; + + res = dexp_hi0 * res + dexp_lo0 + dexp_hi0; + + HI(&dsqrt_exp0) = sqrt_exp0; + LO(&dsqrt_exp0) = 0; + res *= dsqrt_exp0; + + RETURN (res) + } + else /* X = negative */ + { + RETURN (sqrt(res)) + } + } + n_n++; + hx = HI(px); + } + if (n_n > 0) + __vrsqrt_n(n_n, spx, stridex, spy, stridey); + } + if (n > 0) + { + hx = HI(px); + + if (hx >= 0x7ff00000) /* X = NaN or Inf */ + { + res = *px; + *py = DONE / res; + } + else if (hx < 0x00100000) /* X = denormal, zero or negative */ + { + ax = hx & 0x7fffffff; + lx = LO(px); + res = *px; + + if ((ax | lx) == 0) /* |X| = zero */ + { + *py = DONE / res; + } + else if (hx >= 0) /* X = denormal */ + { + double res_c0, dsqrt_exp0; + int ind0, sqrt_exp0; + double xx0, dexp_hi0, dexp_lo0; + int hx0, resh0, res_ch0; + + res = *(long long*)&res; + + hx0 = HI(&res); + sqrt_exp0 = (0x817 - (hx0 >> 21)) << 20; + ind0 = (((hx0 >> 10) & 0x7f8) + 8) & -16; + + resh0 = (hx0 & 0x001fffff) | 0x3fe00000; + res_ch0 = (resh0 + 0x00002000) & 0x7fffc000; + HI(&res) = resh0; + HI(&res_c0) = res_ch0; + LO(&res_c0) = 0; + + dexp_hi0 = ((double*)((char*)__vlibm_TBL_rsqrt + ind0))[0]; + dexp_lo0 = ((double*)((char*)__vlibm_TBL_rsqrt + ind0))[1]; + xx0 = dexp_hi0 * dexp_hi0; + xx0 = (res - res_c0) * xx0; + res = (((((K6 * xx0 + K5) * xx0 + K4) * xx0 + K3) * xx0 + K2) * xx0 + K1) * xx0; + + res = dexp_hi0 * res + dexp_lo0 + dexp_hi0; + + HI(&dsqrt_exp0) = sqrt_exp0; + LO(&dsqrt_exp0) = 0; + res *= dsqrt_exp0; + + *py = res; + } + else /* X = negative */ + { + *py = sqrt(res); + } + } + else + { + double res_c0, dsqrt_exp0; + int ind0, sqrt_exp0; + double xx0, dexp_hi0, dexp_lo0; + int resh0, res_ch0; + + sqrt_exp0 = (0x5fe - (hx >> 21)) << 20; + ind0 = (((hx >> 10) & 0x7f8) + 8) & -16; + + resh0 = (hx & 0x001fffff) | 0x3fe00000; + res_ch0 = (resh0 + 0x00002000) & 0x7fffc000; + HI(&res) = resh0; + LO(&res) = LO(px); + HI(&res_c0) = res_ch0; + LO(&res_c0) = 0; + + dexp_hi0 = ((double*)((char*)__vlibm_TBL_rsqrt + ind0))[0]; + dexp_lo0 = ((double*)((char*)__vlibm_TBL_rsqrt + ind0))[1]; + xx0 = dexp_hi0 * dexp_hi0; + xx0 = (res - res_c0) * xx0; + res = (((((K6 * xx0 + K5) * xx0 + K4) * xx0 + K3) * xx0 + K2) * xx0 + K1) * xx0; + + res = dexp_hi0 * res + dexp_lo0 + dexp_hi0; + + HI(&dsqrt_exp0) = sqrt_exp0; + LO(&dsqrt_exp0) = 0; + res *= dsqrt_exp0; + + *py = res; + } + } +} + +static void +__vrsqrt_n(int n, double * restrict px, int stridex, double * restrict py, int stridey) +{ + double res0, res_c0, dsqrt_exp0; + double res1, res_c1, dsqrt_exp1; + double res2, res_c2, dsqrt_exp2; + int ind0, sqrt_exp0; + int ind1, sqrt_exp1; + int ind2, sqrt_exp2; + double xx0, dexp_hi0, dexp_lo0; + double xx1, dexp_hi1, dexp_lo1; + double xx2, dexp_hi2, dexp_lo2; + int hx0, resh0, res_ch0; + int hx1, resh1, res_ch1; + int hx2, resh2, res_ch2; + + LO(&dsqrt_exp0) = 0; + LO(&dsqrt_exp1) = 0; + LO(&dsqrt_exp2) = 0; + LO(&res_c0) = 0; + LO(&res_c1) = 0; + LO(&res_c2) = 0; + + for(; n > 2 ; n -= 3) + { + hx0 = HI(px); + LO(&res0) = LO(px); + px += stridex; + + hx1 = HI(px); + LO(&res1) = LO(px); + px += stridex; + + hx2 = HI(px); + LO(&res2) = LO(px); + px += stridex; + + sqrt_exp0 = (0x5fe - (hx0 >> 21)) << 20; + sqrt_exp1 = (0x5fe - (hx1 >> 21)) << 20; + sqrt_exp2 = (0x5fe - (hx2 >> 21)) << 20; + ind0 = (((hx0 >> 10) & 0x7f8) + 8) & -16; + ind1 = (((hx1 >> 10) & 0x7f8) + 8) & -16; + ind2 = (((hx2 >> 10) & 0x7f8) + 8) & -16; + + resh0 = (hx0 & 0x001fffff) | 0x3fe00000; + resh1 = (hx1 & 0x001fffff) | 0x3fe00000; + resh2 = (hx2 & 0x001fffff) | 0x3fe00000; + res_ch0 = (resh0 + 0x00002000) & 0x7fffc000; + res_ch1 = (resh1 + 0x00002000) & 0x7fffc000; + res_ch2 = (resh2 + 0x00002000) & 0x7fffc000; + HI(&res0) = resh0; + HI(&res1) = resh1; + HI(&res2) = resh2; + HI(&res_c0) = res_ch0; + HI(&res_c1) = res_ch1; + HI(&res_c2) = res_ch2; + + dexp_hi0 = ((double*)((char*)__vlibm_TBL_rsqrt + ind0))[0]; + dexp_hi1 = ((double*)((char*)__vlibm_TBL_rsqrt + ind1))[0]; + dexp_hi2 = ((double*)((char*)__vlibm_TBL_rsqrt + ind2))[0]; + dexp_lo0 = ((double*)((char*)__vlibm_TBL_rsqrt + ind0))[1]; + dexp_lo1 = ((double*)((char*)__vlibm_TBL_rsqrt + ind1))[1]; + dexp_lo2 = ((double*)((char*)__vlibm_TBL_rsqrt + ind2))[1]; + xx0 = dexp_hi0 * dexp_hi0; + xx1 = dexp_hi1 * dexp_hi1; + xx2 = dexp_hi2 * dexp_hi2; + xx0 = (res0 - res_c0) * xx0; + xx1 = (res1 - res_c1) * xx1; + xx2 = (res2 - res_c2) * xx2; + res0 = (((((K6 * xx0 + K5) * xx0 + K4) * xx0 + K3) * xx0 + K2) * xx0 + K1) * xx0; + res1 = (((((K6 * xx1 + K5) * xx1 + K4) * xx1 + K3) * xx1 + K2) * xx1 + K1) * xx1; + res2 = (((((K6 * xx2 + K5) * xx2 + K4) * xx2 + K3) * xx2 + K2) * xx2 + K1) * xx2; + + res0 = dexp_hi0 * res0 + dexp_lo0 + dexp_hi0; + res1 = dexp_hi1 * res1 + dexp_lo1 + dexp_hi1; + res2 = dexp_hi2 * res2 + dexp_lo2 + dexp_hi2; + + HI(&dsqrt_exp0) = sqrt_exp0; + HI(&dsqrt_exp1) = sqrt_exp1; + HI(&dsqrt_exp2) = sqrt_exp2; + res0 *= dsqrt_exp0; + res1 *= dsqrt_exp1; + res2 *= dsqrt_exp2; + + *py = res0; + py += stridey; + + *py = res1; + py += stridey; + + *py = res2; + py += stridey; + } + + for(; n > 0 ; n--) + { + hx0 = HI(px); + + sqrt_exp0 = (0x5fe - (hx0 >> 21)) << 20; + ind0 = (((hx0 >> 10) & 0x7f8) + 8) & -16; + + resh0 = (hx0 & 0x001fffff) | 0x3fe00000; + res_ch0 = (resh0 + 0x00002000) & 0x7fffc000; + HI(&res0) = resh0; + LO(&res0) = LO(px); + HI(&res_c0) = res_ch0; + LO(&res_c0) = 0; + + px += stridex; + + dexp_hi0 = ((double*)((char*)__vlibm_TBL_rsqrt + ind0))[0]; + dexp_lo0 = ((double*)((char*)__vlibm_TBL_rsqrt + ind0))[1]; + xx0 = dexp_hi0 * dexp_hi0; + xx0 = (res0 - res_c0) * xx0; + res0 = (((((K6 * xx0 + K5) * xx0 + K4) * xx0 + K3) * xx0 + K2) * xx0 + K1) * xx0; + + res0 = dexp_hi0 * res0 + dexp_lo0 + dexp_hi0; + + HI(&dsqrt_exp0) = sqrt_exp0; + LO(&dsqrt_exp0) = 0; + res0 *= dsqrt_exp0; + + *py = res0; + py += stridey; + } +} + |