summaryrefslogtreecommitdiff
path: root/usr/src/lib/libmvec/common/__vrhypot.c
diff options
context:
space:
mode:
Diffstat (limited to 'usr/src/lib/libmvec/common/__vrhypot.c')
-rw-r--r--usr/src/lib/libmvec/common/__vrhypot.c431
1 files changed, 431 insertions, 0 deletions
diff --git a/usr/src/lib/libmvec/common/__vrhypot.c b/usr/src/lib/libmvec/common/__vrhypot.c
new file mode 100644
index 0000000000..dd5b7b6fba
--- /dev/null
+++ b/usr/src/lib/libmvec/common/__vrhypot.c
@@ -0,0 +1,431 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright 2011 Nexenta Systems, Inc. All rights reserved.
+ */
+/*
+ * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#include <sys/isa_defs.h>
+#include "libm_synonyms.h"
+#include "libm_inlines.h"
+
+#ifdef _LITTLE_ENDIAN
+#define HI(x) *(1+(int*)x)
+#define LO(x) *(unsigned*)x
+#else
+#define HI(x) *(int*)x
+#define LO(x) *(1+(unsigned*)x)
+#endif
+
+#ifdef __RESTRICT
+#define restrict _Restrict
+#else
+#define restrict
+#endif
+
+/* double rhypot(double x, double y)
+ *
+ * Method :
+ * 1. Special cases:
+ * x or y = Inf => 0
+ * x or y = NaN => QNaN
+ * x and y = 0 => Inf + divide-by-zero
+ * 2. Computes rhypot(x,y):
+ * rhypot(x,y) = m * sqrt(1/(xnm * xnm + ynm * ynm))
+ * Where:
+ * m = 1/max(|x|,|y|)
+ * xnm = x * m
+ * ynm = y * m
+ *
+ * Compute 1/(xnm * xnm + ynm * ynm) by simulating
+ * muti-precision arithmetic.
+ *
+ * Accuracy:
+ * Maximum error observed: less than 0.869 ulp after 1.000.000.000
+ * results.
+ */
+
+#define sqrt __sqrt
+
+extern double sqrt(double);
+
+extern double fabs(double);
+
+static const int __vlibm_TBL_rhypot[] = {
+/* i = [0,127]
+ * TBL[i] = 0x3ff00000 + *(int*)&(1.0 / *(double*)&(0x3ff0000000000000ULL + (i << 45))); */
+ 0x7fe00000, 0x7fdfc07f, 0x7fdf81f8, 0x7fdf4465,
+ 0x7fdf07c1, 0x7fdecc07, 0x7fde9131, 0x7fde573a,
+ 0x7fde1e1e, 0x7fdde5d6, 0x7fddae60, 0x7fdd77b6,
+ 0x7fdd41d4, 0x7fdd0cb5, 0x7fdcd856, 0x7fdca4b3,
+ 0x7fdc71c7, 0x7fdc3f8f, 0x7fdc0e07, 0x7fdbdd2b,
+ 0x7fdbacf9, 0x7fdb7d6c, 0x7fdb4e81, 0x7fdb2036,
+ 0x7fdaf286, 0x7fdac570, 0x7fda98ef, 0x7fda6d01,
+ 0x7fda41a4, 0x7fda16d3, 0x7fd9ec8e, 0x7fd9c2d1,
+ 0x7fd99999, 0x7fd970e4, 0x7fd948b0, 0x7fd920fb,
+ 0x7fd8f9c1, 0x7fd8d301, 0x7fd8acb9, 0x7fd886e5,
+ 0x7fd86186, 0x7fd83c97, 0x7fd81818, 0x7fd7f405,
+ 0x7fd7d05f, 0x7fd7ad22, 0x7fd78a4c, 0x7fd767dc,
+ 0x7fd745d1, 0x7fd72428, 0x7fd702e0, 0x7fd6e1f7,
+ 0x7fd6c16c, 0x7fd6a13c, 0x7fd68168, 0x7fd661ec,
+ 0x7fd642c8, 0x7fd623fa, 0x7fd60581, 0x7fd5e75b,
+ 0x7fd5c988, 0x7fd5ac05, 0x7fd58ed2, 0x7fd571ed,
+ 0x7fd55555, 0x7fd53909, 0x7fd51d07, 0x7fd50150,
+ 0x7fd4e5e0, 0x7fd4cab8, 0x7fd4afd6, 0x7fd49539,
+ 0x7fd47ae1, 0x7fd460cb, 0x7fd446f8, 0x7fd42d66,
+ 0x7fd41414, 0x7fd3fb01, 0x7fd3e22c, 0x7fd3c995,
+ 0x7fd3b13b, 0x7fd3991c, 0x7fd38138, 0x7fd3698d,
+ 0x7fd3521c, 0x7fd33ae4, 0x7fd323e3, 0x7fd30d19,
+ 0x7fd2f684, 0x7fd2e025, 0x7fd2c9fb, 0x7fd2b404,
+ 0x7fd29e41, 0x7fd288b0, 0x7fd27350, 0x7fd25e22,
+ 0x7fd24924, 0x7fd23456, 0x7fd21fb7, 0x7fd20b47,
+ 0x7fd1f704, 0x7fd1e2ef, 0x7fd1cf06, 0x7fd1bb4a,
+ 0x7fd1a7b9, 0x7fd19453, 0x7fd18118, 0x7fd16e06,
+ 0x7fd15b1e, 0x7fd1485f, 0x7fd135c8, 0x7fd12358,
+ 0x7fd11111, 0x7fd0fef0, 0x7fd0ecf5, 0x7fd0db20,
+ 0x7fd0c971, 0x7fd0b7e6, 0x7fd0a681, 0x7fd0953f,
+ 0x7fd08421, 0x7fd07326, 0x7fd0624d, 0x7fd05197,
+ 0x7fd04104, 0x7fd03091, 0x7fd02040, 0x7fd01010,
+};
+
+static const unsigned long long LCONST[] = {
+0x3ff0000000000000ULL, /* DONE = 1.0 */
+0x4000000000000000ULL, /* DTWO = 2.0 */
+0x4230000000000000ULL, /* D2ON36 = 2**36 */
+0x7fd0000000000000ULL, /* D2ON1022 = 2**1022 */
+0x3cb0000000000000ULL, /* D2ONM52 = 2**-52 */
+};
+
+#define RET_SC(I) \
+ px += stridex; \
+ py += stridey; \
+ pz += stridez; \
+ if (--n <= 0) \
+ break; \
+ goto start##I;
+
+#define RETURN(I, ret) \
+{ \
+ pz[0] = (ret); \
+ RET_SC(I) \
+}
+
+#define PREP(I) \
+hx##I = HI(px); \
+hy##I = HI(py); \
+hx##I &= 0x7fffffff; \
+hy##I &= 0x7fffffff; \
+pz##I = pz; \
+if (hx##I >= 0x7ff00000 || hy##I >= 0x7ff00000) /* |X| or |Y| = Inf,NaN */ \
+{ \
+ lx = LO(px); \
+ ly = LO(py); \
+ x = *px; \
+ y = *py; \
+ if (hx##I == 0x7ff00000 && lx == 0) res0 = 0.0; /* |X| = Inf */ \
+ else if (hy##I == 0x7ff00000 && ly == 0) res0 = 0.0; /* |Y| = Inf */ \
+ else res0 = fabs(x) + fabs(y); \
+ \
+ RETURN (I, res0) \
+} \
+x##I = *px; \
+y##I = *py; \
+diff0 = hy##I - hx##I; \
+j0 = diff0 >> 31; \
+if (hx##I < 0x00100000 && hy##I < 0x00100000) /* |X| and |Y| = subnormal or zero */ \
+{ \
+ lx = LO(px); \
+ ly = LO(py); \
+ x = x##I; \
+ y = y##I; \
+ \
+ if ((hx##I | hy##I | lx | ly) == 0) /* |X| and |Y| = 0 */ \
+ RETURN (I, DONE / 0.0) \
+ \
+ x = fabs(x); \
+ y = fabs(y); \
+ \
+ x = *(long long*)&x; \
+ y = *(long long*)&y; \
+ \
+ x *= D2ONM52; \
+ y *= D2ONM52; \
+ \
+ x_hi0 = (x + D2ON36) - D2ON36; \
+ y_hi0 = (y + D2ON36) - D2ON36; \
+ x_lo0 = x - x_hi0; \
+ y_lo0 = y - y_hi0; \
+ res0_hi = (x_hi0 * x_hi0 + y_hi0 * y_hi0); \
+ res0_lo = ((x + x_hi0) * x_lo0 + (y + y_hi0) * y_lo0); \
+ \
+ dres0 = res0_hi + res0_lo; \
+ \
+ iarr0 = HI(&dres0); \
+ iexp0 = iarr0 & 0xfff00000; \
+ \
+ iarr0 = (iarr0 >> 11) & 0x1fc; \
+ itbl0 = ((int*)((char*)__vlibm_TBL_rhypot + iarr0))[0]; \
+ itbl0 -= iexp0; \
+ HI(&dd0) = itbl0; \
+ LO(&dd0) = 0; \
+ \
+ dd0 = dd0 * (DTWO - dd0 * dres0); \
+ dd0 = dd0 * (DTWO - dd0 * dres0); \
+ dres0 = dd0 * (DTWO - dd0 * dres0); \
+ \
+ HI(&res0) = HI(&dres0) & 0xffffff00; \
+ LO(&res0) = 0; \
+ res0 += (DONE - res0_hi * res0 - res0_lo * res0) * dres0; \
+ res0 = sqrt (res0); \
+ \
+ res0 = D2ON1022 * res0; \
+ RETURN (I, res0) \
+} \
+j0 = hy##I - (diff0 & j0); \
+j0 &= 0x7ff00000; \
+HI(&scl##I) = 0x7ff00000 - j0;
+
+void
+__vrhypot(int n, double * restrict px, int stridex, double * restrict py,
+ int stridey, double * restrict pz, int stridez)
+{
+ int i = 0;
+ double x, y;
+ double x_hi0, x_lo0, y_hi0, y_lo0, scl0 = 0;
+ double x0, y0, res0, dd0;
+ double res0_hi,res0_lo, dres0;
+ double x_hi1, x_lo1, y_hi1, y_lo1, scl1 = 0;
+ double x1 = 0.0L, y1 = 0.0L, res1, dd1;
+ double res1_hi,res1_lo, dres1;
+ double x_hi2, x_lo2, y_hi2, y_lo2, scl2 = 0;
+ double x2, y2, res2, dd2;
+ double res2_hi,res2_lo, dres2;
+
+ int hx0, hy0, j0, diff0;
+ int iarr0, iexp0, itbl0;
+ int hx1, hy1;
+ int iarr1, iexp1, itbl1;
+ int hx2, hy2;
+ int iarr2, iexp2, itbl2;
+
+ int lx, ly;
+
+ double DONE = ((double*)LCONST)[0];
+ double DTWO = ((double*)LCONST)[1];
+ double D2ON36 = ((double*)LCONST)[2];
+ double D2ON1022 = ((double*)LCONST)[3];
+ double D2ONM52 = ((double*)LCONST)[4];
+
+ double *pz0, *pz1 = 0, *pz2;
+
+ do
+ {
+start0:
+ PREP(0)
+ px += stridex;
+ py += stridey;
+ pz += stridez;
+ i = 1;
+ if (--n <= 0)
+ break;
+
+start1:
+ PREP(1)
+ px += stridex;
+ py += stridey;
+ pz += stridez;
+ i = 2;
+ if (--n <= 0)
+ break;
+
+start2:
+ PREP(2)
+
+ x0 *= scl0;
+ y0 *= scl0;
+ x1 *= scl1;
+ y1 *= scl1;
+ x2 *= scl2;
+ y2 *= scl2;
+
+ x_hi0 = (x0 + D2ON36) - D2ON36;
+ y_hi0 = (y0 + D2ON36) - D2ON36;
+ x_hi1 = (x1 + D2ON36) - D2ON36;
+ y_hi1 = (y1 + D2ON36) - D2ON36;
+ x_hi2 = (x2 + D2ON36) - D2ON36;
+ y_hi2 = (y2 + D2ON36) - D2ON36;
+ x_lo0 = x0 - x_hi0;
+ y_lo0 = y0 - y_hi0;
+ x_lo1 = x1 - x_hi1;
+ y_lo1 = y1 - y_hi1;
+ x_lo2 = x2 - x_hi2;
+ y_lo2 = y2 - y_hi2;
+ res0_hi = (x_hi0 * x_hi0 + y_hi0 * y_hi0);
+ res1_hi = (x_hi1 * x_hi1 + y_hi1 * y_hi1);
+ res2_hi = (x_hi2 * x_hi2 + y_hi2 * y_hi2);
+ res0_lo = ((x0 + x_hi0) * x_lo0 + (y0 + y_hi0) * y_lo0);
+ res1_lo = ((x1 + x_hi1) * x_lo1 + (y1 + y_hi1) * y_lo1);
+ res2_lo = ((x2 + x_hi2) * x_lo2 + (y2 + y_hi2) * y_lo2);
+
+ dres0 = res0_hi + res0_lo;
+ dres1 = res1_hi + res1_lo;
+ dres2 = res2_hi + res2_lo;
+
+ iarr0 = HI(&dres0);
+ iarr1 = HI(&dres1);
+ iarr2 = HI(&dres2);
+ iexp0 = iarr0 & 0xfff00000;
+ iexp1 = iarr1 & 0xfff00000;
+ iexp2 = iarr2 & 0xfff00000;
+
+ iarr0 = (iarr0 >> 11) & 0x1fc;
+ iarr1 = (iarr1 >> 11) & 0x1fc;
+ iarr2 = (iarr2 >> 11) & 0x1fc;
+ itbl0 = ((int*)((char*)__vlibm_TBL_rhypot + iarr0))[0];
+ itbl1 = ((int*)((char*)__vlibm_TBL_rhypot + iarr1))[0];
+ itbl2 = ((int*)((char*)__vlibm_TBL_rhypot + iarr2))[0];
+ itbl0 -= iexp0;
+ itbl1 -= iexp1;
+ itbl2 -= iexp2;
+ HI(&dd0) = itbl0;
+ HI(&dd1) = itbl1;
+ HI(&dd2) = itbl2;
+ LO(&dd0) = 0;
+ LO(&dd1) = 0;
+ LO(&dd2) = 0;
+
+ dd0 = dd0 * (DTWO - dd0 * dres0);
+ dd1 = dd1 * (DTWO - dd1 * dres1);
+ dd2 = dd2 * (DTWO - dd2 * dres2);
+ dd0 = dd0 * (DTWO - dd0 * dres0);
+ dd1 = dd1 * (DTWO - dd1 * dres1);
+ dd2 = dd2 * (DTWO - dd2 * dres2);
+ dres0 = dd0 * (DTWO - dd0 * dres0);
+ dres1 = dd1 * (DTWO - dd1 * dres1);
+ dres2 = dd2 * (DTWO - dd2 * dres2);
+
+ HI(&res0) = HI(&dres0) & 0xffffff00;
+ HI(&res1) = HI(&dres1) & 0xffffff00;
+ HI(&res2) = HI(&dres2) & 0xffffff00;
+ LO(&res0) = 0;
+ LO(&res1) = 0;
+ LO(&res2) = 0;
+ res0 += (DONE - res0_hi * res0 - res0_lo * res0) * dres0;
+ res1 += (DONE - res1_hi * res1 - res1_lo * res1) * dres1;
+ res2 += (DONE - res2_hi * res2 - res2_lo * res2) * dres2;
+ res0 = sqrt (res0);
+ res1 = sqrt (res1);
+ res2 = sqrt (res2);
+
+ res0 = scl0 * res0;
+ res1 = scl1 * res1;
+ res2 = scl2 * res2;
+
+ *pz0 = res0;
+ *pz1 = res1;
+ *pz2 = res2;
+
+ px += stridex;
+ py += stridey;
+ pz += stridez;
+ i = 0;
+
+ } while (--n > 0);
+
+ if (i > 0)
+ {
+ x0 *= scl0;
+ y0 *= scl0;
+
+ x_hi0 = (x0 + D2ON36) - D2ON36;
+ y_hi0 = (y0 + D2ON36) - D2ON36;
+ x_lo0 = x0 - x_hi0;
+ y_lo0 = y0 - y_hi0;
+ res0_hi = (x_hi0 * x_hi0 + y_hi0 * y_hi0);
+ res0_lo = ((x0 + x_hi0) * x_lo0 + (y0 + y_hi0) * y_lo0);
+
+ dres0 = res0_hi + res0_lo;
+
+ iarr0 = HI(&dres0);
+ iexp0 = iarr0 & 0xfff00000;
+
+ iarr0 = (iarr0 >> 11) & 0x1fc;
+ itbl0 = ((int*)((char*)__vlibm_TBL_rhypot + iarr0))[0];
+ itbl0 -= iexp0;
+ HI(&dd0) = itbl0;
+ LO(&dd0) = 0;
+
+ dd0 = dd0 * (DTWO - dd0 * dres0);
+ dd0 = dd0 * (DTWO - dd0 * dres0);
+ dres0 = dd0 * (DTWO - dd0 * dres0);
+
+ HI(&res0) = HI(&dres0) & 0xffffff00;
+ LO(&res0) = 0;
+ res0 += (DONE - res0_hi * res0 - res0_lo * res0) * dres0;
+ res0 = sqrt (res0);
+
+ res0 = scl0 * res0;
+
+ *pz0 = res0;
+
+ if (i > 1)
+ {
+ x1 *= scl1;
+ y1 *= scl1;
+
+ x_hi1 = (x1 + D2ON36) - D2ON36;
+ y_hi1 = (y1 + D2ON36) - D2ON36;
+ x_lo1 = x1 - x_hi1;
+ y_lo1 = y1 - y_hi1;
+ res1_hi = (x_hi1 * x_hi1 + y_hi1 * y_hi1);
+ res1_lo = ((x1 + x_hi1) * x_lo1 + (y1 + y_hi1) * y_lo1);
+
+ dres1 = res1_hi + res1_lo;
+
+ iarr1 = HI(&dres1);
+ iexp1 = iarr1 & 0xfff00000;
+
+ iarr1 = (iarr1 >> 11) & 0x1fc;
+ itbl1 = ((int*)((char*)__vlibm_TBL_rhypot + iarr1))[0];
+ itbl1 -= iexp1;
+ HI(&dd1) = itbl1;
+ LO(&dd1) = 0;
+
+ dd1 = dd1 * (DTWO - dd1 * dres1);
+ dd1 = dd1 * (DTWO - dd1 * dres1);
+ dres1 = dd1 * (DTWO - dd1 * dres1);
+
+ HI(&res1) = HI(&dres1) & 0xffffff00;
+ LO(&res1) = 0;
+ res1 += (DONE - res1_hi * res1 - res1_lo * res1) * dres1;
+ res1 = sqrt (res1);
+
+ res1 = scl1 * res1;
+
+ *pz1 = res1;
+ }
+ }
+}
+