1 files changed, 415 insertions, 0 deletions
diff --git a/usr/src/lib/libmvec/common/__vrsqrt.c b/usr/src/lib/libmvec/common/__vrsqrt.c
new file mode 100644
index 0000000000..6fb9cd7414
--- /dev/null
+++ b/usr/src/lib/libmvec/common/__vrsqrt.c
@@ -0,0 +1,415 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright 2011 Nexenta Systems, Inc.  All rights reserved.
+ */
+/*
+ * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#include <sys/isa_defs.h>
+#include "libm_synonyms.h"
+#include "libm_inlines.h"
+
+#ifdef _LITTLE_ENDIAN
+#define HI(x)	*(1+(int*)x)
+#define LO(x)	*(unsigned*)x
+#else
+#define HI(x)	*(int*)x
+#define LO(x)	*(1+(unsigned*)x)
+#endif
+
+#ifdef __RESTRICT
+#define restrict _Restrict
+#else
+#define restrict
+#endif
+
+/* double rsqrt(double x)
+ *
+ * Method :
+ *	1. Special cases:
+ *		for x = NaN				=> QNaN;
+ *		for x = +Inf				=> 0;
+ *		for x is negative, -Inf			=> QNaN + invalid;
+ *		for x = +0				=> +Inf + divide-by-zero;
+ *		for x = -0				=> -Inf + divide-by-zero.
+ *	2. Computes reciprocal square root from:
+ *		x = m * 2**n
+ *	Where:
+ *		m = [0.5, 2),
+ *		n = ((exponent + 1) & ~1).
+ *	Then:
+ *		rsqrt(x) = 1/sqrt( m * 2**n ) = (2 ** (-n/2)) * (1/sqrt(m))
+ *	2. Computes 1/sqrt(m) from:
+ *		1/sqrt(m) = (1/sqrt(m0)) * (1/sqrt(1 + (1/m0)*dm))
+ *	Where:
+ *		m = m0 + dm,
+ *		m0 = 0.5 * (1 + k/64) for m = [0.5,         0.5+127/256), k = [0, 63];
+ *		m0 = 1.0 * (0 + k/64) for m = [0.5+127/256, 1.0+127/128), k = [64, 127];
+ *		m0 = 2.0              for m = [1.0+127/128, 2.0),         k = 128.
+ *	Then:
+ *		1/sqrt(m0) is looked up in a table,
+ *		1/m0 is computed as (1/sqrt(m0)) * (1/sqrt(m0)).
+ *		1/sqrt(1 + (1/m0)*dm) is computed using approximation:
+ *			1/sqrt(1 + z) = (((((a6 * z + a5) * z + a4) * z + a3)
+ *						* z + a2) * z + a1) * z + a0
+ *			where z = [-1/128, 1/128].
+ *
+ * Accuracy:
+ *	The maximum relative error for the approximating
+ *	polynomial is 2**(-56.26).
+ *	Maximum error observed: less than 0.563 ulp after 1.500.000.000
+ *	results.
+ */
+
+#define sqrt __sqrt
+
+extern double sqrt (double);
+extern const double __vlibm_TBL_rsqrt[];
+
+static void
+__vrsqrt_n(int n, double * restrict px, int stridex, double * restrict py, int stridey);
+
+#pragma no_inline(__vrsqrt_n)
+
+#define RETURN(ret)						\
+{								\
+	*py = (ret);						\
+	py += stridey;						\
+	if (n_n == 0)						\
+	{							\
+		spx = px; spy = py;				\
+		hx = HI(px);					\
+		continue;					\
+	}							\
+	n--;							\
+	break;							\
+}
+
+static const double
+	DONE = 1.0,
+	K1 = -5.00000000000005209867e-01,
+	K2 =  3.75000000000004884257e-01,
+	K3 = -3.12499999317136886551e-01,
+	K4 =  2.73437499359815081532e-01,
+	K5 = -2.46116125605037803130e-01,
+	K6 =  2.25606914648617522896e-01;
+
+void
+__vrsqrt(int n, double * restrict px, int stridex, double * restrict py, int stridey)
+{
+	double		*spx, *spy;
+	int		ax, lx, hx, n_n;
+	double		res;
+
+	while (n > 1)
+	{
+		n_n = 0;
+		spx = px;
+		spy = py;
+		hx = HI(px);
+		for (; n > 1 ; n--)
+		{
+			px += stridex;
+			if (hx >= 0x7ff00000)		/* X = NaN or Inf	*/
+			{
+				res = *(px - stridex);
+				RETURN (DONE / res)
+			}
+
+			py += stridey;
+
+			if (hx < 0x00100000)		/* X = denormal, zero or negative	*/
+			{
+				py -= stridey;
+				ax = hx & 0x7fffffff;
+				lx = LO((px - stridex));
+				res = *(px - stridex);
+
+				if ((ax | lx) == 0)	/* |X| = zero	*/
+				{
+					RETURN (DONE / res)
+				}
+				else if (hx >= 0)	/* X = denormal	*/
+				{
+					double		res_c0, dsqrt_exp0;
+					int		ind0, sqrt_exp0;
+					double		xx0, dexp_hi0, dexp_lo0;
+					int		hx0, resh0, res_ch0;
+
+					res = *(long long*)&res;
+
+					hx0 = HI(&res);
+					sqrt_exp0 = (0x817 - (hx0 >> 21)) << 20;
+					ind0 = (((hx0 >> 10) & 0x7f8) + 8) & -16;
+
+					resh0 = (hx0 & 0x001fffff) | 0x3fe00000;
+					res_ch0 = (resh0 + 0x00002000) & 0x7fffc000;
+					HI(&res) = resh0;
+					HI(&res_c0) = res_ch0;
+					LO(&res_c0) = 0;
+
+					dexp_hi0 = ((double*)((char*)__vlibm_TBL_rsqrt + ind0))[0];
+					dexp_lo0 = ((double*)((char*)__vlibm_TBL_rsqrt + ind0))[1];
+					xx0 = dexp_hi0 * dexp_hi0;
+					xx0 = (res - res_c0) * xx0;
+					res = (((((K6 * xx0 + K5) * xx0 + K4) * xx0 + K3) * xx0 + K2) * xx0 + K1) * xx0;
+
+					res = dexp_hi0 * res + dexp_lo0 + dexp_hi0;
+
+					HI(&dsqrt_exp0) = sqrt_exp0;
+					LO(&dsqrt_exp0) = 0;
+					res *= dsqrt_exp0;
+
+					RETURN (res)
+				}
+				else	/* X = negative	*/
+				{
+					RETURN (sqrt(res))
+				}
+			}
+			n_n++;
+			hx = HI(px);
+		}
+		if (n_n > 0)
+			__vrsqrt_n(n_n, spx, stridex, spy, stridey);
+	}
+	if (n > 0)
+	{
+		hx = HI(px);
+
+		if (hx >= 0x7ff00000)		/* X = NaN or Inf	*/
+		{
+			res = *px;
+			*py = DONE / res;
+		}
+		else if (hx < 0x00100000)	/* X = denormal, zero or negative	*/
+		{
+			ax = hx & 0x7fffffff;
+			lx = LO(px);
+			res = *px;
+
+			if ((ax | lx) == 0)	/* |X| = zero	*/
+			{
+				*py = DONE / res;
+			}
+			else if (hx >= 0)	/* X = denormal	*/
+			{
+				double		res_c0, dsqrt_exp0;
+				int		ind0, sqrt_exp0;
+				double		xx0, dexp_hi0, dexp_lo0;
+				int		hx0, resh0, res_ch0;
+
+				res = *(long long*)&res;
+
+				hx0 = HI(&res);
+				sqrt_exp0 = (0x817 - (hx0 >> 21)) << 20;
+				ind0 = (((hx0 >> 10) & 0x7f8) + 8) & -16;
+
+				resh0 = (hx0 & 0x001fffff) | 0x3fe00000;
+				res_ch0 = (resh0 + 0x00002000) & 0x7fffc000;
+				HI(&res) = resh0;
+				HI(&res_c0) = res_ch0;
+				LO(&res_c0) = 0;
+
+				dexp_hi0 = ((double*)((char*)__vlibm_TBL_rsqrt + ind0))[0];
+				dexp_lo0 = ((double*)((char*)__vlibm_TBL_rsqrt + ind0))[1];
+				xx0 = dexp_hi0 * dexp_hi0;
+				xx0 = (res - res_c0) * xx0;
+				res = (((((K6 * xx0 + K5) * xx0 + K4) * xx0 + K3) * xx0 + K2) * xx0 + K1) * xx0;
+
+				res = dexp_hi0 * res + dexp_lo0 + dexp_hi0;
+
+				HI(&dsqrt_exp0) = sqrt_exp0;
+				LO(&dsqrt_exp0) = 0;
+				res *= dsqrt_exp0;
+
+				*py = res;
+			}
+			else	/* X = negative	*/
+			{
+				*py = sqrt(res);
+			}
+		}
+		else
+		{
+			double		res_c0, dsqrt_exp0;
+			int		ind0, sqrt_exp0;
+			double		xx0, dexp_hi0, dexp_lo0;
+			int		resh0, res_ch0;
+
+			sqrt_exp0 = (0x5fe - (hx >> 21)) << 20;
+			ind0 = (((hx >> 10) & 0x7f8) + 8) & -16;
+
+			resh0 = (hx & 0x001fffff) | 0x3fe00000;
+			res_ch0 = (resh0 + 0x00002000) & 0x7fffc000;
+			HI(&res) = resh0;
+			LO(&res) = LO(px);
+			HI(&res_c0) = res_ch0;
+			LO(&res_c0) = 0;
+
+			dexp_hi0 = ((double*)((char*)__vlibm_TBL_rsqrt + ind0))[0];
+			dexp_lo0 = ((double*)((char*)__vlibm_TBL_rsqrt + ind0))[1];
+			xx0 = dexp_hi0 * dexp_hi0;
+			xx0 = (res - res_c0) * xx0;
+			res = (((((K6 * xx0 + K5) * xx0 + K4) * xx0 + K3) * xx0 + K2) * xx0 + K1) * xx0;
+
+			res = dexp_hi0 * res + dexp_lo0 + dexp_hi0;
+
+			HI(&dsqrt_exp0) = sqrt_exp0;
+			LO(&dsqrt_exp0) = 0;
+			res *= dsqrt_exp0;
+
+			*py = res;
+		}
+	}
+}
+
+static void
+__vrsqrt_n(int n, double * restrict px, int stridex, double * restrict py, int stridey)
+{
+	double		res0, res_c0, dsqrt_exp0;
+	double		res1, res_c1, dsqrt_exp1;
+	double		res2, res_c2, dsqrt_exp2;
+	int		ind0, sqrt_exp0;
+	int		ind1, sqrt_exp1;
+	int		ind2, sqrt_exp2;
+	double		xx0, dexp_hi0, dexp_lo0;
+	double		xx1, dexp_hi1, dexp_lo1;
+	double		xx2, dexp_hi2, dexp_lo2;
+	int		hx0, resh0, res_ch0;
+	int		hx1, resh1, res_ch1;
+	int		hx2, resh2, res_ch2;
+
+	LO(&dsqrt_exp0) = 0;
+	LO(&dsqrt_exp1) = 0;
+	LO(&dsqrt_exp2) = 0;
+	LO(&res_c0) = 0;
+	LO(&res_c1) = 0;
+	LO(&res_c2) = 0;
+
+	for(; n > 2 ; n -= 3)
+	{
+		hx0 = HI(px);
+		LO(&res0) = LO(px);
+		px += stridex;
+
+		hx1 = HI(px);
+		LO(&res1) = LO(px);
+		px += stridex;
+
+		hx2 = HI(px);
+		LO(&res2) = LO(px);
+		px += stridex;
+
+		sqrt_exp0 = (0x5fe - (hx0 >> 21)) << 20;
+		sqrt_exp1 = (0x5fe - (hx1 >> 21)) << 20;
+		sqrt_exp2 = (0x5fe - (hx2 >> 21)) << 20;
+		ind0 = (((hx0 >> 10) & 0x7f8) + 8) & -16;
+		ind1 = (((hx1 >> 10) & 0x7f8) + 8) & -16;
+		ind2 = (((hx2 >> 10) & 0x7f8) + 8) & -16;
+
+		resh0 = (hx0 & 0x001fffff) | 0x3fe00000;
+		resh1 = (hx1 & 0x001fffff) | 0x3fe00000;
+		resh2 = (hx2 & 0x001fffff) | 0x3fe00000;
+		res_ch0 = (resh0 + 0x00002000) & 0x7fffc000;
+		res_ch1 = (resh1 + 0x00002000) & 0x7fffc000;
+		res_ch2 = (resh2 + 0x00002000) & 0x7fffc000;
+		HI(&res0) = resh0;
+		HI(&res1) = resh1;
+		HI(&res2) = resh2;
+		HI(&res_c0) = res_ch0;
+		HI(&res_c1) = res_ch1;
+		HI(&res_c2) = res_ch2;
+
+		dexp_hi0 = ((double*)((char*)__vlibm_TBL_rsqrt + ind0))[0];
+		dexp_hi1 = ((double*)((char*)__vlibm_TBL_rsqrt + ind1))[0];
+		dexp_hi2 = ((double*)((char*)__vlibm_TBL_rsqrt + ind2))[0];
+		dexp_lo0 = ((double*)((char*)__vlibm_TBL_rsqrt + ind0))[1];
+		dexp_lo1 = ((double*)((char*)__vlibm_TBL_rsqrt + ind1))[1];
+		dexp_lo2 = ((double*)((char*)__vlibm_TBL_rsqrt + ind2))[1];
+		xx0 = dexp_hi0 * dexp_hi0;
+		xx1 = dexp_hi1 * dexp_hi1;
+		xx2 = dexp_hi2 * dexp_hi2;
+		xx0 = (res0 - res_c0) * xx0;
+		xx1 = (res1 - res_c1) * xx1;
+		xx2 = (res2 - res_c2) * xx2;
+		res0 = (((((K6 * xx0 + K5) * xx0 + K4) * xx0 + K3) * xx0 + K2) * xx0 + K1) * xx0;
+		res1 = (((((K6 * xx1 + K5) * xx1 + K4) * xx1 + K3) * xx1 + K2) * xx1 + K1) * xx1;
+		res2 = (((((K6 * xx2 + K5) * xx2 + K4) * xx2 + K3) * xx2 + K2) * xx2 + K1) * xx2;
+
+		res0 = dexp_hi0 * res0 + dexp_lo0 + dexp_hi0;
+		res1 = dexp_hi1 * res1 + dexp_lo1 + dexp_hi1;
+		res2 = dexp_hi2 * res2 + dexp_lo2 + dexp_hi2;
+
+		HI(&dsqrt_exp0) = sqrt_exp0;
+		HI(&dsqrt_exp1) = sqrt_exp1;
+		HI(&dsqrt_exp2) = sqrt_exp2;
+		res0 *= dsqrt_exp0;
+		res1 *= dsqrt_exp1;
+		res2 *= dsqrt_exp2;
+
+		*py = res0;
+		py += stridey;
+
+		*py = res1;
+		py += stridey;
+
+		*py = res2;
+		py += stridey;
+	}
+
+	for(; n > 0 ; n--)
+	{
+		hx0 = HI(px);
+
+		sqrt_exp0 = (0x5fe - (hx0 >> 21)) << 20;
+		ind0 = (((hx0 >> 10) & 0x7f8) + 8) & -16;
+
+		resh0 = (hx0 & 0x001fffff) | 0x3fe00000;
+		res_ch0 = (resh0 + 0x00002000) & 0x7fffc000;
+		HI(&res0) = resh0;
+		LO(&res0) = LO(px);
+		HI(&res_c0) = res_ch0;
+		LO(&res_c0) = 0;
+
+		px += stridex;
+
+		dexp_hi0 = ((double*)((char*)__vlibm_TBL_rsqrt + ind0))[0];
+		dexp_lo0 = ((double*)((char*)__vlibm_TBL_rsqrt + ind0))[1];
+		xx0 = dexp_hi0 * dexp_hi0;
+		xx0 = (res0 - res_c0) * xx0;
+		res0 = (((((K6 * xx0 + K5) * xx0 + K4) * xx0 + K3) * xx0 + K2) * xx0 + K1) * xx0;
+
+		res0 = dexp_hi0 * res0 + dexp_lo0 + dexp_hi0;
+
+		HI(&dsqrt_exp0) = sqrt_exp0;
+		LO(&dsqrt_exp0) = 0;
+		res0 *= dsqrt_exp0;
+
+		*py = res0;
+		py += stridey;
+	}
+}
+