summaryrefslogtreecommitdiff
path: root/usr/src/lib/libmvec/common/vis/__vatan.S
diff options
context:
space:
mode:
Diffstat (limited to 'usr/src/lib/libmvec/common/vis/__vatan.S')
-rw-r--r--usr/src/lib/libmvec/common/vis/__vatan.S572
1 files changed, 572 insertions, 0 deletions
diff --git a/usr/src/lib/libmvec/common/vis/__vatan.S b/usr/src/lib/libmvec/common/vis/__vatan.S
new file mode 100644
index 0000000000..b5b7b1d8d1
--- /dev/null
+++ b/usr/src/lib/libmvec/common/vis/__vatan.S
@@ -0,0 +1,572 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2011 Nexenta Systems, Inc. All rights reserved.
+ */
+/*
+ * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+ .file "__vatan.S"
+
+#include "libm.h"
+
+ RO_DATA
+
+! following is the C version of the ATAN algorithm
+! #include <math.h>
+! #include <stdio.h>
+! double jkatan(double *x)
+! {
+! double f, z, ans, ansu, ansl, tmp, poly, conup, conlo, dummy;
+! int index, sign, intf, intz;
+! extern const double __vlibm_TBL_atan1[];
+! long *pf = (long *) &f, *pz = (long *) &z;
+!
+! /* Power series atan(x) = x + p1*x**3 + p2*x**5 + p3*x**7
+! * Error = -3.08254E-18 On the interval |x| < 1/64 */
+!
+! /* define dummy names for readability. Use parray to help compiler optimize loads */
+! #define p3 parray[0]
+! #define p2 parray[1]
+! #define p1 parray[2]
+! #define soffset 3
+!
+! static const double parray[] = {
+! -1.428029046844299722E-01, /* p[3] */
+! 1.999999917247000615E-01, /* p[2] */
+! -3.333333333329292858E-01, /* p[1] */
+! 1.0, /* not used for p[0], though */
+! -1.0, /* used to flip sign of answer */
+! };
+!
+! f = *x; /* fetch argument */
+! intf = pf[0]; /* grab upper half */
+! sign = intf & 0x80000000; /* sign of argument */
+! intf ^= sign; /* abs(upper argument) */
+! sign = (unsigned) sign >> 31; /* sign bit = 0 or 1 */
+! pf[0] = intf;
+!
+! if( (intf > 0x43600000) || (intf < 0x3e300000) ) /* filter out special cases */
+! {
+! if( (intf > 0x7ff00000) ||
+! ((intf == 0x7ff00000) && (pf[1] !=0)) ) return (*x-*x);/* return NaN if x=NaN*/
+! if( intf < 0x3e300000 ) /* avoid underflow for small arg */
+! {
+! dummy = 1.0e37 + f;
+! dummy = dummy;
+! return (*x);
+! }
+! if( intf > 0x43600000 ) /* avoid underflow for big arg */
+! {
+! index = 2;
+! f = __vlibm_TBL_atan1[index] + __vlibm_TBL_atan1[index+1];/* pi/2 up + pi/2 low */
+! f = parray[soffset + sign] * f; /* put sign bit on ans */
+! return (f);
+! }
+! }
+!
+! index = 0; /* points to 0,0 in table */
+! if (intf > 0x40500000) /* if(|x| > 64 */
+! { f = -1.0/f;
+! index = 2; /* point to pi/2 upper, lower */
+! }
+! else if( intf >= 0x3f900000 ) /* if |x| >= (1/64)... */
+! {
+! intz = (intf + 0x00008000) & 0x7fff0000;/* round arg, keep upper */
+! pz[0] = intz; /* store as a double (z) */
+! pz[1] = 0; /* ...lower */
+! f = (f - z)/(1.0 + f*z); /* get reduced argument */
+! index = (intz - 0x3f900000) >> 15; /* (index >> 16) << 1) */
+! index += 4; /* skip over 0,0,pi/2,pi/2 */
+! }
+! conup = __vlibm_TBL_atan1[index]; /* upper table */
+! conlo = __vlibm_TBL_atan1[index+1]; /* lower table */
+! tmp = f*f;
+! poly = (f*tmp)*((p3*tmp + p2)*tmp + p1);
+! ansu = conup + f; /* compute atan(f) upper */
+! ansl = (((conup - ansu) + f) + poly) + conlo;
+! ans = ansu + ansl;
+! ans = parray[soffset + sign] * ans;
+! return ans;
+! }
+
+/* 8 bytes = 1 double f.p. word */
+#define WSIZE 8
+
+ .align 32 !align with full D-cache line
+.COEFFS:
+ .double 0r-1.428029046844299722E-01 !p[3]
+ .double 0r1.999999917247000615E-01 !p[2]
+ .double 0r-3.333333333329292858E-01 !p[1]
+ .double 0r-1.0, !constant -1.0
+ .word 0x00008000,0x0 !for fp rounding of reduced arg
+ .word 0x7fff0000,0x0 !for fp truncation
+ .word 0x47900000,0 !a number close to 1.0E37
+ .word 0x80000000,0x0 !mask for fp sign bit
+ .word 0x3f800000,0x0 !1.0/128.0 dummy "safe" argument
+ .type .COEFFS,#object
+
+ ENTRY(__vatan)
+ save %sp,-SA(MINFRAME)-16,%sp
+ PIC_SETUP(g5)
+ PIC_SET(g5,__vlibm_TBL_atan1,o4)
+ PIC_SET(g5,.COEFFS,o0)
+/*
+ __vatan(int n, double *x, int stridex, double *y, stridey)
+ computes y(i) = atan( x(i) ), for 1=1,n. Stridex, stridey
+ are the distance between x and y elements
+
+ %i0 n
+ %i1 address of x
+ %i2 stride x
+ %i3 address of y
+ %i4 stride y
+*/
+ cmp %i0,0 !if n <=0,
+ ble,pn %icc,.RETURN !....then do nothing
+ sll %i2,3,%i2 !convert stride to byte count
+ sll %i4,3,%i4 !convert stride to byte count
+
+/* pre-load constants before beginning main loop */
+
+ ldd [%o0],%f58 !load p[3]
+ mov 2,%i5 !argcount = 3
+
+ ldd [%o0+WSIZE],%f60 !load p[2]
+ add %fp,STACK_BIAS-8,%l1 !yaddr1 = &dummy
+ fzero %f18 !ansu1 = 0
+
+ ldd [%o0+2*WSIZE],%f62 !load p[1]
+ add %fp,STACK_BIAS-8,%l2 !yaddr2 = &dummy
+ fzero %f12 !(poly1) = 0
+
+ ldd [%o0+3*WSIZE],%f56 !-1.0
+ fzero %f14 !tmp1 = 0
+
+ ldd [%o0+4*WSIZE],%f52 !load rounding mask
+ fzero %f16 !conup1 = 0
+
+ ldd [%o0+5*WSIZE],%f54 !load truncation mask
+ fzero %f36 !f1 = 0
+
+ ldd [%o0+6*WSIZE],%f50 !1.0e37
+ fzero %f38 !f2 = 0
+
+ ldd [%o0+7*WSIZE],%f32 !mask for sign bit
+
+ ldd [%o4+2*WSIZE],%f46 !pi/2 upper
+ ldd [%o4+(2*WSIZE+8)],%f48 !pi/2 lower
+ sethi %hi(0x40500000),%l6 !64.0
+ sethi %hi(0x3f900000),%l7 !1/64.0
+ mov 0,%l4 !index1 = 0
+ mov 0,%l5 !index2 = 0
+
+.MAINLOOP:
+
+ /*--------------------------------------------------------------------------*/
+ /*--------------------------------------------------------------------------*/
+ /*--------------------------------------------------------------------------*/
+
+.LOOP0:
+ deccc %i0 !--n
+ bneg 1f
+ mov %i1,%o5 !xuse = x (delay slot)
+
+ ba 2f
+ nop !delay slot
+1:
+ PIC_SET(g5,.COEFFS+8*WSIZE,o5)
+ dec %i5 !argcount--
+2:
+ sethi %hi(0x80000000),%o7 !mask for sign bit
+/*2 */ sethi %hi(0x43600000),%o1 !big = 0x43600000,0
+ ld [%o5],%o0 !intf = pf[0] = f upper
+ ldd [%o4+%l5],%f26 !conup2 = __vlibm_TBL_atan1[index2]
+
+ sethi %hi(0x3e300000),%o2 !small = 0x3e300000,0
+/*4 */ andn %o0,%o7,%o0 !intf = fabs(intf)
+ ldd [%o5],%f34 !f = *x into f34
+
+ sub %o1,%o0,%o1 !(-) if intf > big
+/*6 */ sub %o0,%o2,%o2 !(-) if intf < small
+ fand %f34,%f32,%f40 !sign0 = sign bit
+ fmuld %f38,%f38,%f24 !tmp2= f2*f2
+
+/*7 */ orcc %o1,%o2,%g0 !(-) if either true
+ bneg,pn %icc,.SPECIAL0 !if (-) goto special cases below
+ fabsd %f34,%f34 !abs(f) (delay slot)
+ !----------------------
+
+
+ sethi %hi(0x8000),%o7 !rounding bit
+/*8 */ fpadd32 %f34,%f52,%f0 !intf + 0x00008000 (again)
+ faddd %f26,%f38,%f28 !ansu2 = conup2 + f2
+
+ add %o0,%o7,%o0 !intf + 0x00008000 (delay slot)
+/*9*/ fand %f0,%f54,%f0 !pz[0] = intz = (intf + 0x00008000) & 0x7fff0000 (again)
+ fmuld %f58,%f24,%f22 !p[3]*tmp2
+
+/*10 */ sethi %hi(0x7fff0000),%o7 !mask for rounding argument
+ fmuld %f34,%f0,%f10 !f*z
+ fsubd %f34,%f0,%f20 !f - z
+ add %o4,%l4,%l4 !base addr + index1
+ fmuld %f14,%f12,%f12 !poly1 = (f1*tmp1)*((p3*tmp1 + p2)*tmp1 + p1)
+ faddd %f16,%f36,%f16 !(conup1 - ansu1) + f1
+
+/*12 */ and %o0,%o7,%o0 !intz = (intf + 0x00008000) & 0x7fff0000
+ faddd %f22,%f60,%f22 !p[3]*tmp2 + p[2]
+ ldd [%l4+WSIZE],%f14 !conlo1 = __vlibm_TBL_atan1[index+1]
+
+/*13 */ sub %o0,%l7,%o2 !intz - 0x3f900000
+ fsubd %f10,%f56,%f10 !(f*z - (-1.0))
+ faddd %f16,%f12,%f12 !((conup1 - ansu1) + f1) + poly1
+
+ cmp %o0,%l6 !(|f| > 64)
+ ble .ELSE0 !if(|f| > 64) then
+/*15 */ sra %o2,15,%o3 !index = (intz - 0x3f900000) >> 15
+ mov 2,%o1 !index == 2, point to conup, conlo = pi/2 upper, lower
+ ba .ENDIF0 !continue
+/*16 */ fdivd %f56,%f34,%f34 !f = -1.0/f (delay slot)
+ .ELSE0: !else f( |x| >= (1/64))
+ cmp %o0,%l7 !if intf >= 1/64
+ bl .ENDIF0 !if( |x| >= (1/64) ) then...
+ mov 0,%o1 !index == 0 , point to conup,conlo = 0,0
+ add %o3,4,%o1 !index = index + 4
+/*16 */ fdivd %f20,%f10,%f34 !f = (f - z)/(1.0 + f*z), reduced argument
+ .ENDIF0:
+
+/*17*/ sll %o1,3,%l3 !index0 = index
+ mov %i3,%l0 !yaddr0 = address of y
+ faddd %f12,%f14,%f12 !ansl1 = (((conup1 - ansu)1 + f1) + poly1) + conlo1
+ fmuld %f22,%f24,%f22 !(p3*tmp2 + p2)*tmp2
+ fsubd %f26,%f28,%f26 !conup2 - ansu2
+
+/*20*/ add %i1,%i2,%i1 !x += stridex
+ add %i3,%i4,%i3 !y += stridey
+ faddd %f18,%f12,%f36 !ans1 = ansu1 + ansl1
+ fmuld %f38,%f24,%f24 !f*tmp2
+ faddd %f22,%f62,%f22 !(p3*tmp2 + p2)*tmp2 + p1
+
+/*23*/ for %f36,%f42,%f36 !sign(ans1) = sign of argument
+ std %f36,[%l1] !*yaddr1 = ans1
+ add %o4,%l5,%l5 !base addr + index2
+ fmuld %f24,%f22,%f22 !poly2 = (f2*tmp2)*((p3*tmp2 + p2)*tmp2 + p1)
+ faddd %f26,%f38,%f26 !(conup2 - ansu2) + f2
+ cmp %i5,0 !if argcount =0, we are done
+ be .RETURN
+ nop
+
+ /*--------------------------------------------------------------------------*/
+ /*--------------------------------------------------------------------------*/
+ /*--------------------------------------------------------------------------*/
+
+.LOOP1:
+/*25*/ deccc %i0 !--n
+ bneg 1f
+ mov %i1,%o5 !xuse = x (delay slot)
+ ba 2f
+ nop !delay slot
+1:
+ PIC_SET(g5,.COEFFS+8*WSIZE,o5)
+ dec %i5 !argcount--
+2:
+
+/*26*/ sethi %hi(0x80000000),%o7 !mask for sign bit
+ sethi %hi(0x43600000),%o1 !big = 0x43600000,0
+ ld [%o5],%o0 !intf = pf[0] = f upper
+
+/*28*/ sethi %hi(0x3e300000),%o2 !small = 0x3e300000,0
+ andn %o0,%o7,%o0 !intf = fabs(intf)
+ ldd [%o5],%f36 !f = *x into f36
+
+/*30*/ sub %o1,%o0,%o1 !(-) if intf > big
+ sub %o0,%o2,%o2 !(-) if intf < small
+ fand %f36,%f32,%f42 !sign1 = sign bit
+
+/*31*/ orcc %o1,%o2,%g0 !(-) if either true
+ bneg,pn %icc,.SPECIAL1 !if (-) goto special cases below
+ fabsd %f36,%f36 !abs(f) (delay slot)
+ !----------------------
+
+/*32*/ fpadd32 %f36,%f52,%f0 !intf + 0x00008000 (again)
+ ldd [%l5+WSIZE],%f24 !conlo2 = __vlibm_TBL_atan1[index2+1]
+
+/*33*/ fand %f0,%f54,%f0 !pz[0] = intz = (intf + 0x00008000) & 0x7fff0000 (again)
+ sethi %hi(0x8000),%o7 !rounding bit
+ faddd %f26,%f22,%f22 !((conup2 - ansu2) + f2) + poly2
+
+/*34*/ add %o0,%o7,%o0 !intf + 0x00008000 (delay slot)
+ sethi %hi(0x7fff0000),%o7 !mask for rounding argument
+ fmuld %f36,%f0,%f10 !f*z
+ fsubd %f36,%f0,%f20 !f - z
+
+/*35*/ and %o0,%o7,%o0 !intz = (intf + 0x00008000) & 0x7fff0000
+ faddd %f22,%f24,%f22 !ansl2 = (((conup2 - ansu2) + f2) + poly2) + conlo2
+
+/*37*/ sub %o0,%l7,%o2 !intz - 0x3f900000
+ fsubd %f10,%f56,%f10 !(f*z - (-1.0))
+ ldd [%o4+%l3],%f6 !conup0 = __vlibm_TBL_atan1[index0]
+
+ cmp %o0,%l6 !(|f| > 64)
+ ble .ELSE1 !if(|f| > 64) then
+/*38*/ sra %o2,15,%o3 !index = (intz - 0x3f900000) >> 15
+ mov 2,%o1 !index == 2, point to conup, conlo = pi/2 upper, lower
+ ba .ENDIF1 !continue
+/*40*/ fdivd %f56,%f36,%f36 !f = -1.0/f (delay slot)
+ .ELSE1: !else f( |x| >= (1/64))
+ cmp %o0,%l7 !if intf >= 1/64
+ bl .ENDIF1 !if( |x| >= (1/64) ) then...
+ mov 0,%o1 !index == 0 , point to conup,conlo = 0,0
+ add %o3,4,%o1 !index = index + 4
+/*40*/ fdivd %f20,%f10,%f36 !f = (f - z)/(1.0 + f*z), reduced argument
+ .ENDIF1:
+
+/*41*/sll %o1,3,%l4 !index1 = index
+ mov %i3,%l1 !yaddr1 = address of y
+ fmuld %f34,%f34,%f4 !tmp0= f0*f0
+ faddd %f28,%f22,%f38 !ans2 = ansu2 + ansl2
+
+/*44*/add %i1,%i2,%i1 !x += stridex
+ add %i3,%i4,%i3 !y += stridey
+ fmuld %f58,%f4,%f2 !p[3]*tmp0
+ faddd %f6,%f34,%f8 !ansu0 = conup0 + f0
+ for %f38,%f44,%f38 !sign(ans2) = sign of argument
+ std %f38,[%l2] !*yaddr2 = ans2
+ cmp %i5,0 !if argcount =0, we are done
+ be .RETURN
+ nop
+
+ /*--------------------------------------------------------------------------*/
+ /*--------------------------------------------------------------------------*/
+ /*--------------------------------------------------------------------------*/
+
+.LOOP2:
+/*46*/ deccc %i0 !--n
+ bneg 1f
+ mov %i1,%o5 !xuse = x (delay slot)
+ ba 2f
+ nop !delay slot
+1:
+ PIC_SET(g5,.COEFFS+8*WSIZE,o5)
+ dec %i5 !argcount--
+2:
+
+/*47*/ sethi %hi(0x80000000),%o7 !mask for sign bit
+ sethi %hi(0x43600000),%o1 !big = 0x43600000,0
+ ld [%o5],%o0 !intf = pf[0] = f upper
+
+/*49*/ sethi %hi(0x3e300000),%o2 !small = 0x3e300000,0
+ andn %o0,%o7,%o0 !intf = fabs(intf)
+ ldd [%o5],%f38 !f = *x into f38
+
+/*51*/ sub %o1,%o0,%o1 !(-) if intf > big
+ sub %o0,%o2,%o2 !(-) if intf < small
+ fand %f38,%f32,%f44 !sign2 = sign bit
+
+/*52*/ orcc %o1,%o2,%g0 !(-) if either true
+ bneg,pn %icc,.SPECIAL2 !if (-) goto special cases below
+ fabsd %f38,%f38 !abs(f) (delay slot)
+ !----------------------
+
+/*53*/ fpadd32 %f38,%f52,%f0 !intf + 0x00008000 (again)
+ faddd %f2,%f60,%f2 !p[3]*tmp0 + p[2]
+
+/*54*/ sethi %hi(0x8000),%o7 !rounding bit
+ fand %f0,%f54,%f0 !pz[0] = intz = (intf + 0x00008000) & 0x7fff0000 (again)
+
+/*55*/ add %o0,%o7,%o0 !intf + 0x00008000 (delay slot)
+ sethi %hi(0x7fff0000),%o7 !mask for rounding argument
+ fmuld %f38,%f0,%f10 !f*z
+ fsubd %f38,%f0,%f20 !f - z
+
+/*56*/ and %o0,%o7,%o0 !intz = (intf + 0x00008000) & 0x7fff0000
+ fmuld %f2,%f4,%f2 !(p3*tmp0 + p2)*tmp0
+ fsubd %f6,%f8,%f6 !conup0 - ansu0
+
+/*58*/ sub %o0,%l7,%o2 !intz - 0x3f900000
+ fsubd %f10,%f56,%f10 !(f*z - (-1.0))
+ ldd [%o4+%l4],%f16 !conup1 = __vlibm_TBL_atan1[index1]
+
+ cmp %o0,%l6 !(|f| > 64)
+ ble .ELSE2 !if(|f| > 64) then
+/*60*/ sra %o2,15,%o3 !index = (intz - 0x3f900000) >> 15
+ mov 2,%o1 !index == 2, point to conup, conlo = pi/2 upper, lower
+ ba .ENDIF2 !continue
+/*61*/ fdivd %f56,%f38,%f38 !f = -1.0/f (delay slot)
+ .ELSE2: !else f( |x| >= (1/64))
+ cmp %o0,%l7 !if intf >= 1/64
+ bl .ENDIF2 !if( |x| >= (1/64) ) then...
+ mov 0,%o1 !index == 0 , point to conup,conlo = 0,0
+ add %o3,4,%o1 !index = index + 4
+/*61*/ fdivd %f20,%f10,%f38 !f = (f - z)/(1.0 + f*z), reduced argument
+ .ENDIF2:
+
+
+/*62*/ sll %o1,3,%l5 !index2 = index
+ mov %i3,%l2 !yaddr2 = address of y
+ fmuld %f34,%f4,%f4 !f0*tmp0
+ faddd %f2,%f62,%f2 !(p3*tmp0 + p2)*tmp0 + p1
+ fmuld %f36,%f36,%f14 !tmp1= f1*f1
+
+/*65*/add %o4,%l3,%l3 !base addr + index0
+ fmuld %f4,%f2,%f2 !poly0 = (f0*tmp0)*((p3*tmp0 + p2)*tmp0 + p1)
+ faddd %f6,%f34,%f6 !(conup0 - ansu0) + f0
+ fmuld %f58,%f14,%f12 !p[3]*tmp1
+ faddd %f16,%f36,%f18 !ansu1 = conup1 + f1
+ ldd [%l3+WSIZE],%f4 !conlo0 = __vlibm_TBL_atan1[index0+1]
+
+/*68*/ add %i1,%i2,%i1 !x += stridex
+ add %i3,%i4,%i3 !y += stridey
+ faddd %f6,%f2,%f2 !((conup0 - ansu0) + f0) + poly0
+ faddd %f12,%f60,%f12 !p[3]*tmp1 + p[2]
+
+/*71*/faddd %f2,%f4,%f2 !ansl0 = (((conup0 - ansu)0 + f0) + poly0) + conlo0
+ fmuld %f12,%f14,%f12 !(p3*tmp1 + p2)*tmp1
+ fsubd %f16,%f18,%f16 !conup1 - ansu1
+
+/*74*/faddd %f8,%f2,%f34 !ans0 = ansu0 + ansl0
+ fmuld %f36,%f14,%f14 !f1*tmp1
+ faddd %f12,%f62,%f12 !(p3*tmp1 + p2)*tmp1 + p1
+
+/*77*/ for %f34,%f40,%f34 !sign(ans0) = sign of argument
+ std %f34,[%l0] !*yaddr0 = ans, always gets stored (delay slot)
+ cmp %i5,0 !if argcount =0, we are done
+ bg .MAINLOOP
+ nop
+
+ /*--------------------------------------------------------------------------*/
+ /*--------------------------------------------------------------------------*/
+ /*--------------------------------------------------------------------------*/
+
+.RETURN:
+ ret
+ restore %g0,%g0,%g0
+
+ /*--------------------------------------------------------------------------*/
+ /*------------SPECIAL CASE HANDLING FOR LOOP0 ------------------------------*/
+ /*--------------------------------------------------------------------------*/
+
+/* at this point
+ %i1 x address
+ %o0 intf
+ %o2 intf - 0x3e300000
+ %f34,36,38 f0,f1,f2
+ %f40,42,44 sign0,sign1,sign2
+*/
+
+ .align 32 !align on I-cache boundary
+.SPECIAL0:
+ orcc %o2,%g0,%g0 !(-) if intf < 0x3e300000
+ bpos 1f !if >=...continue
+ sethi %hi(0x7ff00000),%g1 !upper word of Inf (we use 64-bit wide int for this)
+ ba 3f
+ faddd %f34,%f50,%f30 !dummy op just to generate exception (delay slot)
+1:
+ ld [%o5+4],%o5 !load x lower word
+ sllx %o0,32,%o0 !left justify intf
+ sllx %g1,32,%g1 !left justify Inf
+ or %o0,%o5,%o0 !merge in lower intf
+ cmp %o0,%g1 !if intf > 0x7ff00000 00000000
+ ble,pt %xcc,2f !pass thru if NaN
+ nop
+ fmuld %f34,%f34,%f34 !...... (x*x) trigger invalid exception
+ ba 3f
+ nop
+2:
+ faddd %f46,%f48,%f34 !ans = pi/2 upper + pi/2 lower
+3:
+ add %i1,%i2,%i1 !x += stridex
+ for %f34,%f40,%f34 !sign(ans) = sign of argument
+ std %f34,[%i3] !*y = ans
+ ba .LOOP0 !keep looping
+ add %i3,%i4,%i3 !y += stridey (delay slot)
+
+ /*--------------------------------------------------------------------------*/
+ /*-----------SPECIAL CASE HANDLING FOR LOOP1 -------------------------------*/
+ /*--------------------------------------------------------------------------*/
+
+ .align 32 !align on I-cache boundary
+.SPECIAL1:
+ orcc %o2,%g0,%g0 !(-) if intf < 0x3e300000
+ bpos 1f !if >=...continue
+ sethi %hi(0x7ff00000),%g1 !upper word of Inf (we use 64-bit wide int for this)
+ ba 3f
+ faddd %f36,%f50,%f30 !dummy op just to generate exception (delay slot)
+1:
+ ld [%o5+4],%o5 !load x lower word
+ sllx %o0,32,%o0 !left justify intf
+ sllx %g1,32,%g1 !left justify Inf
+ or %o0,%o5,%o0 !merge in lower intf
+ cmp %o0,%g1 !if intf > 0x7ff00000 00000000
+ ble,pt %xcc,2f !pass thru if NaN
+ nop
+ fmuld %f36,%f36,%f36 !...... (x*x) trigger invalid exception
+ ba 3f
+ nop
+2:
+ faddd %f46,%f48,%f36 !ans = pi/2 upper + pi/2 lower
+3:
+ add %i1,%i2,%i1 !x += stridex
+ for %f36,%f42,%f36 !sign(ans) = sign of argument
+ std %f36,[%i3] !*y = ans
+ ba .LOOP1 !keep looping
+ add %i3,%i4,%i3 !y += stridey (delay slot)
+
+ /*--------------------------------------------------------------------------*/
+ /*------------SPECIAL CASE HANDLING FOR LOOP2 ------------------------------*/
+ /*--------------------------------------------------------------------------*/
+
+ .align 32 !align on I-cache boundary
+.SPECIAL2:
+ orcc %o2,%g0,%g0 !(-) if intf < 0x3e300000
+ bpos 1f !if >=...continue
+ sethi %hi(0x7ff00000),%g1 !upper word of Inf (we use 64-bit wide int for this)
+ ba 3f
+ faddd %f38,%f50,%f30 !dummy op just to generate exception (delay slot)
+1:
+ ld [%o5+4],%o5 !load x lower word
+ sllx %o0,32,%o0 !left justify intf
+ sllx %g1,32,%g1 !left justify Inf
+ or %o0,%o5,%o0 !merge in lower intf
+ cmp %o0,%g1 !if intf > 0x7ff00000 00000000
+ ble,pt %xcc,2f !pass thru if NaN
+ nop
+ fmuld %f38,%f38,%f38 !...... (x*x) trigger invalid exception
+ ba 3f
+ nop
+2:
+ faddd %f46,%f48,%f38 !ans = pi/2 upper + pi/2 lower
+3:
+ add %i1,%i2,%i1 !x += stridex
+ for %f38,%f44,%f38 !sign(ans) = sign of argument
+ std %f38,[%i3] !*y = ans
+ ba .LOOP2 !keep looping
+ add %i3,%i4,%i3 !y += stridey
+
+ /*--------------------------------------------------------------------------*/
+ /*--------------------------------------------------------------------------*/
+ /*--------------------------------------------------------------------------*/
+
+ SET_SIZE(__vatan)
+
+! .ident "03-20-96 Sparc V9 3-way-unrolled version"